[lua-torch-nn] 01/01: Imported Upstream version 0~20160604-gd23a8f5

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sat Jun 18 09:15:14 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 2ce9be746da0b9d7a4dcfdd2ff5f5c063b8e74b3
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Wed Jun 8 15:48:08 2016 +0000

    Imported Upstream version 0~20160604-gd23a8f5
---
 .gitignore                                     |    2 +
 .luacheckrc                                    |   13 +
 .travis.yml                                    |   56 +
 Abs.lua                                        |   22 +
 AbsCriterion.lua                               |   32 +
 Add.lua                                        |   66 +
 AddConstant.lua                                |   37 +
 BCECriterion.lua                               |  106 +
 BatchNormalization.lua                         |  195 +
 Bilinear.lua                                   |  156 +
 CAddTable.lua                                  |   36 +
 CDivTable.lua                                  |   26 +
 CMakeLists.txt                                 |   18 +
 CMul.lua                                       |  135 +
 CMulTable.lua                                  |   55 +
 CONTRIBUTING.md                                |  136 +
 COPYRIGHT.txt                                  |   36 +
 CSubTable.lua                                  |   26 +
 Clamp.lua                                      |    5 +
 ClassNLLCriterion.lua                          |   74 +
 ClassSimplexCriterion.lua                      |  118 +
 Concat.lua                                     |  114 +
 ConcatTable.lua                                |  117 +
 Container.lua                                  |  148 +
 Contiguous.lua                                 |   19 +
 Copy.lua                                       |   42 +
 Cosine.lua                                     |  175 +
 CosineDistance.lua                             |  116 +
 CosineEmbeddingCriterion.lua                   |  142 +
 Criterion.lua                                  |   56 +
 CriterionTable.lua                             |   17 +
 CrossEntropyCriterion.lua                      |   28 +
 DepthConcat.lua                                |  114 +
 DistKLDivCriterion.lua                         |   34 +
 DotProduct.lua                                 |   61 +
 Dropout.lua                                    |   69 +
 ELU.lua                                        |   45 +
 ErrorMessages.lua                              |   19 +
 Euclidean.lua                                  |  197 +
 Exp.lua                                        |    9 +
 FlattenTable.lua                               |  106 +
 GradientReversal.lua                           |   32 +
 HardShrink.lua                                 |   25 +
 HardTanh.lua                                   |   31 +
 HingeEmbeddingCriterion.lua                    |   43 +
 Identity.lua                                   |   30 +
 Index.lua                                      |   25 +
 Jacobian.lua                                   |  387 ++
 JoinTable.lua                                  |   74 +
 L1Cost.lua                                     |   30 +
 L1HingeEmbeddingCriterion.lua                  |   41 +
 L1Penalty.lua                                  |   42 +
 LeakyReLU.lua                                  |   41 +
 Linear.lua                                     |  120 +
 Log.lua                                        |   20 +
 LogSigmoid.lua                                 |   27 +
 LogSoftMax.lua                                 |   19 +
 LookupTable.lua                                |  170 +
 MM.lua                                         |   92 +
 MSECriterion.lua                               |   32 +
 MV.lua                                         |   82 +
 MarginCriterion.lua                            |   31 +
 MarginRankingCriterion.lua                     |   75 +
 MaskedSelect.lua                               |   71 +
 Max.lua                                        |   70 +
 Mean.lua                                       |   14 +
 Min.lua                                        |   70 +
 MixtureTable.lua                               |  170 +
 Module.lua                                     |  395 ++
 Mul.lua                                        |   38 +
 MulConstant.lua                                |   41 +
 MultiCriterion.lua                             |   40 +
 MultiLabelMarginCriterion.lua                  |   31 +
 MultiLabelSoftMarginCriterion.lua              |   44 +
 MultiMarginCriterion.lua                       |   54 +
 Narrow.lua                                     |   33 +
 NarrowTable.lua                                |   43 +
 Normalize.lua                                  |  155 +
 PReLU.lua                                      |   52 +
 Padding.lua                                    |   65 +
 PairwiseDistance.lua                           |   91 +
 Parallel.lua                                   |  115 +
 ParallelCriterion.lua                          |   41 +
 ParallelTable.lua                              |   57 +
 PartialLinear.lua                              |  113 +
 Power.lua                                      |   22 +
 README.md                                      |   21 +
 RReLU.lua                                      |   50 +
 ReLU.lua                                       |    5 +
 Replicate.lua                                  |   57 +
 Reshape.lua                                    |   72 +
 Select.lua                                     |   22 +
 SelectTable.lua                                |   62 +
 Sequential.lua                                 |  122 +
 Sigmoid.lua                                    |   19 +
 SmoothL1Criterion.lua                          |   32 +
 SoftMarginCriterion.lua                        |   24 +
 SoftMax.lua                                    |   19 +
 SoftMin.lua                                    |   31 +
 SoftPlus.lua                                   |   35 +
 SoftShrink.lua                                 |   25 +
 SoftSign.lua                                   |   20 +
 SparseJacobian.lua                             |  277 ++
 SparseLinear.lua                               |  242 +
 SpatialAdaptiveMaxPooling.lua                  |   41 +
 SpatialAveragePooling.lua                      |   93 +
 SpatialBatchNormalization.lua                  |   35 +
 SpatialClassNLLCriterion.lua                   |   74 +
 SpatialContrastiveNormalization.lua            |   36 +
 SpatialConvolution.lua                         |  195 +
 SpatialConvolutionLocal.lua                    |  207 +
 SpatialConvolutionMM.lua                       |  158 +
 SpatialConvolutionMap.lua                      |  154 +
 SpatialCrossMapLRN.lua                         |  153 +
 SpatialDilatedConvolution.lua                  |   99 +
 SpatialDivisiveNormalization.lua               |  136 +
 SpatialDropout.lua                             |   54 +
 SpatialFractionalMaxPooling.lua                |  160 +
 SpatialFullConvolution.lua                     |  225 +
 SpatialFullConvolutionMap.lua                  |   91 +
 SpatialLPPooling.lua                           |   43 +
 SpatialMaxPooling.lua                          |   89 +
 SpatialMaxUnpooling.lua                        |   45 +
 SpatialReflectionPadding.lua                   |   51 +
 SpatialReplicationPadding.lua                  |   51 +
 SpatialSoftMax.lua                             |   19 +
 SpatialSubSampling.lua                         |   79 +
 SpatialSubtractiveNormalization.lua            |  115 +
 SpatialUpSamplingNearest.lua                   |   67 +
 SpatialZeroPadding.lua                         |  104 +
 SplitTable.lua                                 |   43 +
 Sqrt.lua                                       |   26 +
 Square.lua                                     |   22 +
 Squeeze.lua                                    |   40 +
 StochasticGradient.lua                         |   62 +
 Sum.lua                                        |   61 +
 THNN.lua                                       |  139 +
 Tanh.lua                                       |   19 +
 TanhShrink.lua                                 |   20 +
 TemporalConvolution.lua                        |   71 +
 TemporalMaxPooling.lua                         |   39 +
 TemporalSubSampling.lua                        |   64 +
 Threshold.lua                                  |   50 +
 Transpose.lua                                  |   28 +
 Unsqueeze.lua                                  |   52 +
 View.lua                                       |   96 +
 VolumetricAveragePooling.lua                   |   54 +
 VolumetricBatchNormalization.lua               |    4 +
 VolumetricConvolution.lua                      |  195 +
 VolumetricDropout.lua                          |   54 +
 VolumetricFullConvolution.lua                  |  236 +
 VolumetricMaxPooling.lua                       |   95 +
 VolumetricMaxUnpooling.lua                     |   56 +
 WeightedEuclidean.lua                          |  244 +
 WeightedMSECriterion.lua                       |   45 +
 doc/containers.md                              |  283 ++
 doc/convolution.md                             |  964 ++++
 doc/criterion.md                               |  789 +++
 doc/image/abs.png                              |  Bin 0 -> 5918 bytes
 doc/image/elu.png                              |  Bin 0 -> 33089 bytes
 doc/image/exp.png                              |  Bin 0 -> 6104 bytes
 doc/image/hshrink.png                          |  Bin 0 -> 5576 bytes
 doc/image/htanh.png                            |  Bin 0 -> 5948 bytes
 doc/image/lena.jpg                             |  Bin 0 -> 39706 bytes
 doc/image/lenap.jpg                            |  Bin 0 -> 34838 bytes
 doc/image/logsigmoid.png                       |  Bin 0 -> 9116 bytes
 doc/image/logsoftmax.png                       |  Bin 0 -> 8712 bytes
 doc/image/parameterflattening.png              |  Bin 0 -> 74658 bytes
 doc/image/parameterflattening.svg              |  338 ++
 doc/image/power.png                            |  Bin 0 -> 6515 bytes
 doc/image/prelu.png                            |  Bin 0 -> 19812 bytes
 doc/image/relu.png                             |  Bin 0 -> 19636 bytes
 doc/image/rrelu.png                            |  Bin 0 -> 11781 bytes
 doc/image/sigmmoid.png                         |  Bin 0 -> 6533 bytes
 doc/image/sigmoid.png                          |  Bin 0 -> 6533 bytes
 doc/image/softmax.png                          |  Bin 0 -> 6252 bytes
 doc/image/softmin.png                          |  Bin 0 -> 6446 bytes
 doc/image/softplus.png                         |  Bin 0 -> 19616 bytes
 doc/image/softsign.png                         |  Bin 0 -> 6877 bytes
 doc/image/sqrt.png                             |  Bin 0 -> 6008 bytes
 doc/image/square.png                           |  Bin 0 -> 6984 bytes
 doc/image/sshrink.png                          |  Bin 0 -> 5576 bytes
 doc/image/tanh.png                             |  Bin 0 -> 7323 bytes
 doc/index.md                                   |   23 +
 doc/module.md                                  |  437 ++
 doc/overview.md                                |  200 +
 doc/simple.md                                  | 1406 ++++++
 doc/table.md                                   | 1214 +++++
 doc/testing.md                                 |   69 +
 doc/training.md                                |  294 ++
 doc/transfer.md                                |  382 ++
 hessian.lua                                    |  391 ++
 init.lua                                       |  173 +
 lib/CMakeLists.txt                             |    5 +
 lib/THNN/CMakeLists.txt                        |   65 +
 lib/THNN/README.md                             |   32 +
 lib/THNN/THNN.h                                |   25 +
 lib/THNN/doc/api_reference.md                  | 1509 ++++++
 lib/THNN/doc/generate_reference.lua            |  106 +
 lib/THNN/doc/style_guidelines.md               |   59 +
 lib/THNN/generic/Abs.c                         |   27 +
 lib/THNN/generic/AbsCriterion.c                |   39 +
 lib/THNN/generic/BatchNormalization.c          |  144 +
 lib/THNN/generic/ClassNLLCriterion.c           |  140 +
 lib/THNN/generic/DistKLDivCriterion.c          |   39 +
 lib/THNN/generic/ELU.c                         |   51 +
 lib/THNN/generic/HardShrink.c                  |   39 +
 lib/THNN/generic/HardTanh.c                    |   84 +
 lib/THNN/generic/L1Cost.c                      |   36 +
 lib/THNN/generic/LeakyReLU.c                   |   54 +
 lib/THNN/generic/LogSigmoid.c                  |   35 +
 lib/THNN/generic/LogSoftMax.c                  |  110 +
 lib/THNN/generic/LookupTable.c                 |  213 +
 lib/THNN/generic/MSECriterion.c                |   40 +
 lib/THNN/generic/MarginCriterion.c             |   42 +
 lib/THNN/generic/MultiLabelMarginCriterion.c   |  174 +
 lib/THNN/generic/MultiMarginCriterion.c        |  159 +
 lib/THNN/generic/PReLU.c                       |  228 +
 lib/THNN/generic/RReLU.c                       |  127 +
 lib/THNN/generic/Sigmoid.c                     |   31 +
 lib/THNN/generic/SmoothL1Criterion.c           |   45 +
 lib/THNN/generic/SoftMarginCriterion.c         |   40 +
 lib/THNN/generic/SoftMax.c                     |  149 +
 lib/THNN/generic/SoftPlus.c                    |   42 +
 lib/THNN/generic/SoftShrink.c                  |   39 +
 lib/THNN/generic/SparseLinear.c                |  550 +++
 lib/THNN/generic/SpatialAdaptiveMaxPooling.c   |  274 +
 lib/THNN/generic/SpatialAveragePooling.c       |  258 +
 lib/THNN/generic/SpatialClassNLLCriterion.c    |  124 +
 lib/THNN/generic/SpatialConvolutionLocal.c     |  241 +
 lib/THNN/generic/SpatialConvolutionMM.c        |  280 ++
 lib/THNN/generic/SpatialConvolutionMap.c       |  259 +
 lib/THNN/generic/SpatialDilatedConvolution.c   |  337 ++
 lib/THNN/generic/SpatialFractionalMaxPooling.c |  251 +
 lib/THNN/generic/SpatialFullConvolution.c      |  380 ++
 lib/THNN/generic/SpatialFullConvolutionMap.c   |  212 +
 lib/THNN/generic/SpatialMaxPooling.c           |  300 ++
 lib/THNN/generic/SpatialMaxUnpooling.c         |  223 +
 lib/THNN/generic/SpatialReflectionPadding.c    |  255 +
 lib/THNN/generic/SpatialReplicationPadding.c   |  254 +
 lib/THNN/generic/SpatialSubSampling.c          |  267 +
 lib/THNN/generic/SpatialUpSamplingNearest.c    |  143 +
 lib/THNN/generic/Sqrt.c                        |   50 +
 lib/THNN/generic/Square.c                      |   58 +
 lib/THNN/generic/THNN.h                        | 1096 ++++
 lib/THNN/generic/Tanh.c                        |   49 +
 lib/THNN/generic/TemporalConvolution.c         |  349 ++
 lib/THNN/generic/TemporalMaxPooling.c          |  235 +
 lib/THNN/generic/TemporalSubSampling.c         |  116 +
 lib/THNN/generic/Threshold.c                   |   58 +
 lib/THNN/generic/VolumetricAveragePooling.c    |  309 ++
 lib/THNN/generic/VolumetricConvolution.c       |  247 +
 lib/THNN/generic/VolumetricConvolutionMM.c     |  514 ++
 lib/THNN/generic/VolumetricFullConvolution.c   |  458 ++
 lib/THNN/generic/VolumetricMaxPooling.c        |  392 ++
 lib/THNN/generic/VolumetricMaxUnpooling.c      |  325 ++
 lib/THNN/generic/unfold.c                      |  158 +
 lib/THNN/init.c                                |  173 +
 mkdocs.yml                                     |   18 +
 rocks/nn-scm-1.rockspec                        |   27 +
 test.lua                                       | 6307 ++++++++++++++++++++++++
 utils.lua                                      |  218 +
 262 files changed, 38452 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e0fa91e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+THNN_h.lua
diff --git a/.luacheckrc b/.luacheckrc
new file mode 100644
index 0000000..3d358e9
--- /dev/null
+++ b/.luacheckrc
@@ -0,0 +1,13 @@
+-- -*- mode: lua; -*-
+std = "luajit"
+
+globals = {
+    "torch",
+    "nn",
+    "include",
+}
+
+unused_args = false
+
+
+files['test.lua'].redefined = false
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..0c0ba85
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,56 @@
+language: c
+compiler:
+  - gcc
+  - clang
+cache:
+  directories:
+  - $HOME/OpenBlasInstall
+sudo: false
+env:
+  - TORCH_LUA_VERSION=LUAJIT21
+  - TORCH_LUA_VERSION=LUA51
+  - TORCH_LUA_VERSION=LUA52
+addons:
+  apt:
+    packages:
+    - cmake
+    - gfortran
+    - gcc-multilib
+    - gfortran-multilib
+    - liblapack-dev
+    - build-essential
+    - gcc 
+    - g++ 
+    - curl
+    - cmake 
+    - libreadline-dev 
+    - git-core 
+    - libqt4-core 
+    - libqt4-gui
+    - libqt4-dev 
+    - libjpeg-dev 
+    - libpng-dev 
+    - ncurses-dev
+    - imagemagick 
+    - libzmq3-dev 
+    - gfortran 
+    - unzip 
+    - gnuplot
+    - gnuplot-x11 
+before_script: 
+- export ROOT_TRAVIS_DIR=$(pwd)
+- export INSTALL_PREFIX=~/torch/install
+-  ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
+- git clone https://github.com/torch/distro.git ~/torch --recursive
+- cd ~/torch && git submodule update --init --recursive
+- mkdir build && cd build
+- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
+- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
+- make && make install
+- cd $ROOT_TRAVIS_DIR
+- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
+script: 
+- ${INSTALL_PREFIX}/bin/luarocks make rocks/nn-scm-1.rockspec
+- export PATH=${INSTALL_PREFIX}/bin:$PATH
+- export TESTLUA=$(which luajit lua | head -n 1)
+- ${TESTLUA} -lnn -e "t=nn.test(); if t.errors[1] then os.exit(1) end"
diff --git a/Abs.lua b/Abs.lua
new file mode 100644
index 0000000..b32b64f
--- /dev/null
+++ b/Abs.lua
@@ -0,0 +1,22 @@
+local Abs, parent = torch.class('nn.Abs', 'nn.Module')
+
+function Abs:__init()
+   parent.__init(self)
+end
+
+function Abs:updateOutput(input)
+   input.THNN.Abs_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+function Abs:updateGradInput(input, gradOutput)
+   input.THNN.Abs_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata()
+   )
+   return self.gradInput
+end
diff --git a/AbsCriterion.lua b/AbsCriterion.lua
new file mode 100644
index 0000000..65e2f8a
--- /dev/null
+++ b/AbsCriterion.lua
@@ -0,0 +1,32 @@
+local AbsCriterion, parent = torch.class('nn.AbsCriterion', 'nn.Criterion')
+
+function AbsCriterion:__init(sizeAverage)
+   parent.__init(self)
+   if sizeAverage ~= nil then
+      self.sizeAverage = sizeAverage
+   else
+      self.sizeAverage = true
+   end
+end
+
+function AbsCriterion:updateOutput(input, target)
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.AbsCriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function AbsCriterion:updateGradInput(input, target)
+   input.THNN.AbsCriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage
+   )
+   return self.gradInput
+end
diff --git a/Add.lua b/Add.lua
new file mode 100644
index 0000000..83ffd9f
--- /dev/null
+++ b/Add.lua
@@ -0,0 +1,66 @@
+local Add, parent = torch.class('nn.Add', 'nn.Module')
+
+function Add:__init(inputSize,scalar)
+   parent.__init(self)
+  
+   local size = inputSize
+   if scalar then size=1 end
+   self.scalar = scalar
+   self.bias = torch.Tensor(size)
+   self.gradBias = torch.Tensor(size)
+   
+   self._ones = torch.Tensor{1}
+
+   self:reset()
+end
+
+function Add:reset(stdv)
+   if stdv then 
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1./math.sqrt(self.bias:size(1))
+   end
+
+   self.bias:uniform(-stdv, stdv)
+end
+
+function Add:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if self.scalar then
+      self.output:add(self.bias[1]);
+   else
+      if input:isSameSizeAs(self.bias) then
+         self.output:add(self.bias)
+      else
+         local batchSize = input:size(1)
+         if self._ones:size(1) ~= batchSize then
+            self._ones:resize(batchSize):fill(1)
+         end
+         local bias = self.bias:view(-1)
+         local output = self.output:view(batchSize, -1)
+         output:addr(1, self._ones, bias)
+      end
+   end
+   return self.output
+end
+
+function Add:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      self.gradInput:resizeAs(gradOutput):copy(gradOutput) 
+      return self.gradInput
+   end
+end
+
+function Add:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   if self.gradBias:size(1) == 1 then
+      self.gradBias[1] = self.gradBias[1] + scale*gradOutput:sum();
+   else
+      if input:isSameSizeAs(self.bias) then
+         self.gradBias:add(scale, gradOutput)
+      else
+         local gradOutput = gradOutput:view(input:size(1), -1)
+         self.gradBias:view(-1):addmv(scale, gradOutput:t(), self._ones)
+      end
+   end
+end
diff --git a/AddConstant.lua b/AddConstant.lua
new file mode 100644
index 0000000..5848462
--- /dev/null
+++ b/AddConstant.lua
@@ -0,0 +1,37 @@
+local AddConstant, parent = torch.class('nn.AddConstant', 'nn.Module')
+
+function AddConstant:__init(constant_scalar,ip)
+  parent.__init(self)
+  assert(type(constant_scalar) == 'number', 'input is not scalar!')
+  self.constant_scalar = constant_scalar
+  
+  -- default for inplace is false
+   self.inplace = ip or false
+   if (ip and type(ip) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
+end
+
+function AddConstant:updateOutput(input)
+  if self.inplace then
+    input:add(self.constant_scalar)
+    self.output:set(input)
+  else
+    self.output:resizeAs(input)
+    self.output:copy(input)
+    self.output:add(self.constant_scalar)
+  end
+  return self.output
+end 
+
+function AddConstant:updateGradInput(input, gradOutput)
+  if self.inplace then
+    self.gradInput:set(gradOutput)
+    -- restore previous input value
+    input:add(-self.constant_scalar)
+  else
+    self.gradInput:resizeAs(gradOutput)
+    self.gradInput:copy(gradOutput)
+  end
+  return self.gradInput
+end
diff --git a/BCECriterion.lua b/BCECriterion.lua
new file mode 100644
index 0000000..b319335
--- /dev/null
+++ b/BCECriterion.lua
@@ -0,0 +1,106 @@
+local BCECriterion, parent = torch.class('nn.BCECriterion', 'nn.Criterion')
+
+local eps = 1e-12
+
+function BCECriterion:__init(weights, sizeAverage)
+    parent.__init(self)
+    if sizeAverage ~= nil then
+        self.sizeAverage = sizeAverage
+    else
+        self.sizeAverage = true
+    end
+    if weights ~= nil then
+        assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+        self.weights = weights
+    end
+end
+
+
+function BCECriterion:__len()
+    if (self.weights) then
+        return #self.weights
+    else
+        return 0
+    end
+end
+
+function BCECriterion:updateOutput(input, target)
+    -- - log(input) * target - log(1 - input) * (1 - target)
+
+    assert( input:nElement() == target:nElement(),
+    "input and target size mismatch")
+
+    self.buffer = self.buffer or input.new()
+
+    local buffer = self.buffer
+    local weights = self.weights
+    local output
+
+    buffer:resizeAs(input)
+
+    if weights ~= nil and target:dim() ~= 1 then
+        weights = self.weights:view(1, target:size(2)):expandAs(target)
+    end
+
+    -- log(input) * target
+    buffer:add(input, eps):log()
+    if weights ~= nil then buffer:cmul(weights) end
+
+    output = torch.dot(target, buffer)
+
+    -- log(1 - input) * (1 - target)
+    buffer:mul(input, -1):add(1):add(eps):log()
+    if weights ~= nil then buffer:cmul(weights) end
+
+    output = output + torch.sum(buffer)
+    output = output - torch.dot(target, buffer)
+
+    if self.sizeAverage then
+        output = output / input:nElement()
+    end
+
+    self.output = - output
+
+    return self.output
+end
+
+function BCECriterion:updateGradInput(input, target)
+    -- - (target - input) / ( input (1 - input) )
+    -- The gradient is slightly incorrect:
+    -- It should have been divided by (input + eps) (1 - input + eps)
+    -- but it is divided by input (1 - input + eps) + eps
+    -- This modification requires less memory to be computed.
+
+    assert( input:nElement() == target:nElement(),
+    "input and target size mismatch")
+
+    self.buffer = self.buffer or input.new()
+
+    local buffer = self.buffer
+    local weights = self.weights
+    local gradInput = self.gradInput
+
+    if weights ~= nil and target:dim() ~= 1 then
+        weights = self.weights:view(1, target:size(2)):expandAs(target)
+    end
+
+    buffer:resizeAs(input)
+    -- - x ( 1 + eps -x ) + eps
+    buffer:add(input, -1):add(-eps):cmul(input):add(-eps)
+
+    gradInput:resizeAs(input)
+    -- y - x
+    gradInput:add(target, -1, input)
+    -- - (y - x) / ( x ( 1 + eps -x ) + eps )
+    gradInput:cdiv(buffer)
+
+    if weights ~= nil then
+        gradInput:cmul(weights)
+    end
+
+    if self.sizeAverage then
+        gradInput:div(target:nElement())
+    end
+
+    return gradInput
+end
diff --git a/BatchNormalization.lua b/BatchNormalization.lua
new file mode 100644
index 0000000..ac42749
--- /dev/null
+++ b/BatchNormalization.lua
@@ -0,0 +1,195 @@
+--[[
+   This file implements Batch Normalization as described in the paper:
+   "Batch Normalization: Accelerating Deep Network Training
+                         by Reducing Internal Covariate Shift"
+                   by Sergey Ioffe, Christian Szegedy
+
+   This implementation is useful for inputs NOT coming from convolution layers.
+   For convolution layers, use nn.SpatialBatchNormalization.
+
+   The operation implemented is:
+   y =     ( x - mean(x) )
+        -------------------- * gamma + beta
+        standard-deviation(x)
+   where gamma and beta are learnable parameters.
+
+   The learning of gamma and beta is optional.
+
+   Usage:
+   with    learnable parameters: nn.BatchNormalization(N [,eps] [,momentum])
+                                 where N = dimensionality of input
+   without learnable parameters: nn.BatchNormalization(N [,eps] [,momentum], false)
+
+   eps is a small value added to the standard-deviation to avoid divide-by-zero.
+       Defaults to 1e-5
+
+   In training time, this layer keeps a running estimate of its computed mean and std.
+   The running sum is kept with a default momentum of 0.1 (unless over-ridden)
+   In test time, this running mean/std is used to normalize.
+]]--
+local BN,parent = torch.class('nn.BatchNormalization', 'nn.Module')
+local THNN = require 'nn.THNN'
+
+BN.__version = 2
+
+-- expected dimension of input
+BN.nDim = 2
+
+function BN:__init(nOutput, eps, momentum, affine)
+   parent.__init(self)
+   assert(nOutput and type(nOutput) == 'number',
+          'Missing argument #1: dimensionality of input. ')
+   assert(nOutput ~= 0, 'To set affine=false call BatchNormalization'
+     .. '(nOutput,  eps, momentum, false) ')
+   if affine ~= nil then
+      assert(type(affine) == 'boolean', 'affine has to be true/false')
+      self.affine = affine
+   else
+      self.affine = true
+   end
+   self.eps = eps or 1e-5
+   self.train = true
+   self.momentum = momentum or 0.1
+   self.running_mean = torch.zeros(nOutput)
+   self.running_var = torch.ones(nOutput)
+
+   if self.affine then
+      self.weight = torch.Tensor(nOutput)
+      self.bias = torch.Tensor(nOutput)
+      self.gradWeight = torch.Tensor(nOutput)
+      self.gradBias = torch.Tensor(nOutput)
+      self:reset()
+   end
+end
+
+function BN:reset()
+   if self.weight then
+      self.weight:uniform()
+   end
+   if self.bias then
+      self.bias:zero()
+   end
+   self.running_mean:zero()
+   self.running_var:fill(1)
+end
+
+function BN:checkInputDim(input)
+   assert(input:dim() == self.nDim, string.format(
+      'only mini-batch supported (%dD tensor), got %dD tensor instead',
+      self.nDim, input:dim()))
+   assert(input:size(2) == self.running_mean:nElement(), string.format(
+      'got %d-feature tensor, expected %d',
+      input:size(2), self.running_mean:nElement()))
+end
+
+local function makeContiguous(self, input, gradOutput)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input):copy(input)
+      input = self._input
+   end
+   if gradOutput then
+      if not gradOutput:isContiguous() then
+         self._gradOutput = self._gradOutput or gradOutput.new()
+         self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+         gradOutput = self._gradOutput
+      end
+   end
+   return input, gradOutput
+end
+
+function BN:updateOutput(input)
+   self:checkInputDim(input)
+
+   input = makeContiguous(self, input)
+
+   self.output:resizeAs(input)
+   self.save_mean = self.save_mean or input.new()
+   self.save_mean:resizeAs(self.running_mean)
+   self.save_std = self.save_std or input.new()
+   self.save_std:resizeAs(self.running_var)
+
+   input.THNN.BatchNormalization_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      THNN.optionalTensor(self.weight),
+      THNN.optionalTensor(self.bias),
+      self.running_mean:cdata(),
+      self.running_var:cdata(),
+      self.save_mean:cdata(),
+      self.save_std:cdata(),
+      self.train,
+      self.momentum,
+      self.eps)
+
+   return self.output
+end
+
+local function backward(self, input, gradOutput, scale, gradInput, gradWeight, gradBias)
+   self:checkInputDim(input)
+   self:checkInputDim(gradOutput)
+   assert(self.save_mean and self.save_std, 'must call :updateOutput() first')
+
+   input, gradOutput = makeContiguous(self, input, gradOutput)
+
+   scale = scale or 1
+   if gradInput then
+      gradInput:resizeAs(gradOutput)
+   end
+
+   input.THNN.BatchNormalization_backward(
+      input:cdata(),
+      gradOutput:cdata(),
+      THNN.optionalTensor(gradInput),
+      THNN.optionalTensor(gradWeight),
+      THNN.optionalTensor(gradBias),
+      THNN.optionalTensor(self.weight),
+      self.running_mean:cdata(),
+      self.running_var:cdata(),
+      self.save_mean:cdata(),
+      self.save_std:cdata(),
+      self.train,
+      scale,
+      self.eps)
+
+   return self.gradInput
+end
+
+function BN:backward(input, gradOutput, scale)
+   return backward(self, input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias)
+end
+
+function BN:updateGradInput(input, gradOutput)
+   return backward(self, input, gradOutput, 1, self.gradInput)
+end
+
+function BN:accGradParameters(input, gradOutput, scale)
+   return backward(self, input, gradOutput, scale, nil, self.gradWeight, self.gradBias)
+end
+
+function BN:read(file, version)
+   parent.read(self, file)
+   if version < 2 then
+      if self.running_std then
+         self.running_var = self.running_std:pow(-2):add(-self.eps)
+         self.running_std = nil
+      end
+   end
+end
+
+function BN:clearState()
+   -- first 5 buffers are not present in the current implementation,
+   -- but we keep them for cleaning old saved models
+   nn.utils.clear(self, {
+      'buffer',
+      'buffer2',
+      'centered',
+      'std',
+      'normalized',
+      '_input',
+      '_gradOutput',
+      'save_mean',
+      'save_std',
+   })
+   return parent.clearState(self)
+end
diff --git a/Bilinear.lua b/Bilinear.lua
new file mode 100644
index 0000000..5527686
--- /dev/null
+++ b/Bilinear.lua
@@ -0,0 +1,156 @@
local Bilinear, parent = torch.class('nn.Bilinear', 'nn.Module')

-- true iff x is a number with no fractional part
local function isint(x) return type(x) == 'number' and x == math.floor(x) end

-- Validates that `input` is a table {x1, x2} of two 2D tensors
-- (batch x features) with matching batch size and feature dimensions
-- compatible with self.weight (outputSize x inputSize1 x inputSize2).
function Bilinear:__assertInput(input)
   assert(input and type(input) == 'table' and #input == 2,
      'input should be a table containing two data Tensors')
   assert(input[1]:nDimension() == 2 and input[2]:nDimension() == 2,
      'input Tensors should be two-dimensional')
   assert(input[1]:size(1) == input[2]:size(1),
      'input Tensors should have the same number of rows (instances)')
   assert(input[1]:size(2) == self.weight:size(2),
      'dimensionality of first input is erroneous')
   assert(input[2]:size(2) == self.weight:size(3),
      'dimensionality of second input is erroneous')
end

-- Validates that gradOutput is batch x outputSize for the given input.
function Bilinear:__assertInputGradOutput(input, gradOutput)
   assert(input[1]:size(1) == gradOutput:size(1),
      'number of rows in gradOutput does not match input')
   -- fix: message previously read "does not output size of layer"
   assert(gradOutput:size(2) == self.weight:size(1),
      'number of columns in gradOutput does not match output size of layer')
end

-- y_k = x1^T W_k x2 + b_k for k = 1..outputSize.
-- `bias` defaults to true; pass false to omit the bias term.
function Bilinear:__init(inputSize1, inputSize2, outputSize, bias)

   -- assertions:
   assert(self and inputSize1 and inputSize2 and outputSize,
      'should specify inputSize1 and inputSize2 and outputSize')
   assert(isint(inputSize1) and isint(inputSize2) and isint(outputSize),
      'inputSize1 and inputSize2 and outputSize should be integer numbers')
   assert(inputSize1 > 0 and inputSize2 > 0 and outputSize > 0,
      'inputSize1 and inputSize2 and outputSize should be positive numbers')

   -- set up model:
   parent.__init(self)
   local bias = ((bias == nil) and true) or bias
   self.weight     = torch.Tensor(outputSize, inputSize1, inputSize2)
   self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2)
   if bias then
      self.bias     = torch.Tensor(outputSize)
      self.gradBias = torch.Tensor(outputSize)
   end
   self.gradInput = {torch.Tensor(), torch.Tensor()}
   self:reset()
end

-- Re-initializes weight (and bias) uniformly in [-stdv, stdv];
-- default stdv is 1/sqrt(inputSize1).
function Bilinear:reset(stdv)
   assert(self)
   if stdv then
      assert(stdv and type(stdv) == 'number' and stdv > 0,
         'standard deviation should be a positive number')
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1 / math.sqrt(self.weight:size(2))
   end
   self.weight:uniform(-stdv, stdv)
   if self.bias then self.bias:uniform(-stdv, stdv) end
   return self
end

function Bilinear:updateOutput(input)
   assert(self)
   self:__assertInput(input)

   -- set up buffer:
   self.buff2 = self.buff2 or input[1].new()
   self.buff2:resizeAs(input[2])

   -- compute output scores: output[:, k] = sum((x1 * W_k) .* x2, over dim 2)
   self.output:resize(input[1]:size(1), self.weight:size(1))
   for k = 1,self.weight:size(1) do
      torch.mm(self.buff2, input[1], self.weight[k])
      self.buff2:cmul(input[2])
      torch.sum(self.output:narrow(2, k, 1), self.buff2, 2)
   end
   if self.bias then
      self.output:add(
         self.bias:reshape(1, self.bias:nElement()):expandAs(self.output)
      )
   end
   return self.output
end

function Bilinear:updateGradInput(input, gradOutput)
   assert(self)
   if self.gradInput then
      self:__assertInputGradOutput(input, gradOutput)

      -- compute d output / d input:
      self.gradInput[1]:resizeAs(input[1]):fill(0)
      self.gradInput[2]:resizeAs(input[2]):fill(0)

      -- do first slice of weight tensor (k = 1)
      self.gradInput[1]:mm(input[2], self.weight[1]:t())
      self.gradInput[1]:cmul(gradOutput:narrow(2,1,1):expand(self.gradInput[1]:size(1),
          self.gradInput[1]:size(2)))
      self.gradInput[2]:addmm(1, input[1], self.weight[1])
      self.gradInput[2]:cmul(gradOutput:narrow(2,1,1):expand(self.gradInput[2]:size(1),
          self.gradInput[2]:size(2)))

      -- do remaining slices of weight tensor
      if self.weight:size(1) > 1 then
         self.buff1 = self.buff1 or input[1].new()
         self.buff1:resizeAs(input[1])
         -- robustness fix: buff2 was only created in updateOutput, so a
         -- standalone updateGradInput call crashed on a nil buffer
         self.buff2 = self.buff2 or input[1].new()
         self.buff2:resizeAs(input[2])

         for k = 2, self.weight:size(1) do
            self.buff1:mm(input[2], self.weight[k]:t())
            self.buff1:cmul(gradOutput:narrow(2,k,1):expand(self.gradInput[1]:size(1),
              self.gradInput[1]:size(2)))
            self.gradInput[1]:add(self.buff1)

            self.buff2:mm(input[1], self.weight[k])
            self.buff2:cmul(gradOutput:narrow(2,k,1):expand(self.gradInput[2]:size(1),
              self.gradInput[2]:size(2)))
            self.gradInput[2]:add(self.buff2)
         end
      end
      return self.gradInput
   end
end

function Bilinear:accGradParameters(input, gradOutput, scale)
   local scale = scale or 1
   self:__assertInputGradOutput(input, gradOutput)
   assert(scale and type(scale) == 'number' and scale >= 0)

   -- make sure we have buffer:
   self.buff1 = self.buff1 or input[1].new()
   self.buff1:resizeAs(input[1])

   -- accumulate parameter gradients: dW_k = x1^T diag(dy[:,k]) x2
   for k = 1,self.weight:size(1) do
      torch.cmul(
         self.buff1, input[1], gradOutput:narrow(2, k, 1):expandAs(input[1])
      )
      -- bug fix: `scale` was previously not applied to the weight gradient
      -- (it was only applied to gradBias below)
      self.gradWeight[k]:addmm(scale, self.buff1:t(), input[2])
   end
   if self.bias then self.gradBias:add(scale, gradOutput:sum(1)) end
end

-- we do not need to accumulate parameters when sharing:
Bilinear.sharedAccUpdateGradParameters = Bilinear.accUpdateGradParameters

function Bilinear:__tostring__()
  return torch.type(self) ..
      string.format(
         '(%dx%d -> %d) %s',
         self.weight:size(2), self.weight:size(3), self.weight:size(1),
         (self.bias == nil and ' without bias' or '')
      )
end

function Bilinear:clearState()
   if self.buff2 then self.buff2:set() end
   if self.buff1 then self.buff1:set() end
   return parent.clearState(self)
end
diff --git a/CAddTable.lua b/CAddTable.lua
new file mode 100644
index 0000000..79deb7e
--- /dev/null
+++ b/CAddTable.lua
@@ -0,0 +1,36 @@
local CAddTable, parent = torch.class('nn.CAddTable', 'nn.Module')

-- Takes a table of tensors and outputs their element-wise sum.
-- With ip == true the first input tensor's storage is reused (in place)
-- as the output.
function CAddTable:__init(ip)
   parent.__init(self)
   self.inplace = ip
   self.gradInput = {}
end

function CAddTable:updateOutput(input)
   if self.inplace then
      -- share storage with the first input instead of copying
      self.output:set(input[1])
   else
      self.output:resizeAs(input[1]):copy(input[1])
   end
   for i=2,#input do
      self.output:add(input[i])
   end
   return self.output
end

function CAddTable:updateGradInput(input, gradOutput)
   -- d(sum)/d(input[i]) = gradOutput for every addend
   for i=1,#input do
      self.gradInput[i] = self.gradInput[i] or input[1].new()
      if self.inplace then
         self.gradInput[i]:set(gradOutput)
      else
         self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
      end
   end

   -- drop stale gradients left over from a call with more inputs
   for i=#input+1, #self.gradInput do
       self.gradInput[i] = nil
   end

   return self.gradInput
end
diff --git a/CDivTable.lua b/CDivTable.lua
new file mode 100644
index 0000000..bf044c9
--- /dev/null
+++ b/CDivTable.lua
@@ -0,0 +1,26 @@
+
local CDivTable, parent = torch.class('nn.CDivTable', 'nn.Module')

-- Takes a table {a, b} of two tensors and returns the component-wise
-- quotient a / b.
function CDivTable:__init()
   parent.__init(self)
   self.gradInput = {}
end

function CDivTable:updateOutput(input)
   self.output:resizeAs(input[1]):copy(input[1])
   self.output:cdiv(input[2])
   return self.output
end

function CDivTable:updateGradInput(input, gradOutput)
   self.gradInput[1] = self.gradInput[1] or input[1].new()
   self.gradInput[2] = self.gradInput[2] or input[1].new()
   -- d(a/b)/da = 1/b
   self.gradInput[1]:resizeAs(input[1]):copy(gradOutput):cdiv(input[2])
   -- d(a/b)/db = -a/b^2, computed as -(gradOutput/b)/b .* a
   self.gradInput[2]:resizeAs(input[2]):zero():addcdiv(-1,self.gradInput[1],input[2]):cmul(input[1])

   -- drop stale entries left over from a call with more inputs
   for i=#input+1, #self.gradInput do
       self.gradInput[i] = nil
   end

   return self.gradInput
end
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..7f07e8e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,18 @@
CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)

FIND_PACKAGE(Torch REQUIRED)

# the C backend (THNN) lives under lib/
ADD_SUBDIRECTORY(lib)

# Embed the THNN C header into THNN_h.lua as a long-bracket Lua string
# (presumably consumed by the Lua FFI bindings -- verify in THNN.lua).
FILE(STRINGS lib/THNN/generic/THNN.h THNN_headers NEWLINE_CONSUME)
FILE(WRITE THNN_h.lua "return [[")
FILE(APPEND THNN_h.lua ${THNN_headers})
FILE(APPEND THNN_h.lua "]]")

# all Lua sources in the package root are installed as the 'nn' package
FILE(GLOB luasrc *.lua)

ADD_TORCH_PACKAGE(nn "" "${luasrc}")

INSTALL(DIRECTORY "doc" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/nn")
INSTALL(FILES "README.md" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/nn")
diff --git a/CMul.lua b/CMul.lua
new file mode 100644
index 0000000..e84f7ba
--- /dev/null
+++ b/CMul.lua
@@ -0,0 +1,135 @@
local CMul, parent = torch.class('nn.CMul', 'nn.Module')

-- nn.CMul(size...): learns a weight tensor of the given size and multiplies
-- the input by it component-wise.  When the input has more elements than
-- the weight (batch mode), the weight is broadcast over dimension 1.
function CMul:__init(...)
   parent.__init(self)

   local arg = {...}

   -- the size may be given either as a single LongStorage or as a list
   -- of dimension sizes
   self.size = torch.LongStorage()
   local n = #arg
   if n == 1 and torch.type(arg[1]) == 'torch.LongStorage' then
      self.size:resize(#arg[1]):copy(arg[1])
   else
      self.size:resize(n)
      for i=1,n do
         self.size[i] = arg[i]
      end
   end

   self.weight = torch.Tensor(self.size)
   self.gradWeight = torch.Tensor(self.size)

   self.output:resize(self.size)

   self:reset()
end

-- Uniform re-initialization in [-stdv, stdv]; default stdv is
-- 1/sqrt(number of weights).
function CMul:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1./math.sqrt(self.weight:nElement())
   end
   self.weight:uniform(-stdv,stdv)
end

function CMul:updateOutput(input)
   -- lazy-initialize scratch buffers: flat views of output/weight, the
   -- broadcast (expanded) weight, and a materialized copy for CUDA
   self._output = self._output or input.new()
   self._weight = self._weight or input.new()
   self._expand = self._expand or input.new()
   self._repeat = self._repeat or input.new()

   self.output:resizeAs(input):copy(input)
   if input:nElement() == self.weight:nElement() then
      -- shapes match exactly: multiply flat views element-wise
      self._output:view(self.output, -1)
      self._weight:view(self.weight, -1)

      self._output:cmul(self._weight)
   else
      -- batch mode: broadcast the weight across the batch dimension
      local batchSize = input:size(1)
      self._output:view(self.output, batchSize, -1)
      self._weight:view(self.weight, 1, -1)

      self._expand:expandAs(self._weight, self._output)

      if torch.type(input) == 'torch.CudaTensor' then
         -- NOTE(review): the expanded (stride-0) view is materialized
         -- before cmul on CUDA -- presumably a workaround for cmul on
         -- non-contiguous tensors on GPU; confirm before removing
         self._repeat:resizeAs(self._expand):copy(self._expand)
         self._output:cmul(self._repeat)
      else
         self._output:cmul(self._expand)
      end
   end

   return self.output
end

function CMul:updateGradInput(input, gradOutput)
   if not self.gradInput then
      return
   end

   self._gradOutput = self._gradOutput or input.new()
   self._gradInput = self._gradInput or input.new()

   -- d(out)/d(in) = weight, so gradInput = weight .* gradOutput
   self.gradInput:resizeAs(input):zero()
   if self.weight:nElement() == gradOutput:nElement() then
      self.gradInput:addcmul(1, self.weight, gradOutput)
   else
      local batchSize = input:size(1)
      nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
      nn.utils.contiguousView(self._gradInput, self.gradInput, batchSize, -1)
      self._weight:view(self.weight, 1, -1)
      self._expand:expandAs(self._weight, self._gradOutput)

      if torch.type(input) == 'torch.CudaTensor' then
         -- same CUDA materialization workaround as in updateOutput
         self._repeat:resizeAs(self._expand):copy(self._expand)
         self._gradInput:addcmul(1, self._repeat, self._gradOutput)
      else
         self._gradInput:addcmul(1, self._expand, self._gradOutput)
      end
   end

   return self.gradInput
end

function CMul:accGradParameters(input, gradOutput, scale)
   scale = scale or 1

   self._input = self._input or input.new()
   self._gradWeight = self._gradWeight or input.new()
   self._sum = self._sum or input.new()

   -- d(out)/d(weight) = input, so gradWeight += scale * input .* gradOutput
   if self.weight:nElement() == gradOutput:nElement() then
      self.gradWeight:addcmul(scale, input, gradOutput)
   else
      -- batch mode: per-sample products, then sum over the batch dimension
      local batchSize = input:size(1)
      nn.utils.contiguousView(self._input, input, batchSize, -1)
      nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
      self._gradWeight:view(self.gradWeight, 1, -1)

      -- NOTE(review): relies on self._repeat having been created by
      -- updateOutput earlier in the same forward/backward cycle
      self._repeat:cmul(self._input, self._gradOutput)
      self._sum:sum(self._repeat, 1)
      self._gradWeight:add(scale, self._sum)
   end
end

-- Drop cached buffers before a type conversion so stale tensors of the
-- old type are not carried over.
function CMul:type(type, tensorCache)
   if type then
      self:clearState()
   end
   return parent.type(self, type, tensorCache)
end

function CMul:clearState()
   nn.utils.clear(self, {
      '_input',
      '_output',
      '_weight',
      '_gradWeight',
      '_expand',
      '_repeat',
      '_sum',
   })
   return parent.clearState(self)
end
diff --git a/CMulTable.lua b/CMulTable.lua
new file mode 100644
index 0000000..b47378e
--- /dev/null
+++ b/CMulTable.lua
@@ -0,0 +1,55 @@
+
local CMulTable, parent = torch.class('nn.CMulTable', 'nn.Module')

-- Takes a table of tensors and outputs their component-wise product.
function CMulTable:__init()
   parent.__init(self)
   self.gradInput = {}
end

function CMulTable:updateOutput(input)
   self.output:resizeAs(input[1]):copy(input[1])
   for i=2,#input do
      self.output:cmul(input[i])
   end
   return self.output
end

-- O(#input) backward: recovers the product of "all other" inputs by
-- dividing the cached forward output by input[i].
-- NOTE(review): divides by input[i] -- produces inf/nan when an input
-- contains zeros; the plain updateGradInput below does not.
function CMulTable:updateGradInput_efficient(input, gradOutput)
   self.tout = self.tout or input[1].new()
   self.tout:resizeAs(self.output)
   for i=1,#input do
      self.gradInput[i] = self.gradInput[i] or input[1].new()
      self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
      self.tout:copy(self.output):cdiv(input[i])
      self.gradInput[i]:cmul(self.tout)
   end

   -- drop stale entries left over from a call with more inputs
   for i=#input+1, #self.gradInput do
       self.gradInput[i] = nil
   end

   return self.gradInput
end

-- O(#input^2) backward: gradInput[i] = gradOutput .* prod_{j ~= i} input[j]
function CMulTable:updateGradInput(input, gradOutput)
   for i=1,#input do
      self.gradInput[i] = self.gradInput[i] or input[1].new()
      self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
      for j=1,#input do
         if i~=j then
            self.gradInput[i]:cmul(input[j])
         end
      end
   end

   -- drop stale entries left over from a call with more inputs
   for i=#input+1, #self.gradInput do
       self.gradInput[i] = nil
   end

   return self.gradInput
end

function CMulTable:clearState()
   if self.tout then self.tout:set() end
   return parent.clearState(self)
end
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..d4da7c9
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,136 @@
+# Contributing to Torch7 Core (torch7, nn, cutorch, cunn)
+
+Thanks a lot! There are plenty of ways you can help!
+
+Please take a moment to review this document in order to make the contribution
+process easy and effective for everyone involved.
+
+Following these guidelines helps to communicate that you respect the time of
+the developers managing and developing this open source project. In return,
+they should reciprocate that respect in addressing your issue or assessing
+patches and features.
+
+
+## Using the issue tracker
+
+The [issue tracker](https://github.com/torch/nn/issues) is
+the preferred channel for [bug reports](#bugs), [features requests](#features)
+and [submitting pull requests](#pull-requests), but please respect the following
+restrictions:
+
+* Please **do not** use the issue tracker for personal support requests (use
+  [mailing-list](http://groups.google.com/forum/#!forum/torch7)).
+
+* Please **do not** open issues regarding the code in a torch package
+  outside the core. For example, don't open issues about the
+  REPL in the nn issue tracker; use the trepl issue tracker for that.
+
+<a name="bugs"></a>
+## Bug reports
+
+A bug is a _demonstrable problem_ that is caused by the code in the repository.
+Good bug reports are extremely helpful - thank you!
+
+Guidelines for bug reports:
+
+1. **Use the GitHub issue search** — check if the issue has already been
+   reported.
+
+2. **Check if the issue has been fixed** — try to reproduce it using the
+   latest `master` or development branch in the repository.
+
+3. **Isolate the problem** — ideally create a test case that is within reason,
+   preferably within 100 lines of code.
+
+A good bug report shouldn't leave others needing to chase you up for more
+information. Please try to be as detailed as possible in your report. What is
+your environment? What steps will reproduce the issue? On what OS do you
+experience the problem? What would you expect to be the outcome? All these
+details will help people to fix any potential bugs.
+
+<a name="features"></a>
+## Feature requests
+
+Feature requests are welcome to be filed. Torch is community-developed;
+the maintainers are not exclusive torch developers, so keep that in mind.
+The purpose of feature requests is to make others who are looking to
+implement a feature aware of the interest in it.
+
+
+<a name="pull-requests"></a>
+## Pull requests
+
+Good pull requests - patches, improvements, new features - are a fantastic
+help. They should remain focused in scope **and avoid containing unrelated
+commits.**
+
+**Please ask first** before embarking on any significant pull request (e.g.
+implementing features, refactoring code, porting to a different language),
+otherwise you risk spending a lot of time working on something that the
+project's developers might not want to merge into the project.
+
+Please adhere to the coding conventions used throughout a project (indentation,
+accurate comments, etc.) and any other requirements (such as test coverage).
+
+Adhering to the following process is the best way to get your work
+included in the project:
+
+1. [Fork](https://help.github.com/articles/fork-a-repo) the project, clone your
+   fork, and configure the remotes:
+
+   ```bash
+   # Clone your fork of the repo into the current directory
+   git clone https://github.com/<your-username>/nn.git
+   # Navigate to the newly cloned directory
+   cd nn
+   # Assign the original repo to a remote called "upstream"
+   git remote add upstream https://github.com/torch/nn.git
+   ```
+
+2. If you cloned a while ago, get the latest changes from upstream:
+
+   ```bash
+   git checkout master
+   git pull upstream master
+   ```
+
+3. Create a new topic branch (off the main project development branch) to
+   contain your feature, change, or fix:
+
+   ```bash
+   git checkout -b <topic-branch-name>
+   ```
+
+4. Commit your changes in logical chunks. Please try to adhere to these [git commit
+   message guidelines](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html)
+   . Use Git's [interactive rebase](https://help.github.com/articles/about-git-rebase)
+   feature to tidy up your commits before making them public. This helps us keep the 
+   commit history in logical blocks and clean, as torch grows. 
+   For example: 
+     - If you are adding a new function or a module, keep the module + tests + doc 
+       to a single commit unless logically warranted. 
+     - If you are fixing a bug, keep the bugfix to a single commit unless logically warranted.
+
+5. Locally merge (or rebase) the upstream development branch into your topic branch:
+
+   ```bash
+   git pull [--rebase] upstream master
+   ```
+
+6. Push your topic branch up to your fork:
+
+   ```bash
+   git push origin <topic-branch-name>
+   ```
+
+7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/)
+    with a clear title and description.
+
+**IMPORTANT**: By submitting a patch, you agree to allow the project owners to
+license your work under the terms of the BSD License.
+
+## Development workflow tips
+
+* While you are changing lua files, one can simply symlink the cloned nn directory to ~/torch/install/share/lua/5.1/nn so that any change is reflected in the current install, without constantly having to do luarocks make rocks/*
+* If you are changing C files, then, after every change, you run luarocks make rocks/*
+* To test, you can just use: th -lnn -e "nn.test()"
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
new file mode 100644
index 0000000..c9cc784
--- /dev/null
+++ b/COPYRIGHT.txt
@@ -0,0 +1,36 @@
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Deepmind Technologies, NYU, NEC Laboratories America 
+   and IDIAP Research Institute nor the names of its contributors may be 
+   used to endorse or promote products derived from this software without 
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/CSubTable.lua b/CSubTable.lua
new file mode 100644
index 0000000..eb74920
--- /dev/null
+++ b/CSubTable.lua
@@ -0,0 +1,26 @@
+
local CSubTable, parent = torch.class('nn.CSubTable', 'nn.Module')

-- Takes a table {a, b} of two tensors and returns a - b element-wise.
function CSubTable:__init()
   parent.__init(self)
   self.gradInput = {}
end

function CSubTable:updateOutput(input)
   self.output:resizeAs(input[1]):copy(input[1]):add(-1, input[2])
   return self.output
end

function CSubTable:updateGradInput(input, gradOutput)
   local gradA = self.gradInput[1] or input[1].new()
   local gradB = self.gradInput[2] or input[1].new()
   self.gradInput[1], self.gradInput[2] = gradA, gradB

   -- d(a-b)/da = 1 and d(a-b)/db = -1
   gradA:resizeAs(input[1]):copy(gradOutput)
   gradB:resizeAs(input[2]):copy(gradOutput):mul(-1)

   -- prune stale entries from a previous call with a longer input table
   for i = #self.gradInput, #input + 1, -1 do
      self.gradInput[i] = nil
   end

   return self.gradInput
end
diff --git a/Clamp.lua b/Clamp.lua
new file mode 100644
index 0000000..36397a1
--- /dev/null
+++ b/Clamp.lua
@@ -0,0 +1,5 @@
-- nn.Clamp(min_value, max_value) clamps its input element-wise to the
-- range [min_value, max_value].  It is a thin alias for nn.HardTanh,
-- which already supports arbitrary bounds.
local Clamp, Parent = torch.class('nn.Clamp', 'nn.HardTanh')

function Clamp:__init(min_value, max_value)
   Parent.__init(self, min_value, max_value)
end
diff --git a/ClassNLLCriterion.lua b/ClassNLLCriterion.lua
new file mode 100644
index 0000000..8e8acbf
--- /dev/null
+++ b/ClassNLLCriterion.lua
@@ -0,0 +1,74 @@
local THNN = require 'nn.THNN'
local ClassNLLCriterion, parent = torch.class('nn.ClassNLLCriterion', 'nn.Criterion')

-- Classification criterion backed by the THNN ClassNLLCriterion kernels.
-- `weights` is an optional 1D tensor of per-class rescaling weights;
-- `sizeAverage` (default true) averages the loss over the batch.
function ClassNLLCriterion:__init(weights, sizeAverage)
    parent.__init(self)
    if sizeAverage ~= nil then
       self.sizeAverage = sizeAverage
    else
       self.sizeAverage = true
    end
    if weights then
       assert(weights:dim() == 1, "weights input should be 1-D Tensor")
       self.weights = weights
    end

    -- scratch tensors reused by the THNN kernels on every call
    self.output_tensor = torch.zeros(1)
    self.total_weight_tensor = torch.ones(1)
    self.target = torch.zeros(1):long()
end

-- NOTE(review): presumably reports the number of class weights;
-- returns 0 when no weights were given.
function ClassNLLCriterion:__len()
   if (self.weights) then
      return #self.weights
   else
      return 0
   end
end

function ClassNLLCriterion:updateOutput(input, target)
   -- normalize target into self.target with a backend-matching type:
   -- LongTensor on CPU, the CudaTensor itself on GPU
   if type(target) == 'number' then
      if input:type() ~= 'torch.CudaTensor' then
         self.target = self.target:long()
      end
      self.target[1] = target
   elseif target:type() == 'torch.CudaTensor' then
      self.target = target
   else
      self.target = target:long()
   end

   input.THNN.ClassNLLCriterion_updateOutput(
      input:cdata(),
      self.target:cdata(),
      self.output_tensor:cdata(),
      self.sizeAverage,
      THNN.optionalTensor(self.weights),
      self.total_weight_tensor:cdata()
   )
   self.output = self.output_tensor[1]
   -- second return value is the total weight computed by the kernel
   return self.output, self.total_weight_tensor[1]
end

function ClassNLLCriterion:updateGradInput(input, target)
   -- number targets reuse self.target, whose type was already set by
   -- updateOutput
   if type(target) == 'number' then
      self.target[1] = target
   elseif target:type() == 'torch.CudaTensor' then
      self.target = target
   else
      self.target = target:long()
   end

   self.gradInput:resizeAs(input):zero()

   input.THNN.ClassNLLCriterion_updateGradInput(
      input:cdata(),
      self.target:cdata(),
      self.gradInput:cdata(),
      self.sizeAverage,
      THNN.optionalTensor(self.weights),
      self.total_weight_tensor:cdata()
   )

   return self.gradInput
end
diff --git a/ClassSimplexCriterion.lua b/ClassSimplexCriterion.lua
new file mode 100644
index 0000000..6ccaed9
--- /dev/null
+++ b/ClassSimplexCriterion.lua
@@ -0,0 +1,118 @@
local ClassSimplexCriterion, parent
    = torch.class('nn.ClassSimplexCriterion', 'nn.MSECriterion')

--[[
    This file implements a criterion for multi-class classification.
    It learns an embedding per class, where each class' embedding
    is a point on an (N-1)-dimensional simplex, where N is
    the number of classes.
    For example usage of this class, look at doc/criterion.md

    Reference: http://arxiv.org/abs/1506.08230

]]--


--[[
    function regsplex(n):
    regsplex returns the coordinates of the vertices of a
    regular simplex centered at the origin.
    The Euclidean norms of the vectors specifying the vertices are
    all equal to 1. The input n is the dimension of the vectors;
    the simplex has n+1 vertices.

    input:
    n -- dimension of the vectors specifying the vertices of the simplex

    output:
    a -- tensor dimensioned (n+1,n) whose rows are
         vectors specifying the vertices

    reference:
    http://en.wikipedia.org/wiki/Simplex#Cartesian_coordinates_for_regular_n-dimensional_simplex_in_Rn
--]]
local function regsplex(n)
    local a = torch.zeros(n+1,n)

    for k = 1,n do
        -- determine the last nonzero entry in the vector for the k-th vertex
        if k==1 then a[k][k] = 1 end
        if k>1 then a[k][k] = math.sqrt( 1 - a[{ {k},{1,k-1} }]:norm()^2 ) end

        -- fill the k-th coordinates for the vectors of the remaining vertices
        local c = (a[k][k]^2 - 1 - 1/n) / a[k][k]
        a[{ {k+1,n+1},{k} }]:fill(c)
    end

    return a
end


function ClassSimplexCriterion:__init(nClasses)
    parent.__init(self)
    -- nClasses must be an integer > 1 (the modulo test rejects fractions)
    assert(nClasses and nClasses > 1 and nClasses == (nClasses -(nClasses % 1)),
           "Required positive integer argument nClasses > 1")
    self.nClasses = nClasses

    -- embedding the simplex in a space of dimension strictly greater than
    -- the minimum possible (nClasses-1) is critical for effective training.
    local simp = regsplex(nClasses - 1)
    -- pad the (nClasses-1)-dim vertices with zeros up to nClasses dims
    self.simplex = torch.cat(simp,
                             torch.zeros(simp:size(1), nClasses -simp:size(2)),
                             2)
    self._target = torch.Tensor(nClasses)
end

-- handle target being both 1D tensor, and
-- target being 2D tensor (2D tensor means don't do anything)
local function transformTarget(self, target)
    if torch.type(target) == 'number' then
        -- single sample: target is a class index, map it to its vertex
        self._target:resize(self.nClasses)
        self._target:copy(self.simplex[target])
    elseif torch.isTensor(target) then
        assert(target:dim() == 1, '1D tensors only!')
        -- batch: one simplex vertex per sample
        local nSamples = target:size(1)
        self._target:resize(nSamples, self.nClasses)
        for i=1,nSamples do
            self._target[i]:copy(self.simplex[target[i]])
        end
    end
end

function ClassSimplexCriterion:updateOutput(input, target)
    -- map class indices to simplex vertices, then apply plain MSE
    transformTarget(self, target)
    assert(input:nElement() == self._target:nElement())
    self.output_tensor = self.output_tensor or input.new(1)
    input.THNN.MSECriterion_updateOutput(
      input:cdata(),
      self._target:cdata(),
      self.output_tensor:cdata(),
      self.sizeAverage
    )
    self.output = self.output_tensor[1]
    return self.output
end

function ClassSimplexCriterion:updateGradInput(input, target)
    -- reuses self._target computed by updateOutput, so it must run after it
    assert(input:nElement() == self._target:nElement())
    input.THNN.MSECriterion_updateGradInput(
      input:cdata(),
      self._target:cdata(),
      self.gradInput:cdata(),
      self.sizeAverage
    )
    return self.gradInput
end

-- Scores each class as the dot product between the input embedding(s)
-- and the class' simplex vertex; returns a (batch x nClasses) tensor.
function ClassSimplexCriterion:getPredictions(input)
    if input:dim() == 1 then
        input = input:view(1, -1)
    end
    return torch.mm(input, self.simplex:t())
end

-- Returns the index of the highest-scoring class for each sample.
function ClassSimplexCriterion:getTopPrediction(input)
    local prod = self:getPredictions(input)
    local _, maxs = prod:max(prod:nDimension())
    return maxs:view(-1)
end
diff --git a/Concat.lua b/Concat.lua
new file mode 100644
index 0000000..ea2489e
--- /dev/null
+++ b/Concat.lua
@@ -0,0 +1,114 @@
local Concat, parent = torch.class('nn.Concat', 'nn.Container')

-- Container that feeds the same input to every child module and
-- concatenates their outputs along dimension `dimension`.
function Concat:__init(dimension)
   parent.__init(self)
   self.size = torch.LongStorage()
   self.dimension = dimension
end

function Concat:updateOutput(input)
   local outs = {}
   -- first pass: run every child and accumulate the concatenated size
   for i=1,#self.modules do
      local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
      outs[i] = currentOutput
      if i == 1 then
         self.size:resize(currentOutput:dim()):copy(currentOutput:size())
      else
         self.size[self.dimension] = self.size[self.dimension] + currentOutput:size(self.dimension)
      end
   end
   self.output:resize(self.size)

   -- second pass: copy each child's output into its slice of self.output
   local offset = 1
   for i,module in ipairs(self.modules) do
      local currentOutput = outs[i]
      self.output:narrow(self.dimension, offset, currentOutput:size(self.dimension)):copy(currentOutput)
      offset = offset + currentOutput:size(self.dimension)
   end
   return self.output
end

-- gradInput is the sum of the children's gradInputs, each computed from
-- its own slice of gradOutput.
function Concat:updateGradInput(input, gradOutput)
   self.gradInput:resizeAs(input)

   local offset = 1
   for i,module in ipairs(self.modules) do
      local currentOutput = module.output
      local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', input, gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)))

      if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
         if i==1 then
            self.gradInput:copy(currentGradInput)
         else
            self.gradInput:add(currentGradInput)
         end
      end
      offset = offset + currentOutput:size(self.dimension)
   end
   return self.gradInput
end

function Concat:accGradParameters(input, gradOutput, scale)
   scale = scale or 1
   local offset = 1
   for i,module in ipairs(self.modules) do
      local currentOutput = module.output
      self:rethrowErrors(module, i, 'accGradParameters',
          input,
          gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
          scale)
      offset = offset + currentOutput:size(self.dimension)
   end
end

-- fused updateGradInput + accGradParameters, same slicing as above
function Concat:backward(input, gradOutput, scale)
   self.gradInput:resizeAs(input)
   scale = scale or 1
   local offset = 1
   for i,module in ipairs(self.modules) do
      local currentOutput = module.output
      local currentGradInput = self:rethrowErrors(module, i, 'backward', input, gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
      if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
         if i==1 then
            self.gradInput:copy(currentGradInput)
         else
            self.gradInput:add(currentGradInput)
         end
      end
      offset = offset + currentOutput:size(self.dimension)
   end
   return self.gradInput
end

function Concat:accUpdateGradParameters(input, gradOutput, lr)
   local offset = 1
   for i,module in ipairs(self.modules) do
      local currentOutput = module.output
      self:rethrowErrors(module, i, 'accUpdateGradParameters',
          input,
          gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
          lr)
      offset = offset + currentOutput:size(self.dimension)
   end
end

-- ASCII-art pretty-printer listing the parallel branches
function Concat:__tostring__()
   local tab = '  '
   local line = '\n'
   local next = '  |`-> '
   local ext = '  |    '
   local extlast = '       '
   local last = '   ... -> '
   local str = torch.type(self)
   str = str .. ' {' .. line .. tab .. 'input'
   for i=1,#self.modules do
      if i == #self.modules then
         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
      else
         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
      end
   end
   str = str .. line .. tab .. last .. 'output'
   str = str .. line .. '}'
   return str
end
diff --git a/ConcatTable.lua b/ConcatTable.lua
new file mode 100644
index 0000000..cb08de0
--- /dev/null
+++ b/ConcatTable.lua
@@ -0,0 +1,117 @@
+local ConcatTable, parent = torch.class('nn.ConcatTable', 'nn.Container')
+
+-- ConcatTable applies every child module to the same input and collects
+-- the results in a table: {module1(input), module2(input), ...}.
+function ConcatTable:__init()
+   parent.__init(self)
+   self.modules = {}
+   self.output = {}
+end
+
+-- Forward pass: output[i] = modules[i]:updateOutput(input).
+function ConcatTable:updateOutput(input)
+   for i=1,#self.modules do
+      self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
+   end
+   return self.output
+end
+
+-- Recursively applies f(t1, k, v) for every leaf value v of t2, mirroring
+-- its (possibly nested) table structure into t1. Trailing extra entries of
+-- t1 are removed so both tables end up the same length.
+local function retable(t1, t2, f)
+   for k, v in ipairs(t2) do
+      if (torch.type(v) == "table") then
+         t1[k] = retable(t1[k] or {}, t2[k], f)
+      else
+         f(t1, k, v)
+      end
+   end
+   for i=#t2+1, #t1 do
+      t1[i] = nil
+   end
+   return t1
+end
+
+-- Shared implementation for updateGradInput/backward: accumulates the
+-- gradInput produced by every child. Handles tensor inputs as well as
+-- (nested) table inputs.
+local function backward(self, method, input, gradOutput, scale)
+   local isTable = torch.type(input) == 'table'
+   local wasTable = torch.type(self.gradInput) == 'table'
+   if isTable then
+      for i,module in ipairs(self.modules) do
+         local currentGradInput = self:rethrowErrors(module, i, method, input, gradOutput[i], scale)
+         if torch.type(currentGradInput) ~= 'table' then
+            error"currentGradInput is not a table!"
+         end
+         if #input ~= #currentGradInput then
+            error("table size mismatch: "..#input.." ~= "..#currentGradInput)
+         end
+         if i == 1 then
+            -- first child: copy its gradInput into self.gradInput
+            self.gradInput = wasTable and self.gradInput or {}
+            retable(self.gradInput, currentGradInput,
+               function(t, k, v)
+                  t[k] = t[k] or v:clone()
+                  t[k]:resizeAs(v)
+                  t[k]:copy(v)
+               end
+            )
+         else
+            -- remaining children: accumulate element-wise
+            retable(self.gradInput, currentGradInput,
+               function(t, k, v)
+                  if t[k] then
+                     t[k]:add(v)
+                  else
+                     t[k] = v:clone()
+                  end
+               end
+            )
+         end
+      end
+   else
+      -- tensor input: make sure gradInput is a tensor of the right type
+      self.gradInput = (not wasTable) and self.gradInput or input:clone()
+      for i,module in ipairs(self.modules) do
+         local currentGradInput = self:rethrowErrors(module, i, method, input, gradOutput[i], scale)
+         if i == 1 then
+            self.gradInput:resizeAs(currentGradInput):copy(currentGradInput)
+         else
+            self.gradInput:add(currentGradInput)
+         end
+      end
+   end
+   return self.gradInput
+end
+
+function ConcatTable:updateGradInput(input, gradOutput)
+   return backward(self, 'updateGradInput', input, gradOutput)
+end
+
+function ConcatTable:backward(input, gradOutput, scale)
+   return backward(self, 'backward', input, gradOutput, scale)
+end
+
+function ConcatTable:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   for i,module in ipairs(self.modules) do
+      self:rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale)
+   end
+end
+
+function ConcatTable:accUpdateGradParameters(input, gradOutput, lr)
+   for i,module in ipairs(self.modules) do
+      self:rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr)
+   end
+end
+
+-- Renders the container as an ASCII diagram listing its children.
+function ConcatTable:__tostring__()
+   local tab = '  '
+   local line = '\n'
+   local next = '  |`-> '
+   local ext = '  |    '
+   local extlast = '       '
+   local last = '   ... -> '
+   local str = torch.type(self)
+   str = str .. ' {' .. line .. tab .. 'input'
+   for i=1,#self.modules do
+      -- BUGFIX: compare i against #self.modules (a number); the original
+      -- compared i to the modules table itself, which is always false, so
+      -- the last-entry padding (extlast) was never used.
+      if i == #self.modules then
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+      else
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+      end
+   end
+   str = str .. line .. tab .. last .. 'output'
+   str = str .. line .. '}'
+   return str
+end
diff --git a/Container.lua b/Container.lua
new file mode 100644
index 0000000..6af4d7d
--- /dev/null
+++ b/Container.lua
@@ -0,0 +1,148 @@
+-- This is code common to container modules, which are collections of
+-- smaller constituent modules like Parallel, Sequential, etc.
+local Container, parent = torch.class('nn.Container', 'nn.Module')
+
+function Container:__init(...)
+    parent.__init(self, ...)
+    self.modules = {}
+end
+
+-- Appends a child module; returns self so calls can be chained.
+function Container:add(module)
+    table.insert(self.modules, module)
+    return self
+end
+
+-- Returns the index-th child module.
+function Container:get(index)
+    return self.modules[index]
+end
+
+-- Number of child modules.
+function Container:size()
+    return #self.modules
+end
+
+-- Check if passing arguments through xpcall is supported in this Lua interpreter.
+local _, XPCALL_ARGS = xpcall(function(x) return x ~= nil end, function() end, 1)
+local TRACEBACK_WARNING = "WARNING: If you see a stack trace below, it doesn't point to the place where this error occured. Please use only the one above."
+-- module argument can be retrieved with moduleIndex, but code is cleaner when
+-- it has to be specified anyway.
+-- Calls module[funcName](module, ...) and, on error, re-raises the error
+-- annotated with the index of the failing child and a traceback captured
+-- at the failure site (added only by the innermost container).
+function Container:rethrowErrors(module, moduleIndex, funcName, ...)
+   assert(module == self.modules[moduleIndex],
+          "mismatch between moduleIndex and self.modules in rethrowErrors")
+   local function handleError(err)
+      -- This will be executed only in the first container that handles the error.
+      if not err:find(TRACEBACK_WARNING) then
+         local traceback = debug.traceback()
+         -- Remove this handler from the stack
+         local _, first_line_end = traceback:find('^.-\n')
+         local _, second_line_end = traceback:find('^.-\n.-\n')
+         traceback = traceback:sub(1, first_line_end) .. traceback:sub(second_line_end+1)
+         err = err .. '\n' .. traceback .. '\n\n' .. TRACEBACK_WARNING
+      else
+         -- Remove file path
+         err = err:sub(err:find('\n')+1)
+      end
+      local msg = string.format('In %d module of %s:',
+                              moduleIndex, torch.type(self))
+      -- Preceding newline has to be here, because Lua will prepend a file path.
+      err = '\n' .. msg .. '\n' .. err
+      return err
+   end
+
+   -- Lua 5.1 doesn't support passing arguments through xpcall, so they have to
+   -- be passed via a closure. This incurs some overhead, so it's better not to
+   -- make it the default.
+   local ok, ret, noret
+   if not XPCALL_ARGS then
+      local args = {...}
+      local unpack = unpack or table.unpack
+      ok, ret, noret = xpcall(function()
+                                 return module[funcName](module, unpack(args))
+                              end,
+                              handleError)
+   else
+      ok, ret, noret = xpcall(module[funcName], handleError, module, ...)
+   end
+   assert(noret == nil, "rethrowErrors supports only one return argument")
+
+   if not ok then error(ret) end
+   return ret
+end
+
+-- Runs func(module) over every direct child.
+function Container:applyToModules(func)
+    for _, module in ipairs(self.modules) do
+        func(module)
+    end
+end
+
+function Container:zeroGradParameters()
+    self:applyToModules(function(module) module:zeroGradParameters() end)
+end
+
+function Container:updateParameters(learningRate)
+    self:applyToModules(function(module) module:updateParameters(learningRate) end)
+end
+
+-- Switches the container (and all children) to training mode.
+function Container:training()
+    self:applyToModules(function(module) module:training() end)
+    parent.training(self)
+end
+
+-- Switches the container (and all children) to evaluation mode.
+function Container:evaluate()
+    self:applyToModules(function(module) module:evaluate() end)
+    parent.evaluate(self)
+end
+
+-- Shares parameters child-by-child with another container of the same shape.
+function Container:share(mlp, ...)
+    for i=1,#self.modules do
+        self.modules[i]:share(mlp.modules[i], ...);
+    end
+end
+
+function Container:reset(stdv)
+    self:applyToModules(function(module) module:reset(stdv) end)
+end
+
+-- Returns two flat tables: all child weight tensors and the matching
+-- gradient tensors (children without parameters are skipped).
+function Container:parameters()
+    local function tinsert(to, from)
+        if type(from) == 'table' then
+            for i=1,#from do
+                tinsert(to,from[i])
+            end
+        else
+            table.insert(to,from)
+        end
+    end
+    local w = {}
+    local gw = {}
+    for i=1,#self.modules do
+        local mw,mgw = self.modules[i]:parameters()
+        if mw then
+            tinsert(w,mw)
+            tinsert(gw,mgw)
+        end
+    end
+    return w,gw
+end
+
+-- Releases intermediate buffers (output/gradInput) on self and recursively
+-- on every child, without breaking tensors referenced elsewhere.
+function Container:clearState()
+   -- don't call set because it might reset referenced tensors
+   local function clear(f)
+      if self[f] then
+         if torch.isTensor(self[f]) then
+            self[f] = self[f].new()
+         elseif type(self[f]) == 'table' then
+            self[f] = {}
+         else
+            self[f] = nil
+         end
+      end
+   end
+   clear('output')
+   clear('gradInput')
+   if self.modules then
+      for i,module in pairs(self.modules) do
+         module:clearState()
+      end
+   end
+   return self
+end
diff --git a/Contiguous.lua b/Contiguous.lua
new file mode 100644
index 0000000..2f07e92
--- /dev/null
+++ b/Contiguous.lua
@@ -0,0 +1,19 @@
+local Contiguous, parent = torch.class('nn.Contiguous', 'nn.Module')
+
+-- Outputs a contiguous version of the input: shares storage when the
+-- input is already contiguous, otherwise makes a contiguous copy.
+function Contiguous:updateOutput(input)
+   if input:isContiguous() then
+      self.output:set(input)
+   else
+      self.output:resizeAs(input):copy(input)
+   end
+   return self.output
+end
+
+-- The gradient passes straight through; only contiguity may change.
+function Contiguous:updateGradInput(input, gradOutput)
+   if gradOutput:isContiguous() then
+      self.gradInput:set(gradOutput)
+   else
+      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   end
+   return self.gradInput
+end
diff --git a/Copy.lua b/Copy.lua
new file mode 100644
index 0000000..9f83cf9
--- /dev/null
+++ b/Copy.lua
@@ -0,0 +1,42 @@
+local Copy, parent = torch.class('nn.Copy', 'nn.Module')
+
+-- Copies the input to the output, converting between tensor types
+-- (intype -> outtype). When the types match and forceCopy is falsy, the
+-- copy is skipped entirely and tensors are shared via :set().
+-- dontCast: if truthy, :type() becomes a no-op for this module.
+function Copy:__init(intype, outtype, forceCopy, dontCast)
+   intype = intype or torch.Tensor.__typename
+   outtype = outtype or torch.Tensor.__typename
+
+   self.dontCast = dontCast
+
+   parent.__init(self)
+   self.gradInput = torch.getmetatable(intype).new()
+   self.output = torch.getmetatable(outtype).new()
+
+   if (not forceCopy) and intype == outtype then
+
+      -- Fast path: same type, no copy requested. Note these closures
+      -- return the incoming tensor directly (not self.output/self.gradInput),
+      -- after aliasing it via :set().
+      self.updateOutput = function(self, input)
+                        self.output:set(input)
+                        return input
+                     end
+
+      self.updateGradInput = function(self, input, gradOutput)
+                         self.gradInput:set(gradOutput)
+                         return gradOutput
+                      end
+   end
+end
+
+-- Slow path (different types or forceCopy): real element-wise copy.
+function Copy:updateOutput(input)
+   self.output:resize(input:size()):copy(input)
+   return self.output
+end
+
+function Copy:updateGradInput(input, gradOutput)
+   self.gradInput:resize(gradOutput:size()):copy(gradOutput)
+   return self.gradInput
+end
+
+-- Honors dontCast: skips the type conversion when requested.
+function Copy:type(type, tensorCache)
+   if type and self.dontCast then
+      return self
+   end
+   return parent.type(self, type, tensorCache)
+end
diff --git a/Cosine.lua b/Cosine.lua
new file mode 100644
index 0000000..e655b9e
--- /dev/null
+++ b/Cosine.lua
@@ -0,0 +1,175 @@
+local Cosine, parent = torch.class('nn.Cosine', 'nn.Module')
+
+-- Cosine similarity layer: output[j] = cosine(weight[j], input), where
+-- weight is an (outputSize x inputSize) matrix. Accepts a 1D input
+-- (single sample) or a 2D input (batchSize x inputSize).
+function Cosine:__init(inputSize,outputSize)
+   parent.__init(self)
+
+   self.weight = torch.Tensor(outputSize,inputSize)
+   self.gradWeight = torch.Tensor(outputSize,inputSize)
+
+   self:reset()
+end
+
+-- Reinitializes the weights uniformly in [-stdv, stdv].
+function Cosine:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1./math.sqrt(self.weight:size(1))
+   end
+   self.weight:uniform(-stdv, stdv)
+end
+
+function Cosine:updateOutput(input)
+   local inputSize = self.weight:size(2)
+   local outputSize = self.weight:size(1)
+
+   self._weightNorm = self._weightNorm or self.weight.new()
+   self._inputNorm = self._inputNorm or self.weight.new()
+
+   -- y_j = (w_j * x) / ( || w_j || * || x || )
+
+   -- small epsilon keeps the norms away from zero
+   self._weightNorm:norm(self.weight,2,2):add(1e-12)
+   if input:dim() == 1 then
+      self.output:resize(outputSize):zero()
+      self.output:addmv(1, self.weight, input)
+      self.__norm = input:norm()+1e-12
+      self.output:cdiv(self._weightNorm:view(outputSize)):div(self.__norm)
+   elseif input:dim() == 2 then
+      local batchSize = input:size(1)
+      local nElement = self.output:nElement()
+      self.output:resize(batchSize, outputSize)
+      if self.output:nElement() ~= nElement then
+         self.output:zero()
+      end
+      self.output:addmm(0, self.output, 1, input, self.weight:t())
+
+      self._inputNorm:norm(input,2,2):add(1e-12)
+      self.output:cdiv(self._weightNorm:view(1,outputSize):expandAs(self.output))
+      self.output:cdiv(self._inputNorm:expandAs(self.output))
+   else
+      error('input must be vector or matrix')
+   end
+
+   return self.output
+end
+
+function Cosine:updateGradInput(input, gradOutput)
+   if not self.gradInput then
+      return
+   end
+
+   local inputSize = self.weight:size(2)
+   local outputSize = self.weight:size(1)
+
+   --[[
+   dy_j           w_ji                   x_i
+   ---- = -------------------  -  y_j ---------
+   dx_i   || w_j || * || x ||         || x ||^2
+   --]]
+
+   local nElement = self.gradInput:nElement()
+   self.gradInput:resizeAs(input)
+   if self.gradInput:nElement() ~= nElement then
+      self.gradInput:zero()
+   end
+
+   if input:dim() == 1 then
+      self._weight = self._weight or input.new()
+      self._weight:resizeAs(self.weight):copy(self.weight)
+      self._weight:cdiv(self._weightNorm:expandAs(self.weight))
+      self._weight:div(self.__norm)
+      self._weight:addr(1, self._weight, -1/(self.__norm*self.__norm), self.output, input)
+      self.gradInput:addmv(0, 1, self._weight:t(), gradOutput)
+   elseif input:dim() == 2 then
+      local inputNorm = self._inputNorm:expandAs(input)
+      local weightNorm = self._weightNorm:view(1,outputSize):expandAs(gradOutput)
+
+      self.gradInput:copy(input):cdiv(inputNorm)
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      self._gradOutput:cmul(self.output)
+      self._sum = self._sum or input.new()
+      self._sum:sum(self._gradOutput, 2)
+      self.gradInput:cmul(self._sum:expandAs(input))
+
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      self._gradOutput:cdiv(weightNorm)
+      self.gradInput:addmm(-1, self.gradInput, 1, self._gradOutput, self.weight)
+
+      self.gradInput:cdiv(inputNorm)
+   end
+
+   return self.gradInput
+end
+
+function Cosine:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   local inputSize = self.weight:size(2)
+   local outputSize = self.weight:size(1)
+
+   --[[
+   dy_j            x_i                     w_ji
+   ----- = -------------------  -  y_j -----------
+   dw_ji   || w_j || * || x ||         || w_j ||^2
+   --]]
+
+   if input:dim() == 1 then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      local weightNorm = self._weightNorm:view(outputSize)
+      self._gradOutput:cdiv(weightNorm)
+      self.gradWeight:addr(scale/self.__norm, self._gradOutput, input)
+
+      self._gradOutput:cdiv(weightNorm)
+      self._gradOutput:cmul(self.output)
+      self._weight = self._weight or self.weight.new()
+      -- BUGFIX: resize the buffer to the shape of self.weight before
+      -- copying; the original resized self._weight to its own (possibly
+      -- empty) size, so the copy failed on the first 1D call.
+      self._weight:resizeAs(self.weight):copy(self.weight)
+      self._weight:cmul(self._gradOutput:view(outputSize, 1):expandAs(self.weight))
+      self.gradWeight:add(-1, self._weight)
+   elseif input:dim() == 2 then
+      self._weight = self._weight or self.weight.new()
+      self._weight:resizeAs(self.weight):copy(self.weight)
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      self._gradOutput:cmul(self.output)
+      self._sum = self._sum or input.new()
+      self._sum:sum(self._gradOutput, 1)
+      local grad = self._sum[1]
+      grad:cdiv(self._weightNorm:select(2,1))
+      self._weight:cmul(grad:view(outputSize,1):expandAs(self._weight))
+
+      local input_ = self._gradOutput
+      input_:resizeAs(input):copy(input)
+      input_:cdiv(self._inputNorm:expandAs(input))
+      self._weight:addmm(-1, self._weight, 1, gradOutput:t(), input_)
+
+      self._weight:cdiv(self._weightNorm:expandAs(self._weight))
+      self.gradWeight:add(self._weight)
+   else
+      error"1D or 2D input expected"
+   end
+end
+
+function Cosine:type(type, tensorCache)
+   if type then
+      -- prevent premature memory allocations
+      self._input = nil
+      self._weight = nil
+      self._inputNorm = nil
+      self._weightNorm = nil
+      self._gradOutput = nil
+      self._sum = nil
+   end
+   return parent.type(self, type, tensorCache)
+end
+
+function Cosine:clearState()
+   nn.utils.clear(self, {
+      '_input',
+      '_weight',
+      '_gradOutput',
+      '_sum',
+      '_inputNorm',
+      '_weightNorm',
+   })
+   return parent.clearState(self)
+end
diff --git a/CosineDistance.lua b/CosineDistance.lua
new file mode 100644
index 0000000..2988c65
--- /dev/null
+++ b/CosineDistance.lua
@@ -0,0 +1,116 @@
+-- Computes the cosine similarity between two inputs {a, b}, element-wise
+-- over the batch dimension. Accepts 1D (single pair) or 2D (batch) inputs.
+local CosineDistance, parent = torch.class('nn.CosineDistance', 'nn.Module')
+
+function CosineDistance:__init()
+   parent.__init(self)
+   self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+-- Returns contiguous versions of both inputs, copying into reusable
+-- buffers only when necessary.
+local function makeContiguous(self, input1, input2)
+   if not input1:isContiguous() then
+      self._input1 = self._input1 or input1.new()
+      self._input1:resizeAs(input1):copy(input1)
+      input1 = self._input1
+   end
+   if not input2:isContiguous() then
+      self._input2 = self._input2 or input2.new()
+      self._input2:resizeAs(input2):copy(input2)
+      input2 = self._input2
+   end
+   return input1, input2
+end
+
+-- Forward: output[i] = <a_i, b_i> / (||a_i|| * ||b_i||).
+-- The intermediate buffers (w1, w22, w32, w) are kept for the backward pass.
+function CosineDistance:updateOutput(input)
+   local input1, input2 = input[1], input[2]
+
+   input1, input2 = makeContiguous(self, input1, input2)
+
+   if input1:dim() == 1 then
+      input1 = input1:view(1,-1)
+      input2 = input2:view(1,-1)
+   end
+
+   if not self.buffer then
+      self.buffer = input1.new()
+      self.w1  = input1.new()
+      self.w22 = input1.new()
+      self.w  = input1.new()
+      self.w32 = input1.new()
+      self.ones = input1.new()
+   end
+
+   -- w1 = row-wise dot product <a, b>
+   self.buffer:cmul(input1,input2)
+   self.w1:sum(self.buffer,2)
+
+   -- w22 = 1 / ||a||^2 (epsilon guards against division by zero)
+   local epsilon = 1e-12
+   self.buffer:cmul(input1,input1)
+   self.w22:sum(self.buffer,2):add(epsilon)
+   self.ones:resizeAs(self.w22):fill(1)
+   self.w22:cdiv(self.ones, self.w22)
+   self.w:resizeAs(self.w22):copy(self.w22)
+
+   -- w32 = 1 / ||b||^2 ; w = 1 / (||a|| * ||b||)
+   self.buffer:cmul(input2,input2)
+   self.w32:sum(self.buffer,2):add(epsilon)
+   self.w32:cdiv(self.ones, self.w32)
+   self.w:cmul(self.w32)
+   self.w:sqrt()
+
+   self.output:cmul(self.w1,self.w)
+   self.output:resize(input1:size(1))
+
+   return self.output
+end
+
+-- Backward: relies on the w1/w22/w32/w buffers computed in updateOutput,
+-- so forward must have been called on the same input first.
+function CosineDistance:updateGradInput(input, gradOutput)
+   local v1  = input[1]
+   local v2  = input[2]
+   local not_batch = false
+
+   v1, v2 = makeContiguous(self, v1, v2)
+
+   if v1:dim() == 1 then
+      v1 = v1:view(1,-1)
+      v2 = v2:view(1,-1)
+      not_batch = true
+   end
+
+   if #self.gradInput ~= 2 then
+      self.gradInput[1] = self.gradInput[1] or v1.new()
+      self.gradInput[2] = self.gradInput[2] or v1.new()
+   end
+
+   -- d(cos)/da = (b - cos * a / ||a||^2) / (||a|| * ||b||), and symmetrically for b
+   local gw1 = self.gradInput[1]
+   local gw2 = self.gradInput[2]
+   gw1:resizeAs(v1):copy(v2)
+   gw2:resizeAs(v1):copy(v1)
+
+   self.buffer:cmul(self.w1,self.w22)
+   gw1:addcmul(-1,self.buffer:expandAs(v1),v1)
+   gw1:cmul(self.w:expandAs(v1))
+
+   self.buffer:cmul(self.w1,self.w32)
+   gw2:addcmul(-1,self.buffer:expandAs(v1),v2)
+   gw2:cmul(self.w:expandAs(v1))
+
+   -- chain rule: scale by the incoming gradient
+   local go = gradOutput:view(-1,1):expandAs(v1)
+   gw1:cmul(go)
+   gw2:cmul(go)
+
+   if not_batch then
+      self.gradInput[1]:resize(gw1:size(2))
+      self.gradInput[2]:resize(gw2:size(2))
+   end
+
+   return self.gradInput
+end
+
+function CosineDistance:clearState()
+   nn.utils.clear(self, {
+      'buffer',
+      'w1',
+      'w22',
+      'w',
+      'w32',
+      'ones',
+   })
+   return parent.clearState(self)
+end
diff --git a/CosineEmbeddingCriterion.lua b/CosineEmbeddingCriterion.lua
new file mode 100644
index 0000000..51c9763
--- /dev/null
+++ b/CosineEmbeddingCriterion.lua
@@ -0,0 +1,142 @@
+local CosineEmbeddingCriterion, parent = torch.class('nn.CosineEmbeddingCriterion', 'nn.Criterion')
+
+-- Criterion over pairs {x1, x2} with labels y in {1, -1}:
+--   y =  1: loss = 1 - cos(x1, x2)
+--   y = -1: loss = max(0, cos(x1, x2) - margin)
+function CosineEmbeddingCriterion:__init(margin)
+   parent.__init(self)
+   margin = margin or 0
+   self.margin = margin
+   self.gradInput = {torch.Tensor(), torch.Tensor()}
+   self.sizeAverage = true
+end
+
+function CosineEmbeddingCriterion:updateOutput(input,y)
+
+   local input1, input2 = input[1], input[2]
+
+   -- keep backward compatibility: accept a plain number label
+   if type(y) == 'number' then
+     self._y = self._y or input1.new(1)
+     self._y[1] = y
+     y = self._y
+   end
+
+   if input1:dim() == 1 then
+      input1 = input1:view(1,-1)
+      input2 = input2:view(1,-1)
+   end
+
+   if not self.buffer then
+      self.buffer = input1.new()
+      self.w1  = input1.new()
+      self.w22 = input1.new()
+      self.w  = input1.new()
+      self.w32 = input1.new()
+      self._outputs = input1.new()
+      -- comparison operators behave differently from cuda/c implementations
+      if input1:type() == 'torch.CudaTensor' then
+         self._idx = input1.new()
+      else
+         self._idx = torch.ByteTensor()
+      end
+   end
+
+   -- w1 = row-wise dot product <x1, x2>
+   self.buffer:cmul(input1,input2)
+   self.w1:sum(self.buffer,2)
+
+   -- w = 1 / (||x1|| * ||x2||), epsilon-guarded
+   local epsilon = 1e-12
+   self.buffer:cmul(input1,input1)
+   self.w22:sum(self.buffer,2):add(epsilon)
+   -- self._outputs is also used as a temporary buffer
+   self._outputs:resizeAs(self.w22):fill(1)
+   self.w22:cdiv(self._outputs, self.w22)
+   self.w:resizeAs(self.w22):copy(self.w22)
+
+   self.buffer:cmul(input2,input2)
+   self.w32:sum(self.buffer,2):add(epsilon)
+   self.w32:cdiv(self._outputs, self.w32)
+   self.w:cmul(self.w32)
+   self.w:sqrt()
+
+   self._outputs:cmul(self.w1,self.w)
+   self._outputs = self._outputs:select(2,1)
+
+   -- y == -1: hinge on (cos - margin); y == 1: 1 - cos
+   y.eq(self._idx,y,-1)
+   self._outputs[self._idx] = self._outputs[self._idx]:add(-self.margin):cmax(0)
+   y.eq(self._idx,y,1)
+   self._outputs[self._idx] = self._outputs[self._idx]:mul(-1):add(1)
+
+   self.output = self._outputs:sum()
+
+   if self.sizeAverage then
+      self.output = self.output/y:size(1)
+   end
+
+   return self.output
+end
+
+-- Backward: uses the buffers (w1/w22/w32/w/_outputs) left by updateOutput.
+function CosineEmbeddingCriterion:updateGradInput(input, y)
+
+   local v1  = input[1]
+   local v2  = input[2]
+   local not_batch = false
+
+   -- keep backward compatibility: accept a plain number label.
+   -- BUGFIX: the original referenced the undefined name `input1` here
+   -- (the locals in this method are v1/v2), which crashed on scalar y.
+   if type(y) == 'number' then
+     self._y = self._y or v1.new(1)
+     self._y[1] = y
+     y = self._y
+   end
+
+   if v1:dim() == 1 then
+      v1 = v1:view(1,-1)
+      v2 = v2:view(1,-1)
+      not_batch = true
+   end
+
+   local gw1 = self.gradInput[1]
+   local gw2 = self.gradInput[2]
+   gw1:resizeAs(v1):copy(v2)
+   gw2:resizeAs(v1):copy(v1)
+
+   self.buffer:cmul(self.w1,self.w22)
+   gw1:addcmul(-1,self.buffer:expandAs(v1),v1)
+   gw1:cmul(self.w:expandAs(v1))
+
+   self.buffer:cmul(self.w1,self.w32)
+   gw2:addcmul(-1,self.buffer:expandAs(v1),v2)
+   gw2:cmul(self.w:expandAs(v1))
+
+   -- self._idx = self._outputs <= 0
+   y.le(self._idx,self._outputs,0)
+   self._idx = self._idx:view(-1,1):expand(gw1:size())
+   gw1[self._idx] = 0
+   gw2[self._idx] = 0
+
+   y.eq(self._idx,y,1)
+   self._idx = self._idx:view(-1,1):expand(gw2:size())
+   gw1[self._idx] = gw1[self._idx]:mul(-1)
+   gw2[self._idx] = gw2[self._idx]:mul(-1)
+
+   if self.sizeAverage then
+      gw1:div(y:size(1))
+      gw2:div(y:size(1))
+   end
+
+   if not_batch then
+      self.gradInput[1]:resize(gw1:size(2))
+      self.gradInput[2]:resize(gw2:size(2))
+   end
+
+   return self.gradInput
+end
+
+function CosineEmbeddingCriterion:type(type)
+   self._idx = nil
+   parent.type(self,type)
+   -- comparison operators behave differently from cuda/c implementations
+   if type == 'torch.CudaTensor' then
+      self._idx = torch.CudaTensor()
+   else
+      self._idx = torch.ByteTensor()
+   end
+   return self
+end
diff --git a/Criterion.lua b/Criterion.lua
new file mode 100644
index 0000000..4efb279
--- /dev/null
+++ b/Criterion.lua
@@ -0,0 +1,56 @@
+local Criterion = torch.class('nn.Criterion')
+
+-- Abstract base class for all loss functions. Subclasses override
+-- updateOutput(input, target) and updateGradInput(input, target).
+function Criterion:__init()
+   self.gradInput = torch.Tensor()
+   self.output = 0
+end
+
+-- To be overridden: computes and returns the loss value.
+function Criterion:updateOutput(input, target)
+end
+
+function Criterion:forward(input, target)
+   return self:updateOutput(input, target)
+end
+
+function Criterion:backward(input, target)
+   return self:updateGradInput(input, target)
+end
+
+-- To be overridden: computes the gradient w.r.t. the input.
+function Criterion:updateGradInput(input, target)
+end
+
+-- Deep-copies the criterion by round-tripping it through an in-memory file.
+function Criterion:clone()
+   local file = torch.MemoryFile("rw"):binary()
+   file:writeObject(self)
+   file:seek(1)
+   local copy = file:readObject()
+   file:close()
+   return copy
+end
+
+-- Converts every tensor member to the given type.
+function Criterion:type(type, tensorCache)
+   assert(type, 'Criterion: must provide a type to convert to')
+   -- find all tensors and convert them
+   for key, value in pairs(self) do
+      self[key] = nn.utils.recursiveType(value, type, tensorCache)
+   end
+   return self
+end
+
+function Criterion:float()
+   return self:type('torch.FloatTensor')
+end
+
+function Criterion:double()
+   return self:type('torch.DoubleTensor')
+end
+
+function Criterion:cuda()
+   return self:type('torch.CudaTensor')
+end
+
+-- Calling the criterion like a function runs forward then backward.
+function Criterion:__call__(input, target)
+   self.output = self:forward(input, target)
+   self.gradInput = self:backward(input, target)
+   return self.output, self.gradInput
+end
diff --git a/CriterionTable.lua b/CriterionTable.lua
new file mode 100644
index 0000000..14c64ac
--- /dev/null
+++ b/CriterionTable.lua
@@ -0,0 +1,17 @@
+local CriterionTable, parent = torch.class('nn.CriterionTable', 'nn.Module')
+
+-- Wraps a criterion so it can be used as a module whose input is the
+-- table {prediction, target}. gradInput aliases the criterion's gradInput.
+function CriterionTable:__init(criterion)
+   parent.__init(self)
+   self.criterion = criterion
+   self.gradInput = {criterion.gradInput}
+end
+
+function CriterionTable:updateOutput(input)
+   self.output = self.criterion:updateOutput(table.unpack(input))
+   return self.output
+end
+
+function CriterionTable:updateGradInput(input, gradOutput)
+   self.criterion:updateGradInput(table.unpack(input))
+   return self.gradInput
+end
diff --git a/CrossEntropyCriterion.lua b/CrossEntropyCriterion.lua
new file mode 100644
index 0000000..d4d19e5
--- /dev/null
+++ b/CrossEntropyCriterion.lua
@@ -0,0 +1,28 @@
+local CrossEntropyCriterion, Criterion = torch.class('nn.CrossEntropyCriterion', 'nn.Criterion')
+
+-- Combines nn.LogSoftMax and nn.ClassNLLCriterion into a single criterion.
+-- weights: optional per-class rescaling weights, forwarded to ClassNLLCriterion.
+function CrossEntropyCriterion:__init(weights)
+   Criterion.__init(self)
+   self.lsm = nn.LogSoftMax()
+   self.nll = nn.ClassNLLCriterion(weights)
+end
+
+function CrossEntropyCriterion:updateOutput(input, target)
+   -- squeeze drops singleton dimensions before the log-softmax
+   input = input:squeeze()
+   target = type(target) == 'number' and target or target:squeeze()
+   self.lsm:updateOutput(input)
+   self.nll:updateOutput(self.lsm.output, target)
+   self.output = self.nll.output
+   return self.output
+end
+
+function CrossEntropyCriterion:updateGradInput(input, target)
+   -- remember the original size so the gradient can be viewed back to it
+   local size = input:size()
+   input = input:squeeze()
+   target = type(target) == 'number' and target or target:squeeze()
+   -- chain the gradients back through ClassNLL then LogSoftMax
+   self.nll:updateGradInput(self.lsm.output, target)
+   self.lsm:updateGradInput(input, self.nll.gradInput)
+   self.gradInput:view(self.lsm.gradInput, size)
+   return self.gradInput
+end
+
+return nn.CrossEntropyCriterion
diff --git a/DepthConcat.lua b/DepthConcat.lua
new file mode 100644
index 0000000..8ae8384
--- /dev/null
+++ b/DepthConcat.lua
@@ -0,0 +1,114 @@
+------------------------------------------------------------------------
+--[[ DepthConcat ]]--
+-- Concatenates the output of Convolutions along the depth dimension
+-- (nOutputFrame). This is used to implement the DepthConcat layer
+-- of the Going deeper with convolutions paper :
+-- http://arxiv.org/pdf/1409.4842v1.pdf
+-- The normal Concat Module can't be used since the spatial dimensions
+-- of tensors to be concatenated may have different values. To deal with
+-- this, we select the largest spatial dimensions and add zero-padding
+-- around the smaller dimensions.
+------------------------------------------------------------------------
+local DepthConcat, _ = torch.class('nn.DepthConcat', 'nn.Concat')
+
+-- Returns a view into `output` for `currentOutput`: offset along the
+-- concatenation dimension and centered along every other dimension whose
+-- size differs (the surrounding area stays zero-padded).
+function DepthConcat:windowNarrow(output, currentOutput, offset)
+   local outputWindow = output:narrow(self.dimension, offset, currentOutput:size(self.dimension))
+   for dim=1,self.size:size(1) do
+      local currentSize = currentOutput:size(dim)
+      if dim ~= self.dimension and self.size[dim] ~= currentSize then
+         -- 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side)
+         -- 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side)
+         -- 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad)
+         local start = math.floor(((self.size[dim] - currentSize) / 2) + 1)
+         outputWindow = outputWindow:narrow(dim, start, currentSize)
+      end
+   end
+   return outputWindow
+end
+
+function DepthConcat:updateOutput(input)
+   local outs = {}
+   for i=1,#self.modules do
+      local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
+      outs[i] = currentOutput
+      if i == 1 then
+         self.size:resize(currentOutput:dim()):copy(currentOutput:size())
+      else
+         self.size[self.dimension] = self.size[self.dimension] + currentOutput:size(self.dimension)
+         for dim=1,self.size:size(1) do
+            if dim ~= self.dimension then
+               -- take the maximum size (shouldn't change anything for batch dim)
+               self.size[dim] = math.max(self.size[dim], currentOutput:size(dim))
+            end
+         end
+      end
+   end
+   self.output:resize(self.size):zero() --zero for padding
+
+   local offset = 1
+   for i,module in ipairs(self.modules) do
+      local currentOutput = outs[i]
+      local outputWindow = self:windowNarrow(self.output, currentOutput, offset)
+      outputWindow:copy(currentOutput)
+      offset = offset + currentOutput:size(self.dimension)
+   end
+   return self.output
+end
+
+function DepthConcat:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input)
+
+   local offset = 1
+   for i,module in ipairs(self.modules) do
+      local currentOutput = module.output
+      local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+      local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', input, gradOutputWindow)
+      if i==1 then
+         self.gradInput:copy(currentGradInput)
+      else
+         self.gradInput:add(currentGradInput)
+      end
+      offset = offset + currentOutput:size(self.dimension)
+   end
+   return self.gradInput
+end
+
+function DepthConcat:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   local offset = 1
+   for i,module in ipairs(self.modules) do
+      local currentOutput = module.output
+      local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+      self:rethrowErrors(module, i, 'accGradParameters', input, gradOutputWindow, scale)
+      offset = offset + currentOutput:size(self.dimension)
+   end
+end
+
+function DepthConcat:backward(input, gradOutput, scale)
+   self.gradInput:resizeAs(input)
+
+   scale = scale or 1
+   local offset = 1
+   for i,module in ipairs(self.modules) do
+      local currentOutput = module.output
+      local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+      -- BUGFIX: forward `scale` to the child's backward; it was previously
+      -- dropped here, so children accumulated gradParameters with scale 1
+      -- regardless of the caller's scale (cf. Concat:backward).
+      local currentGradInput = self:rethrowErrors(module, i, 'backward', input, gradOutputWindow, scale)
+      if i==1 then
+         self.gradInput:copy(currentGradInput)
+      else
+         self.gradInput:add(currentGradInput)
+      end
+      offset = offset + currentOutput:size(self.dimension)
+   end
+   return self.gradInput
+end
+
+function DepthConcat:accUpdateGradParameters(input, gradOutput, lr)
+   local offset = 1
+   for i,module in ipairs(self.modules) do
+      local currentOutput = module.output
+      local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+      self:rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutputWindow, lr)
+      offset = offset + currentOutput:size(self.dimension)
+   end
+end
diff --git a/DistKLDivCriterion.lua b/DistKLDivCriterion.lua
new file mode 100644
index 0000000..bfad575
--- /dev/null
+++ b/DistKLDivCriterion.lua
@@ -0,0 +1,34 @@
local DistKLDivCriterion, parent = torch.class('nn.DistKLDivCriterion', 'nn.Criterion')

-- Both passes require input and target of identical shape; fail early
-- with a clear message otherwise.
local function assertSameSize(input, target)
   local same = input:dim() == target:dim() and
      torch.LongTensor(input:size()):eq(torch.LongTensor(target:size())):all()
   assert(same, 'input and target should have the same size')
end

-- Kullback-Leibler divergence criterion (THNN-backed).
function DistKLDivCriterion:__init()
   parent.__init(self)
   self.sizeAverage = true
end

function DistKLDivCriterion:updateOutput(input, target)
   assertSameSize(input, target)
   -- scalar loss comes back through a 1-element tensor
   self.output_tensor = self.output_tensor or input.new(1)
   input.THNN.DistKLDivCriterion_updateOutput(
      input:cdata(),
      target:cdata(),
      self.output_tensor:cdata(),
      self.sizeAverage
   )
   self.output = self.output_tensor[1]
   return self.output
end

function DistKLDivCriterion:updateGradInput(input, target)
   assertSameSize(input, target)
   input.THNN.DistKLDivCriterion_updateGradInput(
      input:cdata(),
      target:cdata(),
      self.gradInput:cdata(),
      self.sizeAverage
   )
   return self.gradInput
end
diff --git a/DotProduct.lua b/DotProduct.lua
new file mode 100644
index 0000000..20ba1e2
--- /dev/null
+++ b/DotProduct.lua
@@ -0,0 +1,61 @@
local DotProduct, parent = torch.class('nn.DotProduct', 'nn.Module')

-- Computes the (row-wise, for batches) dot product of two input tensors.
function DotProduct:__init()
   parent.__init(self)
   self.gradInput = {torch.Tensor(), torch.Tensor()}
end

function DotProduct:updateOutput(input)
   local a, b = input[1], input[2]
   if a:dim() == 1 then
      -- promote single samples to a batch of one
      a, b = a:view(1, -1), b:view(1, -1)
   end
   self.buffer = self.buffer or a.new()
   self.buffer:cmul(a, b)
   self.output:sum(self.buffer, 2)
   self.output:resize(a:size(1))
   return self.output
end

function DotProduct:updateGradInput(input, gradOutput)
   local a, b = input[1], input[2]
   local isBatch = true

   if #self.gradInput ~= 2 then
      self.gradInput[1] = self.gradInput[1] or input[1].new()
      self.gradInput[2] = self.gradInput[2] or input[2].new()
   end

   if a:dim() == 1 then
      a, b = a:view(1, -1), b:view(1, -1)
      isBatch = false
   end

   local ga, gb = self.gradInput[1], self.gradInput[2]
   -- d(a.b)/da = b (and vice versa), each scaled by the incoming gradient
   ga:resizeAs(a):copy(b)
   gb:resizeAs(b):copy(a)

   local scaled = gradOutput:view(-1, 1):expandAs(a)
   ga:cmul(scaled)
   gb:cmul(scaled)

   if not isBatch then
      -- strip the artificial batch dimension again
      self.gradInput[1]:set(ga:select(1, 1))
      self.gradInput[2]:set(gb:select(1, 1))
   end

   return self.gradInput
end

function DotProduct:clearState()
   if self.buffer then self.buffer:set() end
   return parent.clearState(self)
end
diff --git a/Dropout.lua b/Dropout.lua
new file mode 100644
index 0000000..946c37f
--- /dev/null
+++ b/Dropout.lua
@@ -0,0 +1,69 @@
local Dropout, Parent = torch.class('nn.Dropout', 'nn.Module')

-- Randomly zeroes elements of the input with probability p during
-- training.  v2 semantics (default): kept activations are scaled by
-- 1/(1-p) at training time so evaluation is the identity; legacy v1
-- semantics instead scale the output by (1-p) at evaluation time.
function Dropout:__init(p, v1, inplace)
   Parent.__init(self)
   self.p = p or 0.5
   self.train = true
   self.inplace = inplace
   -- version 2 scales output during training instead of evaluation
   self.v2 = not v1
   if self.p >= 1 or self.p < 0 then
      error('<Dropout> illegal percentage, must be 0 <= p < 1')
   end
   self.noise = torch.Tensor()
end

function Dropout:updateOutput(input)
   if self.inplace then
      self.output:set(input)
   else
      self.output:resizeAs(input):copy(input)
   end
   if self.p <= 0 then
      return self.output
   end
   if self.train then
      -- sample a fresh Bernoulli(1-p) mask and apply it
      self.noise:resizeAs(input)
      self.noise:bernoulli(1 - self.p)
      if self.v2 then
         self.noise:div(1 - self.p)
      end
      self.output:cmul(self.noise)
   elseif not self.v2 then
      -- v1: compensate at evaluation time instead
      self.output:mul(1 - self.p)
   end
   return self.output
end

function Dropout:updateGradInput(input, gradOutput)
   if self.inplace then
      self.gradInput:set(gradOutput)
   else
      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
   end
   if self.train then
      if self.p > 0 then
         -- reuse the forward mask: dropped units get zero gradient
         self.gradInput:cmul(self.noise)
      end
   elseif not self.v2 and self.p > 0 then
      self.gradInput:mul(1 - self.p)
   end
   return self.gradInput
end

-- Change the drop probability on the fly.
function Dropout:setp(p)
   self.p = p
end

function Dropout:__tostring__()
   return string.format('%s(%f)', torch.type(self), self.p)
end

function Dropout:clearState()
   if self.noise then
      self.noise:set()
   end
   return Parent.clearState(self)
end
diff --git a/ELU.lua b/ELU.lua
new file mode 100644
index 0000000..48a6caa
--- /dev/null
+++ b/ELU.lua
@@ -0,0 +1,45 @@
local ELU, parent = torch.class('nn.ELU', 'nn.Module')

--[[
   Exponential Linear Unit.
   Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter,
   "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)",
   http://arxiv.org/pdf/1511.07289.pdf
--]]

function ELU:__init(alpha, inplace)
   parent.__init(self)
   self.alpha = alpha or 1
   assert(type(self.alpha) == 'number')
   self.inplace = inplace or false
   assert(type(self.inplace) == 'boolean')
end

function ELU:updateOutput(input)
   -- `or false` guards models deserialized without the inplace field
   input.THNN.ELU_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.alpha,
      self.inplace or false
   )
   return self.output
end

function ELU:updateGradInput(input, gradOutput)
   input.THNN.ELU_updateGradInput(
      input:cdata(),
      gradOutput:cdata(),
      self.gradInput:cdata(),
      self.output:cdata(),
      self.alpha,
      self.inplace or false
   )
   return self.gradInput
end

function ELU:__tostring__()
  return string.format('%s (alpha:%f)', torch.type(self), self.alpha)
end
diff --git a/ErrorMessages.lua b/ErrorMessages.lua
new file mode 100644
index 0000000..a5cbed0
--- /dev/null
+++ b/ErrorMessages.lua
@@ -0,0 +1,19 @@
+
-- Integer tensor types have no nn kernel implementations; install a
-- metatable on each type's .nn namespace so any access raises a clear
-- error instead of an obscure "attempt to index nil" failure.
local errorOnIndex = {
  __index = function(_, key)
    error("nn."..key.." is only supported for Float or Double Tensors.")
  end
}

for _, tensorType in ipairs({
  torch.ByteTensor,
  torch.CharTensor,
  torch.ShortTensor,
  torch.IntTensor,
  torch.LongTensor,
}) do
  tensorType.nn = setmetatable({}, errorOnIndex)
end
diff --git a/Euclidean.lua b/Euclidean.lua
new file mode 100644
index 0000000..8269d13
--- /dev/null
+++ b/Euclidean.lua
@@ -0,0 +1,197 @@
local Euclidean, parent = torch.class('nn.Euclidean', 'nn.Module')

-- Outputs the Euclidean distance between the input and each of
-- outputSize learned templates (the columns of self.weight).
function Euclidean:__init(inputSize, outputSize)
   parent.__init(self)

   self.weight = torch.Tensor(inputSize, outputSize)
   self.gradWeight = torch.Tensor(inputSize, outputSize)

   -- pre-size state tensors for the non-batch case
   self.gradInput:resize(inputSize)
   self.output:resize(outputSize)

   self.fastBackward = true

   self:reset()
end
+
-- Re-initialize weights uniformly in [-stdv, stdv]; the default stdv is
-- 1/sqrt(inputSize) (fan-in scaling).
function Euclidean:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1. / math.sqrt(self.weight:size(1))
   end
   if nn.oldSeed then
      -- legacy RNG path: draw column by column so old seeded runs are
      -- reproducible
      for j = 1, self.weight:size(2) do
         self.weight:select(2, j):apply(function()
            return torch.uniform(-stdv, stdv)
         end)
      end
   else
      self.weight:uniform(-stdv, stdv)
   end
end
+
-- Make `res` a view of `src` with the given sizes, falling back to a
-- reshape (which copies) when src is not contiguous.
local function view(res, src, ...)
   if src:isContiguous() then
      res:view(src, ...)
   else
      res:reshape(src, ...)
   end
end
+
-- Forward pass: output[j] = || x - w_j || for each template column w_j.
-- Leaves x - w_j in self._repeat, which updateGradInput reuses.
function Euclidean:updateOutput(input)
   -- lazy initialize buffers
   self._input = self._input or input.new()
   self._weight = self._weight or self.weight.new()
   self._expand = self._expand or self.output.new()
   self._expand2 = self._expand2 or self.output.new()
   self._repeat = self._repeat or self.output.new()
   self._repeat2 = self._repeat2 or self.output.new()

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   -- y_j = || w_j - x || = || x - w_j ||
   if input:dim() == 1 then
      -- non-batch: view input as (inputSize x 1) so it expands against
      -- the (inputSize x outputSize) weight
      view(self._input, input, inputSize, 1)
      self._expand:expandAs(self._input, self.weight)
      -- the expanded tensor is a stride-0 view; copy to make it writable
      self._repeat:resizeAs(self._expand):copy(self._expand)
      self._repeat:add(-1, self.weight)
      -- 2-norm over the input dimension yields the distances
      self.output:norm(self._repeat, 2, 1)
      self.output:resize(outputSize)
   elseif input:dim() == 2 then
      -- batch: (batch x inputSize x 1) against (1 x inputSize x outputSize)
      local batchSize = input:size(1)

      view(self._input, input, batchSize, inputSize, 1)
      self._expand:expand(self._input, batchSize, inputSize, outputSize)
      -- make the expanded tensor contiguous (requires lots of memory)
      self._repeat:resizeAs(self._expand):copy(self._expand)

      self._weight:view(self.weight, 1, inputSize, outputSize)
      self._expand2:expandAs(self._weight, self._repeat)

      if torch.type(input) == 'torch.CudaTensor' then
         -- requires lots of memory, but minimizes cudaMallocs and loops
         self._repeat2:resizeAs(self._expand2):copy(self._expand2)
         self._repeat:add(-1, self._repeat2)
      else
         self._repeat:add(-1, self._expand2)
      end

      self.output:norm(self._repeat, 2, 2)
      self.output:resize(batchSize, outputSize)
   else
      error"1D or 2D input expected"
   end

   return self.output
end
+
-- Gradient w.r.t. the input.  Reuses self._repeat (x - w_j, left there
-- by updateOutput) and fills self._repeat2, which accGradParameters
-- reads afterwards — the call order forward/backward matters.
function Euclidean:updateGradInput(input, gradOutput)
   if not self.gradInput then
      return
   end

   self._div = self._div or input.new()
   self._output = self._output or self.output.new()
   self._gradOutput = self._gradOutput or input.new()
   self._expand3 = self._expand3 or input.new()

   if not self.fastBackward then
      self:updateOutput(input)
   end

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   --[[
   dy_j   -2 * (w_j - x)     x - w_j
   ---- = ---------------  = -------
    dx    2 || w_j - x ||      y_j
   --]]

   -- to prevent div by zero (NaN) bugs
   self._output:resizeAs(self.output):copy(self.output):add(0.0000001)
   view(self._gradOutput, gradOutput, gradOutput:size())
   -- incoming gradient scaled by 1/y_j
   self._div:cdiv(gradOutput, self._output)
   if input:dim() == 1 then
      self._div:resize(1, outputSize)
      self._expand3:expandAs(self._div, self.weight)

      if torch.type(input) == 'torch.CudaTensor' then
         -- extra contiguous copy avoids slow element loops over
         -- stride-0 views on GPU
         self._repeat2:resizeAs(self._expand3):copy(self._expand3)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand3)
      end

      self.gradInput:sum(self._repeat2, 2)
      self.gradInput:resizeAs(input)
   elseif input:dim() == 2 then
      local batchSize = input:size(1)

      self._div:resize(batchSize, 1, outputSize)
      self._expand3:expand(self._div, batchSize, inputSize, outputSize)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand3):copy(self._expand3)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand3)
      end

      self.gradInput:sum(self._repeat2, 3)
      self.gradInput:resizeAs(input)
   else
      error"1D or 2D input expected"
   end

   return self.gradInput
end
+
-- Gradient w.r.t. the weight templates.  Relies on updateGradInput
-- having left (x - w_j) * gradOutput_j / y_j in self._repeat2; the
-- weight gradient is just its negation (summed over the batch).
function Euclidean:accGradParameters(input, gradOutput, scale)
   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
   scale = scale or 1

   --[[
   dy_j    2 * (w_j - x)     w_j - x
   ---- = ---------------  = -------
   dw_j   2 || w_j - x ||      y_j
   --]]
   -- assumes a preceding call to updateGradInput
   if input:dim() == 1 then
      self.gradWeight:add(-scale, self._repeat2)
   elseif input:dim() == 2 then
      -- sum per-sample contributions over the batch dimension
      self._sum = self._sum or input.new()
      self._sum:sum(self._repeat2, 1)
      self._sum:resize(inputSize, outputSize)
      self.gradWeight:add(-scale, self._sum)
   else
      error"1D or 2D input expected"
   end
end
+
-- Convert parameters to the given tensor type; scratch buffers are
-- released first so they aren't needlessly converted or re-allocated.
function Euclidean:type(type, tensorCache)
   if type then
      -- prevent premature memory allocations
      self:clearState()
   end
   return parent.type(self, type, tensorCache)
end
+
-- Release all lazily-created scratch buffers; they are rebuilt on the
-- next forward/backward pass.
function Euclidean:clearState()
   nn.utils.clear(self, {
      '_input',
      '_output',
      '_gradOutput',
      '_weight',
      '_div',
      '_sum',
      '_expand',
      '_expand2',
      '_expand3',
      '_repeat',
      '_repeat2',
   })
   return parent.clearState(self)
end
diff --git a/Exp.lua b/Exp.lua
new file mode 100644
index 0000000..f415690
--- /dev/null
+++ b/Exp.lua
@@ -0,0 +1,9 @@
local Exp = torch.class('nn.Exp', 'nn.Module')

-- Element-wise exponential: y = exp(x).
function Exp:updateOutput(input)
  self.output:exp(input)
  return self.output
end

-- dL/dx = dL/dy * y, reusing the cached forward output.
function Exp:updateGradInput(input, gradOutput)
  self.gradInput:cmul(self.output, gradOutput)
  return self.gradInput
end
diff --git a/FlattenTable.lua b/FlattenTable.lua
new file mode 100644
index 0000000..1c18255
--- /dev/null
+++ b/FlattenTable.lua
@@ -0,0 +1,106 @@
local FlattenTable, parent = torch.class('nn.FlattenTable', 'nn.Module')

-- Flattens an arbitrarily nested table of tensors into one flat table.
function FlattenTable:__init()
  parent.__init(self)
  self.output = {}
  -- mirrors the input's nesting; leaves hold indices into self.output
  self.input_map = {}
  self.gradInput = {}
end
+
-- Recursively append every leaf of `input` to the flat array `output`.
-- Returns a map with the same nesting as `input` whose leaves are the
-- indices of the corresponding entries in `output` (forward DFS order).
local function flatten(output, input)
  if type(input) ~= 'table' then
    local idx = #output + 1
    output[idx] = input
    return idx
  end
  local map = {}
  for i = 1, #input do
    map[i] = flatten(output, input[i])
  end
  return map
end
+
-- Recursively verify that `input_map` still describes `input`: nesting
-- shapes agree and every leaf of `input` is the exact value stored at
-- its recorded index in `output`.  Used to decide whether the cached
-- flat output must be rebuilt.
local function checkMapping(output, input, input_map)
  if input_map == nil or output == nil or input == nil then
    return false
  end
  if type(input) ~= 'table' then
    -- leaf: the map entry must be an index pointing at this very value
    return type(input_map) == 'number' and output[input_map] == input
  end
  if type(input_map) ~= 'table' or #input ~= #input_map then
    return false
  end
  -- forward DFS order, bailing out on the first mismatch
  for i = 1, #input do
    if not checkMapping(output, input[i], input_map[i]) then
      return false
    end
  end
  return true
end
+
-- BPROP needs a gradInput shaped like the original input.  Rebuild a
-- table with the same nesting as `input_map`, replacing each leaf index
-- by the corresponding element of the flat `gradOutput`.
local function inverseFlatten(gradOutput, input_map)
  if type(input_map) ~= 'table' then
    return gradOutput[input_map]
  end
  local result = {}
  for i = 1, #input_map do
    result[i] = inverseFlatten(gradOutput, input_map[i])
  end
  return result
end
+
-- Flatten the nested input table.  The flat output is rebuilt only when
-- the cached mapping no longer matches the current input (cheap DFS
-- check), so repeated calls with the same structure reuse self.output.
function FlattenTable:updateOutput(input)
  assert(type(input) == 'table', 'input must be a table')
  if checkMapping(self.output, input, self.input_map) then
    return self.output
  end
  self.output = {}
  self.input_map = flatten(self.output, input)
  return self.output
end
+
-- Reshape the flat gradOutput table into the nesting of the input.
-- Assumes forward() has already been called, so self.input_map is valid.
function FlattenTable:updateGradInput(input, gradOutput)
  assert(type(input) == 'table', 'input must be a table')
  -- Bug fix: this assert previously re-checked `input` instead of
  -- `gradOutput` (copy-paste), so a non-table gradOutput slipped through.
  assert(type(gradOutput) == 'table', 'gradOutput must be a table')
  -- If the input changes between the updateOutput and updateGradInput call,
  -- then we may have to rebuild the input_map!  However, let's assume that
  -- the input_map is valid and that forward has already been called.

  -- However, we should check that the gradInput is valid:
  if not checkMapping(gradOutput, self.gradInput, self.input_map) then
    self.gradInput = inverseFlatten(gradOutput, self.input_map)
  end

  return self.gradInput
end
+
-- FlattenTable only stores references, so no tensor conversion is
-- needed; just drop the cached mapping and output/gradInput tables.
function FlattenTable:type(type, tensorCache)
  -- Bug fix: return self (clearState returns it) so chained calls like
  -- net:float() don't evaluate to nil, matching the Module:type()
  -- convention that every other module follows.
  return self:clearState()
end
+
-- Drop the cached mapping; the parent clears output/gradInput tables.
function FlattenTable:clearState()
  self.input_map = {}
  return parent.clearState(self)
end
diff --git a/GradientReversal.lua b/GradientReversal.lua
new file mode 100644
index 0000000..c08b1df
--- /dev/null
+++ b/GradientReversal.lua
@@ -0,0 +1,32 @@
local GradientReversal, parent = torch.class('nn.GradientReversal', 'nn.Module')

GradientReversal.__version = 2

-- Identity on the forward pass; multiplies gradients by -lambda on the
-- backward pass.
function GradientReversal:__init(lambda)
   parent.__init(self)
   self.lambda = lambda or 1
end

function GradientReversal:setLambda(lambda)
  self.lambda = lambda
end

function GradientReversal:updateOutput(input)
   -- share storage instead of copying: forward is the identity
   self.output:set(input)
   return self.output
end

function GradientReversal:updateGradInput(input, gradOutput)
   self.gradInput:resizeAs(gradOutput):copy(gradOutput):mul(-self.lambda)
   return self.gradInput
end

-- Deserialization hook: models saved before version 2 had no lambda.
function GradientReversal:read(file, version)
   parent.read(self, file)
   if version < 2 then
      self.lambda = 1
   end
end
diff --git a/HardShrink.lua b/HardShrink.lua
new file mode 100644
index 0000000..85ff590
--- /dev/null
+++ b/HardShrink.lua
@@ -0,0 +1,25 @@
local HardShrink, parent = torch.class('nn.HardShrink', 'nn.Module')

-- Hard shrinkage: y = x when |x| > lambda, 0 otherwise (default 0.5).
function HardShrink:__init(lam)
   parent.__init(self)
   self.lambda = lam or 0.5
end

function HardShrink:updateOutput(input)
   input.THNN.HardShrink_updateOutput(
      input:cdata(), self.output:cdata(), self.lambda)
   return self.output
end

function HardShrink:updateGradInput(input, gradOutput)
   input.THNN.HardShrink_updateGradInput(
      input:cdata(), gradOutput:cdata(), self.gradInput:cdata(), self.lambda)
   return self.gradInput
end
diff --git a/HardTanh.lua b/HardTanh.lua
new file mode 100644
index 0000000..d3449a1
--- /dev/null
+++ b/HardTanh.lua
@@ -0,0 +1,31 @@
local HardTanh, parent = torch.class('nn.HardTanh', 'nn.Module')

-- Clamps the input element-wise to [min_value, max_value] (default [-1, 1]).
function HardTanh:__init(min_value, max_value)
   parent.__init(self)
   self.min_val = min_value or -1
   self.max_val = max_value or 1
   assert(self.max_val > self.min_val, 'max_value must be larger than min_value')
end

function HardTanh:updateOutput(input)
   -- re-default the bounds for models serialized before they existed
   self.min_val = self.min_val or -1
   self.max_val = self.max_val or 1
   input.THNN.HardTanh_updateOutput(
      input:cdata(), self.output:cdata(), self.min_val, self.max_val)
   return self.output
end

function HardTanh:updateGradInput(input, gradOutput)
   input.THNN.HardTanh_updateGradInput(
      input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
      self.min_val, self.max_val)
   return self.gradInput
end
diff --git a/HingeEmbeddingCriterion.lua b/HingeEmbeddingCriterion.lua
new file mode 100644
index 0000000..fe8f1a6
--- /dev/null
+++ b/HingeEmbeddingCriterion.lua
@@ -0,0 +1,43 @@
local HingeEmbeddingCriterion, parent = torch.class('nn.HingeEmbeddingCriterion', 'nn.Criterion')

-- loss(x, y) = x                   when y ==  1
--              max(0, margin - x)  when y == -1
-- summed over elements, divided by nElement when sizeAverage (default).
function HingeEmbeddingCriterion:__init(margin)
   parent.__init(self)
   self.margin = margin or 1
   self.sizeAverage = true
end

function HingeEmbeddingCriterion:updateOutput(input,y)
   self.buffer = self.buffer or input.new()
   if not torch.isTensor(y) then
      -- scalar label: wrap it in a 1-element tensor so masking works
      self.ty = self.ty or input.new():resize(1)
      self.ty[1]=y
      y=self.ty
   end

   -- positive-pair term: sum of x where y == 1
   self.buffer:resizeAs(input):copy(input)
   self.buffer[torch.eq(y, -1)] = 0
   self.output = self.buffer:sum()

   -- negative-pair term: sum of max(0, margin - x) where y == -1
   self.buffer:fill(self.margin):add(-1, input)
   self.buffer:cmax(0)
   self.buffer[torch.eq(y, 1)] = 0
   self.output = self.output + self.buffer:sum()

   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.output = self.output / input:nElement()
   end

   return self.output
end

function HingeEmbeddingCriterion:updateGradInput(input, y)
   -- NOTE(review): with a scalar y this reuses self.ty, which only
   -- exists after updateOutput has been called with a scalar label —
   -- confirm callers always run forward first.
   if not torch.isTensor(y) then self.ty[1]=y; y=self.ty end
   -- gradient is y (+1/-1); zero it where the hinge is inactive
   -- (y == -1 and x beyond the margin)
   self.gradInput:resizeAs(input):copy(y)
   self.gradInput[torch.cmul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0

   if (self.sizeAverage == nil or self.sizeAverage == true) then
      self.gradInput:mul(1 / input:nElement())
   end

   return self.gradInput
end
diff --git a/Identity.lua b/Identity.lua
new file mode 100644
index 0000000..5e6ccb6
--- /dev/null
+++ b/Identity.lua
@@ -0,0 +1,30 @@
local Identity, _ = torch.class('nn.Identity', 'nn.Module')

-- Passes its input through unchanged; gradients likewise.
function Identity:updateOutput(input)
   self.output = input
   return self.output
end

function Identity:updateGradInput(input, gradOutput)
   self.gradInput = gradOutput
   return self.gradInput
end

function Identity:clearState()
   -- output/gradInput may reference caller-owned tensors, so replace the
   -- fields with fresh values rather than calling :set(), which would
   -- clobber shared storage.
   local function reset(field)
      local value = self[field]
      if not value then return end
      if torch.isTensor(value) then
         self[field] = value.new()
      elseif type(value) == 'table' then
         self[field] = {}
      else
         self[field] = nil
      end
   end
   reset('output')
   reset('gradInput')
   return self
end
diff --git a/Index.lua b/Index.lua
new file mode 100644
index 0000000..8ae6063
--- /dev/null
+++ b/Index.lua
@@ -0,0 +1,25 @@
local Index, parent = torch.class('nn.Index', 'nn.Module')

-- Selects slices of input[1] along `dimension` using the index tensor
-- input[2]; i.e. output = t:index(dimension, index).
function Index:__init(dimension)
    parent.__init(self)
    self.dimension = dimension
    self.gradInput = {self.gradInput}
end

function Index:updateOutput(input)
    local tensor, indices = input[1], input[2]
    self.output:index(tensor, self.dimension, indices)
    return self.output
end

function Index:updateGradInput(input, gradOutput)
    local tensor, indices = input[1], input[2]
    -- scatter-add the output gradients back into place; the index input
    -- itself receives no gradient
    local gradTensor = self.gradInput[1]
    gradTensor:resizeAs(tensor):zero()
    gradTensor:indexAdd(self.dimension, indices, gradOutput)
    return self.gradInput
end
+
diff --git a/Jacobian.lua b/Jacobian.lua
new file mode 100644
index 0000000..64187c3
--- /dev/null
+++ b/Jacobian.lua
@@ -0,0 +1,387 @@
nn.Jacobian = {}

-- Jacobian via backprop: for each output unit, backpropagate a one-hot
-- gradOutput and record the resulting gradient of the input (default)
-- or, when `param`/`dparam` are given, of the parameters.
function nn.Jacobian.backward(module, input, param, dparam)
   local useParam = param ~= nil
   param = param or input
   module:forward(input)
   local dout = module.output.new():resizeAs(module.output)
   -- flat 1D alias over dout's storage
   local sdout = module.output.new(dout:storage(), 1, dout:nElement())
   local jacobian = torch.Tensor(param:nElement(), dout:nElement()):zero()

   for i = 1, sdout:nElement() do
      dout:zero()
      sdout[i] = 1
      module:zeroGradParameters()
      local din = module:updateGradInput(input, dout)
      module:accGradParameters(input, dout)
      if useParam then
         jacobian:select(2, i):copy(dparam)
      else
         jacobian:select(2, i):copy(din)
      end
   end
   return jacobian
end
+
-- Parameter Jacobian for modules implementing accUpdateGradParameters
-- (in-place update with a learning rate).  For each output unit i the
-- parameters are restored, a one-hot gradOutput is backpropagated with
-- lr = 1, and the updated `param` (original minus gradient) is recorded
-- as column i of the result.
function nn.Jacobian.backwardUpdate(module, input, param)

   -- output deriv
   module:forward(input)
   local dout = module.output.new():resizeAs(module.output)
   -- 1D view
   local sdout = module.output.new(dout:storage(),1,dout:nElement())
   -- jacobian matrix to calculate
   local jacobian = torch.Tensor(param:nElement(),dout:nElement()):zero()

   -- original param
   local params = module:parameters()
   local origparams = {}
   for j=1,#params do
      table.insert(origparams, params[j]:clone())
   end

   for i=1,sdout:nElement() do
      -- restore parameters before each one-hot backprop so updates
      -- don't accumulate across output units
      for j=1,#params do
         params[j]:copy(origparams[j])
      end
      dout:zero()
      sdout[i] = 1
      module:updateGradInput(input, dout)
      module:accUpdateGradParameters(input, dout, 1)
      jacobian:select(2,i):copy(param)
   end

   -- leave the module's parameters as they were found
   for j=1,#params do
      params[j]:copy(origparams[j])
   end

   return jacobian
end
+
-- Finite-difference Jacobian: perturb each element of `param` (defaults
-- to the input) by +/- `perturbation` (default 1e-6) and record the
-- central difference of the module output.
function nn.Jacobian.forward(module, input, param, perturbation)
   param = param or input
   -- perturbation amount
   perturbation = perturbation or 1e-6
   -- 1D view of input; shares param's storage, so writing sin[i] below
   -- perturbs the actual value the module reads
   --local tst = param:storage()
   local sin = param.new(param):resize(param:nElement())--param.new(tst,1,tst:size())
   -- jacobian matrix to calculate
   local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())

   local outa = torch.Tensor(jacobian:size(2))
   local outb = torch.Tensor(jacobian:size(2))

   for i=1,sin:nElement() do
      local orig = sin[i]
      sin[i] = orig - perturbation
      outa:copy(module:forward(input))
      sin[i] = orig + perturbation
      outb:copy(module:forward(input))
      sin[i] = orig

      -- central difference: (f(x+h) - f(x-h)) / 2h
      outb:add(-1,outa):div(2*perturbation)
      jacobian:select(1,i):copy(outb)
   end

   return jacobian
end
+
function nn.Jacobian.backwardDiagHessian(module, input, diagHessianParamName)
   -- Compute the second derivatives (diagonal Hessian elements)
   -- by backpropagation (using the code from hessian.lua).
   --
   -- This function computes the diagonal Hessian elements of the following function:
   --
   -- F(x_1, x_2, ..., x_n) = y_1^2/2 + y_2^2/2 + ... + y_m^2/2,
   --
   -- where
   -- x_1, ..., x_n are the input values and parameters of the given module,
   -- y_1, ..., y_m are the output values of the given module.
   --
   -- All x_i and y_i values are scalars here. In other words,
   -- x_1, ..., x_n denote the scalar elements of the module input tensor,
   --             the scalar elements of module.weight,
   --             and the scalar elements of module.bias;
   -- y_1, ..., y_m are the scalar elements of the module output tensor.
   --
   -- The diagonal Hessian elements of F are computed with respect to
   -- the module input values and parameters (x_1, .., x_n).
   --
   -- The function F is chosen for its convenient properties:
   --
   -- dF / dy_i = y_i,
   -- d^2F / dy_i^2 = 1.
   --
   -- In other words, the diagonal Hessian elements of F with respect
   -- to the module OUTPUT values (y_1, ... y_m) are equal to 1.
   --
   -- Because of that, computing the diagonal Hessian elements of F
   -- with respect to the module INPUT values and PARAMETERS (x_1, ..., x_n)
   -- can be done by calling updateDiagHessianInput() and accDiagHessianParameters()
   -- using a tensor of ones as diagHessianOutput.
   --
   -- NOTE(review): requires the hessian.lua extensions — the module must
   -- provide diagHessianWeight/diagHessianBias and the
   -- updateDiagHessianInput/accDiagHessianParameters methods; callers
   -- such as testDiagHessian invoke initDiagHessianParameters() first.

   module:forward(input)
   local diagHessianOutput = module.output.new():resizeAs(module.output):fill(1)

   module.diagHessianWeight:zero()
   module.diagHessianBias:zero()
   module:updateDiagHessianInput(input, diagHessianOutput)
   module:accDiagHessianParameters(input, diagHessianOutput)

   return module[diagHessianParamName]
end
+
function nn.Jacobian.linearModuleDiagHessian(module, input, gradParamName)
   -- Compute the second derivatives (diagonal Hessian elements)
   -- from the first derivatives for the given module
   -- (without using the code from hessian.lua).
   --
   -- The given module is assumed to be linear with respect to its inputs and weights
   -- (like nn.Linear, nn.SpatialConvolution, etc.)
   --
   -- This function computes the diagonal Hessian elements of the following function:
   --
   -- F(x_1, x_2, ..., x_n) = y_1^2/2 + y_2^2/2 + ... + y_m^2/2.
   --
   -- (See the the comment for nn.Jacobian.backwardDiagHessian() for explanation.)
   --
   -- The first derivatives of F with respect to
   -- the module inputs and parameters (x_1, ..., x_n) are:
   --
   -- dF / dx_i = \sum_k (dF / dy_k) (dy_k / dx_i).
   --
   -- The second derivatives are:
   --
   -- d^2F / dx_i = \sum_k [(d^2F / dy_k^2) (dy_k / dx_i)^2 + (dF / dy_k) (d^2y_k / dx_i^2)].
   --
   -- The second derivatives of F with respect to the module outputs (y_1, ..., y_m)
   -- are equal to 1, so:
   --
   -- d^2F / dx_i = \sum_k [(dy_k / dx_i)^2 + (dF / dy_k) (d^2y_k / dx_i^2)].
   --
   -- Assuming the linearity of module outputs (y_1, ..., y_m)
   -- with respect to module inputs and parameters (x_1, ..., x_n),
   -- we have (d^2y_k / dx_i^2) = 0,
   -- and the expression finally becomes:
   --
   -- d^2F / dx_i = \sum_k (dy_k / dx_i)^2.
   --
   -- The first derivatives (dy_k / dx_i) are computed by normal backpropagation,
   -- using updateGradInput() and accGradParameters().

   local gradParam = module[gradParamName]

   local diagHessian = gradParam.new():resize(gradParam:nElement()):zero()

   module:forward(input)
   local gradOutput = module.output.new():resizeAs(module.output)
   local gradOutput1D = gradOutput:view(gradOutput:nElement())

   -- one-hot sweep over output units, accumulating squared gradients
   for i=1,gradOutput:nElement() do
      gradOutput1D:zero()
      gradOutput1D[i] = 1
      -- clear accumulators so each unit's contribution is isolated
      -- (gradBias only exists when the module has a bias)
      module.gradWeight:zero()
      if module.bias then
         module.gradBias:zero()
      end
      module:updateGradInput(input, gradOutput)
      module:accGradParameters(input, gradOutput)
      -- diagHessian += gradParam .* gradParam  (i.e. (dy_i/dx)^2)
      diagHessian:addcmul(gradParam, gradParam)
   end

   return diagHessian
end
+
-- Finite-difference counterpart of backwardUpdate: each row i is built
-- as param[i] - FD-gradient, matching the param-minus-gradient values
-- backwardUpdate records after accUpdateGradParameters with lr = 1.
function nn.Jacobian.forwardUpdate(module, input, param, perturbation)
   -- perturbation amount
   perturbation = perturbation or 1e-6
   -- 1D view of input; shares param's storage, so writing sin[i] below
   -- perturbs the actual value the module reads
   --local tst = param:storage()
   local sin =  param.new(param):resize(param:nElement())--param.new(tst,1,tst:size())
   -- jacobian matrix to calculate
   local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())

   local outa = torch.Tensor(jacobian:size(2))
   local outb = torch.Tensor(jacobian:size(2))

   for i=1,sin:nElement() do
      local orig = sin[i]
      sin[i] = orig - perturbation
      outa:copy(module:forward(input))
      sin[i] = orig + perturbation
      outb:copy(module:forward(input))
      sin[i] = orig

      -- central-difference gradient w.r.t. param element i
      outb:add(-1,outa):div(2*perturbation)
      jacobian:select(1,i):copy(outb)
      -- negate and add the parameter value: row = param[i] - gradient
      jacobian:select(1,i):mul(-1)
      jacobian:select(1,i):add(sin[i])
   end
   return jacobian
end
+
-- Max absolute difference between the finite-difference and backprop
-- Jacobians w.r.t. the input, after randomizing the input in
-- [minval, maxval] (defaults [-2, 2]).
function nn.Jacobian.testJacobian(module, input, minval, maxval, perturbation)
   minval = minval or -2
   maxval = maxval or 2
   local range = maxval - minval
   input:copy(torch.rand(input:nElement()):mul(range):add(minval))
   local jac_fprop = nn.Jacobian.forward(module, input, input, perturbation)
   local jac_bprop = nn.Jacobian.backward(module, input)
   -- note: renamed from `error`, which shadowed the Lua built-in
   local diff = jac_fprop - jac_bprop
   return diff:abs():max()
end
+
-- Same check as testJacobian, but for a parameter tensor `param` and
-- its gradient accumulator `dparam`; both input and param are
-- randomized in [minval, maxval] first.
function nn.Jacobian.testJacobianParameters(module, input, param, dparam, minval, maxval, perturbation)
   minval = minval or -2
   maxval = maxval or 2
   local range = maxval - minval
   input:copy(torch.rand(input:nElement()):mul(range):add(minval))
   param:copy(torch.rand(param:nElement()):mul(range):add(minval))
   local jac_bprop = nn.Jacobian.backward(module, input, param, dparam)
   local jac_fprop = nn.Jacobian.forward(module, input, param, perturbation)
   local diff = jac_fprop - jac_bprop
   return diff:abs():max()
end
+
-- Check the accUpdateGradParameters path: compares the parameter values
-- produced by backwardUpdate against forwardUpdate's finite-difference
-- prediction.
function nn.Jacobian.testJacobianUpdateParameters(module, input, param, minval, maxval, perturbation)
   minval = minval or -2
   maxval = maxval or 2
   local range = maxval - minval
   input:copy(torch.rand(input:nElement()):mul(range):add(minval))
   param:copy(torch.rand(param:nElement()):mul(range):add(minval))
   local params_bprop = nn.Jacobian.backwardUpdate(module, input, param)
   local params_fprop = nn.Jacobian.forwardUpdate(module, input, param, perturbation)
   local diff = params_fprop - params_bprop
   return diff:abs():max()
end
+
-- Compute the diagonal Hessian elements of the same function two ways
-- (hessian.lua backprop vs the linear-module closed form) and return
-- the max absolute difference.
function nn.Jacobian.testDiagHessian(module, input, gradParamName, diagHessianParamName, minval, maxval)
   minval = minval or -2
   maxval = maxval or 2
   local range = maxval - minval
   input:copy(torch.rand(input:nElement()):mul(range):add(minval))
   module:initDiagHessianParameters()
   local h_bprop = nn.Jacobian.backwardDiagHessian(module, input, diagHessianParamName)
   local h_linearmodule = nn.Jacobian.linearModuleDiagHessian(module, input, gradParamName)
   local diff = h_bprop - h_linearmodule
   return diff:abs():max()
end
+
+-- Convenience wrapper: diagonal-Hessian check wrt the module input.
+function nn.Jacobian.testDiagHessianInput(module, input, minval, maxval)
+   return nn.Jacobian.testDiagHessian(module, input, 'gradInput', 'diagHessianInput', minval, maxval)
+end
+
+-- Convenience wrapper: diagonal-Hessian check wrt the module weight.
+function nn.Jacobian.testDiagHessianWeight(module, input, minval, maxval)
+   return nn.Jacobian.testDiagHessian(module, input, 'gradWeight', 'diagHessianWeight', minval, maxval)
+end
+
+-- Convenience wrapper: diagonal-Hessian check wrt the module bias.
+function nn.Jacobian.testDiagHessianBias(module, input, minval, maxval)
+   return nn.Jacobian.testDiagHessian(module, input, 'gradBias', 'diagHessianBias', minval, maxval)
+end
+
+-- Serialization round-trip test: run the module forward/backward, write it
+-- to a temporary file (after clearState), read it back, rerun, and return
+-- the max abs differences of (1) the outputs and (2) the gradInputs between
+-- the original and the deserialized instance.
+function nn.Jacobian.testIO(module,input, minval, maxval)
+   minval = minval or -2
+   maxval = maxval or 2
+   local inrange = maxval - minval
+
+   -- run module
+   module:forward(input)
+   -- random gradOutput in [minval, maxval], same shape as the output
+   local go = module.output:clone():copy(torch.rand(module.output:nElement()):mul(inrange):add(minval))
+   module:zeroGradParameters()
+   module:updateGradInput(input,go)
+   module:accGradParameters(input,go)
+
+   -- snapshot results before clearState() wipes the module's buffers
+   local fo = module.output:clone()
+   local bo = module.gradInput:clone()
+
+   -- write module
+   local filename = os.tmpname()
+   local f = torch.DiskFile(filename, 'w'):binary()
+   -- call clearState and check that it returns itself
+   assert(module == module:clearState(),'clearState did not return self')
+   f:writeObject(module)
+   f:close()
+   -- read module
+   local m = torch.DiskFile(filename):binary():readObject()
+   m:forward(input)
+   m:zeroGradParameters()
+   m:updateGradInput(input,go)
+   m:accGradParameters(input,go)
+   -- cleanup
+   os.remove(filename)
+
+   local fo2 = m.output:clone()
+   local bo2 = m.gradInput:clone()
+
+   local errf = fo - fo2
+   local errb = bo - bo2
+   -- a module may legitimately have an empty gradInput; report 0 then
+   return errf:abs():max(), errb:numel() == 0 and 0 or errb:abs():max()
+end
+
+-- Cross-check the four parameter-update code paths against each other:
+-- accGradParameters, accUpdateGradParameters, and both again with weights
+-- shared between two clones.  `weight`/`gradWeight` are field names on the
+-- module (e.g. 'weight'/'gradWeight').  Returns a table of named error
+-- norms; all entries should be ~0 for a correct module.
+function nn.Jacobian.testAllUpdate(module, input, weight, gradWeight)
+   local gradOutput
+   local lr = torch.uniform(0.1, 1)
+   local errors = {}
+
+   -- accGradParameters
+   local maccgp = module:clone()
+   local weightc = maccgp[weight]:clone()
+   maccgp:forward(input)
+   gradOutput = torch.rand(maccgp.output:size())
+   maccgp:zeroGradParameters()
+   maccgp:updateGradInput(input, gradOutput)
+   maccgp:accGradParameters(input, gradOutput)
+   maccgp:updateParameters(lr)
+   -- expected: w_new = w_old - lr * gradWeight
+   errors["accGradParameters"] = (weightc-maccgp[gradWeight]*lr-maccgp[weight]):norm()
+
+   -- accUpdateGradParameters
+   local maccugp = module:clone()
+   maccugp:forward(input)
+   maccugp:updateGradInput(input, gradOutput)
+   maccugp:accUpdateGradParameters(input, gradOutput, lr)
+   -- must land on the same weights as the accGradParameters path above
+   errors["accUpdateGradParameters"] = (maccugp[weight]-maccgp[weight]):norm()
+
+   -- shared, accGradParameters
+   local macsh1 = module:clone()
+   local macsh2 = module:clone()
+   macsh2:share(macsh1, weight)
+   macsh1:forward(input)
+   macsh2:forward(input)
+   macsh1:zeroGradParameters()
+   macsh2:zeroGradParameters()
+   macsh1:updateGradInput(input, gradOutput)
+   macsh2:updateGradInput(input, gradOutput)
+   macsh1:accGradParameters(input, gradOutput)
+   macsh2:accGradParameters(input, gradOutput)
+   macsh1:updateParameters(lr)
+   macsh2:updateParameters(lr)
+   -- two modules share one weight tensor, so the effective step is 2*lr
+   local err = (weightc-maccgp[gradWeight]*(lr*2)-macsh1[weight]):norm()
+   err = err + (weightc-maccgp[gradWeight]*(lr*2)-macsh2[weight]):norm()
+   errors["accGradParameters [shared]"] = err
+
+   -- shared, accUpdateGradParameters
+   local macshu1 = module:clone()
+   local macshu2 = module:clone()
+   macshu2:share(macshu1, weight)
+   macshu1:forward(input)
+   macshu2:forward(input)
+   macshu1:updateGradInput(input, gradOutput)
+   macshu2:updateGradInput(input, gradOutput)
+   macshu1:accUpdateGradParameters(input, gradOutput, lr)
+   macshu2:accUpdateGradParameters(input, gradOutput, lr)
+   err = (weightc-maccgp[gradWeight]*(lr*2)-macshu1[weight]):norm()
+   err = err + (weightc-maccgp[gradWeight]*(lr*2)-macshu2[weight]):norm()
+   errors["accUpdateGradParameters [shared]"] = err
+
+   return errors
+end
diff --git a/JoinTable.lua b/JoinTable.lua
new file mode 100644
index 0000000..0d20fb9
--- /dev/null
+++ b/JoinTable.lua
@@ -0,0 +1,74 @@
+local JoinTable, parent = torch.class('nn.JoinTable', 'nn.Module')
+
+-- Takes a table of tensors and concatenates them along dimension `dimension`.
+-- `nInputDims` (optional) declares how many dimensions a non-batched input
+-- has; when the actual inputs carry one extra (leading batch) dimension the
+-- join dimension is shifted right automatically.
+function JoinTable:__init(dimension, nInputDims)
+   parent.__init(self)
+   self.size = torch.LongStorage()
+   self.dimension = dimension
+   self.gradInput = {}
+   self.nInputDims = nInputDims
+end
+
+function JoinTable:_getPositiveDimension(input)
+   local dimension = self.dimension
+   if dimension < 0 then
+      -- Negative dimensions count from the end.  `input` is a *table* of
+      -- tensors, so the dimensionality must be read from its first entry;
+      -- the previous code called input:dim() on the table, which errors.
+      dimension = input[1]:dim() + dimension + 1
+   elseif self.nInputDims and input[1]:dim()==(self.nInputDims+1) then
+      dimension = dimension + 1
+   end
+   return dimension
+end
+
+function JoinTable:updateOutput(input)
+   local dimension = self:_getPositiveDimension(input)
+
+   -- first pass: compute the output size (sum of sizes along `dimension`)
+   for i=1,#input do
+      local currentOutput = input[i]
+      if i == 1 then
+         self.size:resize(currentOutput:dim()):copy(currentOutput:size())
+      else
+         self.size[dimension] = self.size[dimension]
+            + currentOutput:size(dimension)
+      end
+   end
+   self.output:resize(self.size)
+
+   -- second pass: copy each input into its slice of the output
+   local offset = 1
+   for i=1,#input do
+      local currentOutput = input[i]
+      self.output:narrow(dimension, offset,
+         currentOutput:size(dimension)):copy(currentOutput)
+      offset = offset + currentOutput:size(dimension)
+   end
+   return self.output
+end
+
+function JoinTable:updateGradInput(input, gradOutput)
+   local dimension = self:_getPositiveDimension(input)
+
+   for i=1,#input do
+      if self.gradInput[i] == nil then
+         self.gradInput[i] = input[i].new()
+      end
+      self.gradInput[i]:resizeAs(input[i])
+   end
+
+   -- clear out invalid gradInputs (stale entries from a larger prior input)
+   for i=#input+1, #self.gradInput do
+      self.gradInput[i] = nil
+   end
+
+   -- split gradOutput back into per-input slices
+   local offset = 1
+   for i=1,#input do
+      local currentOutput = input[i]
+      local currentGradInput = gradOutput:narrow(dimension, offset,
+                      currentOutput:size(dimension))
+      self.gradInput[i]:copy(currentGradInput)
+      offset = offset + currentOutput:size(dimension)
+   end
+   return self.gradInput
+end
+
+function JoinTable:type(type, tensorCache)
+   -- drop the old per-input gradient tensors; they will be re-created with
+   -- the correct type on the next backward pass
+   self.gradInput = {}
+   return parent.type(self, type, tensorCache)
+end
diff --git a/L1Cost.lua b/L1Cost.lua
new file mode 100644
index 0000000..6b58e0e
--- /dev/null
+++ b/L1Cost.lua
@@ -0,0 +1,30 @@
+local THNN = require 'nn.THNN'
+local L1Cost, parent = torch.class('nn.L1Cost','nn.Criterion')
+
+-- Criterion computing the L1 norm of the input: loss = sum_i |x_i|.
+-- It takes no target argument.
+function L1Cost:__init()
+   parent.__init(self)
+end
+
+function L1Cost:updateOutput(input)
+   -- THNN writes the scalar loss into a 1-element tensor; unwrap it below
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.L1Cost_updateOutput(
+      input:cdata(),
+      self.output_tensor:cdata()
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function L1Cost:updateGradInput(input)
+   -- gradient is sign(input); no gradOutput is needed (THNN.NULL)
+   input.THNN.L1Cost_updateGradInput(
+      input:cdata(),
+      THNN.NULL,
+      self.gradInput:cdata()
+   )
+   return self.gradInput
+end
+
+function L1Cost:clearState()
+   -- release the scratch tensor's storage but keep the module usable
+   if self.output_tensor then self.output_tensor:set() end
+   return parent.clearState(self)
+end
diff --git a/L1HingeEmbeddingCriterion.lua b/L1HingeEmbeddingCriterion.lua
new file mode 100644
index 0000000..6957278
--- /dev/null
+++ b/L1HingeEmbeddingCriterion.lua
@@ -0,0 +1,41 @@
+local L1HingeEmbeddingCriterion, parent = torch.class('nn.L1HingeEmbeddingCriterion', 'nn.Criterion')
+
+-- Criterion on a pair of tensors {x1, x2} with label y in {1, -1}:
+--   y =  1: loss = ||x1 - x2||_1
+--   y = -1: loss = max(0, margin - ||x1 - x2||_1)
+function L1HingeEmbeddingCriterion:__init(margin)
+   parent.__init(self)
+   margin = margin or 1
+   self.margin = margin
+   self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+function L1HingeEmbeddingCriterion:updateOutput(input,y)
+   self.output=input[1]:dist(input[2],1);
+   if y == -1 then
+	 self.output = math.max(0,self.margin - self.output);
+   end
+   return self.output
+end
+
+
+-- Element-wise sign used for the L1 gradient; ties at 0 are broken with a
+-- random +/-1 (2*random(2)-3 is 1 or -1).
+local function mathsign(t)
+   if t>0 then return 1; end
+   if t<0 then return -1; end
+   return 2*torch.random(2)-3;
+end
+
+function L1HingeEmbeddingCriterion:updateGradInput(input, y)
+  self.gradInput[1]:resizeAs(input[1])
+  self.gradInput[2]:resizeAs(input[2])
+  -- gradInput[1] <- x1 - x2, then take its sign for the L1 gradient
+  self.gradInput[1]:copy(input[1])
+  self.gradInput[1]:add(-1, input[2])
+  local dist = self.gradInput[1]:norm(1);
+  self.gradInput[1]:apply(mathsign)    -- L1 gradient
+  if y == -1 then -- just to avoid a mul by 1
+   if dist > self.margin then
+     -- hinge is inactive: zero gradient
+     self.gradInput[1]:zero()
+   else
+     self.gradInput[1]:mul(-1)
+   end
+  end
+  -- gradient wrt x2 is the negative of the gradient wrt x1
+  self.gradInput[2]:zero():add(-1, self.gradInput[1])
+  return self.gradInput
+end
diff --git a/L1Penalty.lua b/L1Penalty.lua
new file mode 100644
index 0000000..24ee769
--- /dev/null
+++ b/L1Penalty.lua
@@ -0,0 +1,42 @@
+local L1Penalty, parent = torch.class('nn.L1Penalty','nn.Module')
+
+--This module acts as an L1 latent state regularizer, adding the 
+--[gradOutput] to the gradient of the L1 loss. The [input] is copied to 
+--the [output]. 
+
+-- l1weight: weight of the L1 term; sizeAverage: divide by nElement();
+-- provideOutput (default true): also propagate the incoming gradOutput.
+function L1Penalty:__init(l1weight, sizeAverage, provideOutput)
+    parent.__init(self)
+    self.l1weight = l1weight 
+    self.sizeAverage = sizeAverage or false  
+    if provideOutput == nil then
+       self.provideOutput = true
+    else
+       self.provideOutput = provideOutput
+    end
+end
+    
+function L1Penalty:updateOutput(input)
+    local m = self.l1weight 
+    if self.sizeAverage == true then 
+      m = m/input:nElement()
+    end
+    -- the L1 loss is stored on self.loss for inspection; the module itself
+    -- passes the input through unchanged (output aliases input)
+    local loss = m*input:norm(1) 
+    self.loss = loss  
+    self.output = input 
+    return self.output 
+end
+
+function L1Penalty:updateGradInput(input, gradOutput)
+    local m = self.l1weight 
+    if self.sizeAverage == true then 
+      m = m/input:nElement() 
+    end
+    
+    -- gradient of the L1 term: m * sign(input)
+    self.gradInput:resizeAs(input):copy(input):sign():mul(m)
+    
+    if self.provideOutput == true then 
+        self.gradInput:add(gradOutput)  
+    end 
+
+    return self.gradInput 
+end
diff --git a/LeakyReLU.lua b/LeakyReLU.lua
new file mode 100644
index 0000000..56b7f25
--- /dev/null
+++ b/LeakyReLU.lua
@@ -0,0 +1,41 @@
+local LeakyReLU, parent = torch.class('nn.LeakyReLU','nn.Module')
+
+-- LeakyReLU(negval, ip): f(x) = x for x > 0, negval*x otherwise.
+-- Calling LeakyReLU(true) is shorthand for LeakyReLU(1/100, true).
+function LeakyReLU:__init(negval,ip)
+   parent.__init(self)
+   if type(negval) == 'boolean' then
+      -- a single boolean argument means "inplace" with the default slope;
+      -- assign to the parameter itself (the previous code declared a new
+      -- local here, so the in-place flag was silently dropped)
+      ip = negval
+      self.negval = 1/100
+   else
+      self.negval = negval or (1/100)
+   end
+   -- default for inplace is false
+   self.inplace = ip or false
+   if self.negval < 0 then
+      -- in-place backward cannot recover the input sign when the slope is
+      -- negative, so force out-of-place computation
+      self.inplace = false
+   end
+end
+
+function LeakyReLU:updateOutput(input)
+   input.THNN.LeakyReLU_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.negval,
+      self.inplace
+   )
+   return self.output
+end
+
+function LeakyReLU:updateGradInput(input, gradOutput)
+   input.THNN.LeakyReLU_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.negval,
+      self.inplace
+   )
+   return self.gradInput
+end
+
+function LeakyReLU:__tostring__()
+   return torch.type(self) .. string.format('(%g)', self.negval)
+end
diff --git a/Linear.lua b/Linear.lua
new file mode 100644
index 0000000..c26ba37
--- /dev/null
+++ b/Linear.lua
@@ -0,0 +1,120 @@
+local Linear, parent = torch.class('nn.Linear', 'nn.Module')
+
+-- Fully-connected layer: y = W*x + b, with weight of size
+-- (outputSize x inputSize).  `bias` (default true) controls whether a bias
+-- vector is allocated.
+function Linear:__init(inputSize, outputSize, bias)
+   parent.__init(self)
+   local bias = ((bias == nil) and true) or bias
+   self.weight = torch.Tensor(outputSize, inputSize)
+   self.gradWeight = torch.Tensor(outputSize, inputSize)
+   if bias then
+      self.bias = torch.Tensor(outputSize)
+      self.gradBias = torch.Tensor(outputSize)
+   end
+   self:reset()
+end
+
+-- Remove the bias (and its gradient) after construction; returns self.
+function Linear:noBias()
+   self.bias = nil
+   self.gradBias = nil
+   return self
+end
+
+-- Re-initialize weights (and bias) uniformly in [-stdv, stdv].  Without an
+-- argument, stdv defaults to 1/sqrt(fan-in).
+function Linear:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1./math.sqrt(self.weight:size(2))
+   end
+   if nn.oldSeed then
+      -- legacy RNG path: draw values one by one for reproducibility with
+      -- old torch seeds
+      for i=1,self.weight:size(1) do
+         self.weight:select(1, i):apply(function()
+            return torch.uniform(-stdv, stdv)
+         end)
+      end
+      if self.bias then
+         for i=1,self.bias:nElement() do
+            self.bias[i] = torch.uniform(-stdv, stdv)
+         end
+      end
+   else
+      self.weight:uniform(-stdv, stdv)
+      if self.bias then self.bias:uniform(-stdv, stdv) end
+   end
+   return self
+end
+
+-- Lazily (re)build the all-ones vector used to broadcast the bias over a
+-- minibatch of `nframe` rows.
+local function updateAddBuffer(self, input)
+   local nframe = input:size(1)
+   self.addBuffer = self.addBuffer or input.new()
+   if self.addBuffer:nElement() ~= nframe then
+      self.addBuffer:resize(nframe):fill(1)
+   end
+end
+
+function Linear:updateOutput(input)
+   if input:dim() == 1 then
+      -- single sample: y = b + W*x
+      self.output:resize(self.weight:size(1))
+      if self.bias then self.output:copy(self.bias) else self.output:zero() end
+      self.output:addmv(1, self.weight, input)
+   elseif input:dim() == 2 then
+      -- minibatch: Y = X*W^T + 1*b^T (outer product adds the bias rows)
+      local nframe = input:size(1)
+      local nElement = self.output:nElement()
+      self.output:resize(nframe, self.weight:size(1))
+      if self.output:nElement() ~= nElement then
+         self.output:zero()
+      end
+      updateAddBuffer(self, input)
+      self.output:addmm(0, self.output, 1, input, self.weight:t())
+      if self.bias then self.output:addr(1, self.addBuffer, self.bias) end
+   else
+      error('input must be vector or matrix')
+   end
+
+   return self.output
+end
+
+function Linear:updateGradInput(input, gradOutput)
+   if self.gradInput then
+
+      local nElement = self.gradInput:nElement()
+      self.gradInput:resizeAs(input)
+      if self.gradInput:nElement() ~= nElement then
+         self.gradInput:zero()
+      end
+      -- gradInput = W^T * gradOutput (beta=0 overwrites any stale values)
+      if input:dim() == 1 then
+         self.gradInput:addmv(0, 1, self.weight:t(), gradOutput)
+      elseif input:dim() == 2 then
+         self.gradInput:addmm(0, 1, gradOutput, self.weight)
+      end
+
+      return self.gradInput
+   end
+end
+
+function Linear:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   if input:dim() == 1 then
+      -- gradWeight += scale * gradOutput (x) input  (outer product)
+      self.gradWeight:addr(scale, gradOutput, input)
+      if self.bias then self.gradBias:add(scale, gradOutput) end
+   elseif input:dim() == 2 then
+      self.gradWeight:addmm(scale, gradOutput:t(), input)
+      if self.bias then
+         -- update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
+         updateAddBuffer(self, input)
+         self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer)
+      end
+   end
+end
+
+-- we do not need to accumulate parameters when sharing
+Linear.sharedAccUpdateGradParameters = Linear.accUpdateGradParameters
+
+function Linear:clearState()
+   if self.addBuffer then self.addBuffer:set() end
+   return parent.clearState(self)
+end
+
+function Linear:__tostring__()
+  return torch.type(self) ..
+      string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1)) ..
+      (self.bias == nil and ' without bias' or '')
+end
diff --git a/Log.lua b/Log.lua
new file mode 100644
index 0000000..d7755d3
--- /dev/null
+++ b/Log.lua
@@ -0,0 +1,20 @@
+local Log, parent = torch.class('nn.Log', 'nn.Module')
+
+-- Element-wise natural logarithm: output = log(input).
+function Log:__init()
+   parent.__init(self)
+end
+
+function Log:updateOutput(input)
+   -- work on a same-sized copy, then take the log in place
+   self.output:resizeAs(input):copy(input):log()
+   return self.output
+end
+
+function Log:updateGradInput(input, gradOutput)
+   -- d/dx log(x) = 1/x, so gradInput = gradOutput / input (element-wise)
+   self.gradInput:resizeAs(input):fill(1):cdiv(input):cmul(gradOutput)
+   return self.gradInput
+end
diff --git a/LogSigmoid.lua b/LogSigmoid.lua
new file mode 100644
index 0000000..cab848f
--- /dev/null
+++ b/LogSigmoid.lua
@@ -0,0 +1,27 @@
+local LogSigmoid, parent = torch.class('nn.LogSigmoid', 'nn.Module')
+
+-- Element-wise log(sigmoid(x)); `buffer` is scratch space filled by the
+-- forward THNN kernel and reused by the backward kernel.
+function LogSigmoid:updateOutput(input)
+   self.buffer = self.buffer or input.new()
+   input.THNN.LogSigmoid_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.buffer:cdata()
+   )
+   return self.output
+end
+
+function LogSigmoid:updateGradInput(input, gradOutput)
+   input.THNN.LogSigmoid_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.buffer:cdata()
+   )
+   return self.gradInput
+end
+
+function LogSigmoid:clearState()
+   -- release the scratch tensor's storage but keep the module usable
+   if self.buffer then self.buffer:set() end
+   return parent.clearState(self)
+end
+
+
diff --git a/LogSoftMax.lua b/LogSoftMax.lua
new file mode 100644
index 0000000..37c8aca
--- /dev/null
+++ b/LogSoftMax.lua
@@ -0,0 +1,19 @@
+local LogSoftMax = torch.class('nn.LogSoftMax', 'nn.Module')
+
+-- Computes log(softmax(x)) along the feature dimension via THNN.
+function LogSoftMax:updateOutput(input)
+   input.THNN.LogSoftMax_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+function LogSoftMax:updateGradInput(input, gradOutput)
+   -- the backward kernel reuses self.output computed in the forward pass
+   input.THNN.LogSoftMax_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/LookupTable.lua b/LookupTable.lua
new file mode 100644
index 0000000..8ec2b34
--- /dev/null
+++ b/LookupTable.lua
@@ -0,0 +1,170 @@
+local THNN = require 'nn.THNN'
+local LookupTable, parent = torch.class('nn.LookupTable', 'nn.Module')
+
+LookupTable.__version = 4
+
+-- Embedding lookup: maps integer indices in [1, nIndex] to rows of a
+-- (nIndex x nOutput) weight matrix.  Optional paddingValue rows get zero
+-- gradients; maxNorm/normType renormalize accessed rows on the fly.
+function LookupTable:__init(nIndex, nOutput, paddingValue, maxNorm, normType)
+   parent.__init(self)
+
+   self.weight = torch.Tensor(nIndex, nOutput)
+   self.gradWeight = torch.Tensor(nIndex, nOutput):zero()
+   self.paddingValue = paddingValue or 0
+   self.maxNorm = maxNorm or nil
+   self.normType = normType or nil
+
+   self:reset()
+end
+
+-- Ensure fields exist on instances deserialized from older versions.
+function LookupTable:backCompatibility()
+   self._count = self._count or torch.IntTensor()
+   self._input = self._input or torch.LongTensor()
+
+   if not self.shouldScaleGradByFreq then
+      self.shouldScaleGradByFreq = false
+   end
+end
+
+-- Drop gradWeight so only the fused accUpdate path can be used.
+function LookupTable:accUpdateOnly()
+   self.gradWeight = nil
+   return self
+end
+
+function LookupTable:setPadding(paddingValue)
+   self.paddingValue = paddingValue
+   return self
+end
+
+function LookupTable:setMaxNorm(maxNorm)
+   self.maxNorm = maxNorm
+   return self
+end
+
+function LookupTable:setNormType(normType)
+   self.normType = normType
+   return self
+end
+
+function LookupTable:scaleGradByFreq()
+   self.shouldScaleGradByFreq = true
+   return self
+end
+
+function LookupTable:reset(stdv)
+   stdv = stdv or 1
+   self.weight:normal(0, stdv)
+end
+
+function LookupTable:makeInputContiguous(input)
+   -- make sure input is a contiguous torch.LongTensor
+   if (not input:isContiguous()) or torch.type(input) ~= torch.type(self._input) then
+      self.copiedInput = true
+      self._input:resize(input:size()):copy(input)
+      return self._input
+   end
+   self.copiedInput = false
+   return input
+end
+
+function LookupTable:updateOutput(input)
+   self:backCompatibility()
+   -- renorm (if configured) must see the original indices before the copy
+   self:renorm(input)
+   input = self:makeInputContiguous(input)
+   if input:dim() == 1 then
+      self.output:index(self.weight, 1, input)
+   elseif input:dim() == 2 then
+      -- flatten the batch for the lookup, then restore the batch shape
+      self.output:index(self.weight, 1, input:view(-1))
+      self.output = self.output:view(input:size(1), input:size(2), self.weight:size(2))
+   else
+      error("input must be a vector or matrix")
+   end
+   return self.output
+end
+
+function LookupTable:updateGradInput(input, gradOutput)
+   -- the input can be of any type (as in the forward it's 
+   -- converted anyway to LongTensor) thus, need to allocate
+   -- new memory each time the user changes the input type
+   if torch.type(self.gradInput) ~= torch.type(input) then
+      self.gradInput = input.new()
+   end
+   if not self.gradInput:isSameSizeAs(input) then
+      self.gradInput:resizeAs(input):zero()
+   end
+   -- indices are not differentiable: gradInput is always zero
+   return self.gradInput
+end
+
+function LookupTable:accGradParameters(input, gradOutput, scale)
+   self:backCompatibility()
+   -- use the contiguous copy made in updateOutput when one was needed
+   input = self.copiedInput and self._input or input
+   if input:dim() == 2 then
+      input = input:view(-1)
+   elseif input:dim() ~= 1 then
+      error("input must be a vector or matrix")
+   end
+
+   if not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      gradOutput = self._gradOutput
+   end
+
+   self.gradWeight.THNN.LookupTable_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      self._count:cdata(),
+      THNN.optionalTensor(self._sorted),
+      THNN.optionalTensor(self._indices),
+      self.shouldScaleGradByFreq or false,
+      self.paddingValue or 0,
+      scale or 1
+   )
+end
+
+-- Renormalize the weight rows referenced by `input` to have norm at most
+-- maxNorm (no-op when maxNorm is unset).
+function LookupTable:renorm(input)
+   if not self.maxNorm then
+      return
+   end
+   -- copy input into _input, so _input is continous.
+   -- The copied _input will be modified in the C code.
+   self._input:resize(input:size()):copy(input)
+   local row_idx = self._input
+   if row_idx:dim() == 2 then
+      row_idx = row_idx:view(-1)
+   elseif row_idx:dim() ~= 1 then
+      error("input must be a vector or matrix")
+   end
+   -- "row_idx" and "weight" will be modified in the C code
+   self.weight.THNN.LookupTable_renorm(
+      row_idx:cdata(),
+      self.weight:cdata(),
+      self.maxNorm,
+      self.normType or 2
+   )
+end
+
+function LookupTable:type(type, tensorCache)
+   parent.type(self, type, tensorCache)
+
+   if type == 'torch.CudaTensor' then
+      -- CUDA uses _sorted and _indices temporary tensors
+      self._sorted = self.weight.new()
+      self._indices = self.weight.new()
+      self._count = self.weight.new()
+      self._input = self.weight.new()
+   else
+      -- self._count and self._input should only be converted if using Cuda
+      self._count = torch.IntTensor()
+      self._input = torch.LongTensor()
+   end
+
+   return self
+end
+
+function LookupTable:clearState()
+   nn.utils.clear(self, '_count', '_input', '_gradOutput')
+   return parent.clearState(self)
+end
+
+-- we do not need to accumulate parameters when sharing
+LookupTable.sharedAccUpdateGradParameters = LookupTable.accUpdateGradParameters
diff --git a/MM.lua b/MM.lua
new file mode 100644
index 0000000..cc978c8
--- /dev/null
+++ b/MM.lua
@@ -0,0 +1,92 @@
+--[[ Module to perform matrix multiplication on two minibatch inputs,
+     producing a minibatch.
+]]
+
+local MM, parent = torch.class('nn.MM', 'nn.Module')
+
+--[[ The constructor takes two optional arguments, specifying whether or not transpose
+     any of the input matrices before perfoming the multiplication.
+]]
+function MM:__init(transA, transB)
+  parent.__init(self)
+
+  self.transA = transA or false
+  self.transB = transB or false
+
+  self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+-- Forward: output = op(a) * op(b), where op applies the configured
+-- transpose.  2D inputs use mm; 3D (batched) inputs use bmm.
+function MM:updateOutput(input)
+  assert(#input == 2, 'input must be a pair of minibatch matrices')
+  local a, b = table.unpack(input)
+  assert(a:nDimension() == 2 or a:nDimension() == 3, 'input tensors must be 2D or 3D')
+
+  if a:nDimension() == 2 then
+    assert(b:nDimension() == 2, 'second input tensor must be 2D')
+
+    if self.transA then a = a:t() end
+    if self.transB then b = b:t() end
+    assert(a:size(2) == b:size(1), 'matrix sizes do not match')
+
+    self.output:resize(a:size(1), b:size(2))
+    self.output:mm(a, b)
+  else
+    assert(b:nDimension() == 3, 'second input tensor must be 3D')
+    assert(a:size(1) == b:size(1), 'inputs must contain the same number of minibatches')
+
+    if self.transA then a = a:transpose(2, 3) end
+    if self.transB then b = b:transpose(2, 3) end
+    assert(a:size(3) == b:size(2), 'matrix sizes do not match')
+
+    self.output:resize(a:size(1), a:size(2), b:size(3))
+    self.output:bmm(a, b)
+  end
+
+  return self.output
+end
+
+function MM:updateGradInput(input, gradOutput)
+  -- gradInput entries may have been cleared (e.g. by clearState); re-create
+  self.gradInput[1] = self.gradInput[1] or input[1].new()
+  self.gradInput[2] = self.gradInput[2] or input[2].new()
+
+  assert(#input == 2, 'input must be a pair of tensors')
+  local a, b = table.unpack(input)
+  self.gradInput[1]:resizeAs(a)
+  self.gradInput[2]:resizeAs(b)
+
+  assert(gradOutput:nDimension() == 2 or gradOutput:nDimension() == 3, 'arguments must be a 2D or 3D Tensor')
+
+  -- h_dim/w_dim select the matrix dims (shifted by one in the batched
+  -- case); f is the matching multiply ('mm' for 2D, 'bmm' for 3D)
+  local h_dim, w_dim, f
+  if gradOutput:nDimension() == 2 then
+    assert(a:nDimension() == 2, 'first input tensor must be 2D')
+    assert(b:nDimension() == 2, 'second input tensor must be 2D')
+
+    h_dim, w_dim = 1, 2
+    f = "mm"
+  else
+    assert(a:nDimension() == 3, 'first input tensor must be 3D')
+    assert(b:nDimension() == 3, 'second input tensor must be 3D')
+
+    h_dim, w_dim = 2, 3
+    f = "bmm"
+  end
+
+  -- chain rule for C = op(A)*op(B); the transpose pattern of each gradient
+  -- factor depends on the transA/transB configuration
+  if self.transA == self.transB then
+    a = a:transpose(h_dim, w_dim)
+    b = b:transpose(h_dim, w_dim)
+  end
+
+  if self.transA then
+    self.gradInput[1][f](self.gradInput[1], b, gradOutput:transpose(h_dim, w_dim))
+  else
+    self.gradInput[1][f](self.gradInput[1], gradOutput, b)
+  end
+
+  if self.transB then
+    self.gradInput[2][f](self.gradInput[2], gradOutput:transpose(h_dim, w_dim), a)
+  else
+    self.gradInput[2][f](self.gradInput[2], a, gradOutput)
+  end
+
+  return self.gradInput
+end
diff --git a/MSECriterion.lua b/MSECriterion.lua
new file mode 100644
index 0000000..d38beb6
--- /dev/null
+++ b/MSECriterion.lua
@@ -0,0 +1,32 @@
+local MSECriterion, parent = torch.class('nn.MSECriterion', 'nn.Criterion')
+
+-- Mean (or summed, when sizeAverage=false) squared error criterion:
+-- loss = sum_i (x_i - t_i)^2 [/ n].  sizeAverage defaults to true.
+function MSECriterion:__init(sizeAverage)
+   parent.__init(self)
+   if sizeAverage ~= nil then
+     self.sizeAverage = sizeAverage
+   else
+     self.sizeAverage = true
+   end
+end
+
+function MSECriterion:updateOutput(input, target)
+   -- THNN writes the scalar loss into a 1-element tensor; unwrap it below
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.MSECriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function MSECriterion:updateGradInput(input, target)
+   input.THNN.MSECriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage
+   )
+   return self.gradInput
+end
diff --git a/MV.lua b/MV.lua
new file mode 100644
index 0000000..a00478e
--- /dev/null
+++ b/MV.lua
@@ -0,0 +1,82 @@
+--[[ Module to perform matrix vector multiplication on two minibatch inputs,
+producing a minibatch.
+]]
+
+local MV, parent = torch.class('nn.MV', 'nn.Module')
+
+-- Backward compatibility
+local unpack = unpack or table.unpack
+
+-- trans (boolean, default false): transpose the matrix before multiplying.
+function MV:__init(trans)
+  parent.__init(self)
+
+  self.trans = trans or false
+  assert(type(self.trans) == 'boolean', "argument must be a boolean, matrix transpose before multiplication")
+
+  self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+-- Forward: output = op(M) * v, where op applies the configured transpose.
+-- 2D M with 1D v uses mv; 3D (batched) M with 2D v uses bmm.
+function MV:updateOutput(input)
+  assert(#input == 2, 'input must be a pair of minibatch matrices')
+  local M, v = unpack(input)
+  assert(M:nDimension() == 2 or M:nDimension() == 3, 'input matrix must be 2D or 3D')
+  assert(v:nDimension() == 1 or v:nDimension() == 2, 'input vector must be 1D or 2D')
+
+  if M:nDimension() == 2 then
+    assert(v:nDimension() == 1, 'vector must be 1D')
+
+    if self.trans then M = M:transpose(1,2) end
+    assert(M:size(2) == v:size(1), 'matrix row count and vector length do not match')
+
+    self.output:resize(M:size(1))
+    self.output:mv(M, v)
+  else
+    assert(v:nDimension() == 2, 'vector must be 2D (batch dimension)')
+    assert(M:size(1) == v:size(1), 'inputs must contain the same number of minibatches')
+
+    if self.trans then M = M:transpose(2,3) end
+    assert(M:size(3) == v:size(2), 'matrix row count and vector length do not match')
+
+    -- treat v as a (b x n x 1) matrix so bmm applies, then drop the last dim
+    self.output:resize(M:size(1), M:size(2), 1)
+    self.output:bmm(M, v:view(v:size(1), v:size(2), 1)):resize(M:size(1), M:size(2))
+  end
+
+  return self.output
+end
+
+function MV:updateGradInput(input, gradOutput)
+  assert(#input == 2, 'input must be a pair of tensors')
+  local M, v = unpack(input)
+  -- gradInput entries may have been cleared or be of a stale type
+  -- (e.g. after clearState/type); re-create them lazily, as nn.MM does
+  self.gradInput[1] = self.gradInput[1] or M.new()
+  self.gradInput[2] = self.gradInput[2] or v.new()
+  self.gradInput[1]:resizeAs(M)
+  self.gradInput[2]:resizeAs(v)
+
+  assert(gradOutput:nDimension() == 1 or gradOutput:nDimension() == 2, 'arguments must be a 1D or 2D Tensor')
+
+  if gradOutput:nDimension() == 2 then
+    assert(M:nDimension() == 3, 'matrix must must be 3D (batched)')
+    assert(v:nDimension() == 2, 'vector must be 2D (batched)')
+    local bdim = M:size(1)
+    local odim = M:size(2)
+    local idim = M:size(3)
+
+    -- batched chain rule: dM is an outer product per batch, dv = op(M)^T*go
+    if self.trans then
+      self.gradInput[1]:bmm(v:view(bdim, odim, 1), gradOutput:view(bdim, 1, idim))
+      self.gradInput[2]:view(bdim, odim, 1):bmm(M, gradOutput:view(bdim, idim, 1))
+    else
+      self.gradInput[1]:bmm(gradOutput:view(bdim, odim, 1), v:view(bdim, 1, idim))
+      self.gradInput[2]:view(bdim, idim, 1):bmm(M:transpose(2,3), gradOutput:view(bdim, odim, 1))
+    end
+  else
+    assert(M:nDimension() == 2, 'matrix must be 2D')
+    assert(v:nDimension() == 1, 'vector must be 1D')
+
+    if self.trans then
+      self.gradInput[1]:ger(v, gradOutput)
+      self.gradInput[2] = M * gradOutput
+    else
+      self.gradInput[1]:ger(gradOutput, v)
+      self.gradInput[2] = M:t() * gradOutput
+    end
+  end
+  return self.gradInput
+end
diff --git a/MarginCriterion.lua b/MarginCriterion.lua
new file mode 100644
index 0000000..1ab8ad7
--- /dev/null
+++ b/MarginCriterion.lua
@@ -0,0 +1,31 @@
+local MarginCriterion, parent = torch.class('nn.MarginCriterion', 'nn.Criterion')
+
+-- Hinge loss for binary classification with targets in {1, -1}:
+-- loss_i = max(0, margin - y_i * x_i), averaged when sizeAverage is true.
+function MarginCriterion:__init(margin)
+   parent.__init(self)
+   self.sizeAverage = true
+   self.margin = margin or 1
+end
+
+function MarginCriterion:updateOutput(input, target)
+   -- THNN writes the scalar loss into a 1-element tensor; unwrap it below
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.MarginCriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage,
+      self.margin
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function MarginCriterion:updateGradInput(input, target)
+   input.THNN.MarginCriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage,
+      self.margin
+   )
+   return self.gradInput
+end
diff --git a/MarginRankingCriterion.lua b/MarginRankingCriterion.lua
new file mode 100644
index 0000000..2c1f4c2
--- /dev/null
+++ b/MarginRankingCriterion.lua
@@ -0,0 +1,75 @@
+local MarginRankingCriterion, parent = torch.class('nn.MarginRankingCriterion', 'nn.Criterion')
+
+-- Ranking hinge loss on a pair of score tensors {x1, x2} with label y
+-- (1 means x1 should rank higher, -1 means x2 should):
+--   loss = max(0, -y*(x1 - x2) + margin)
+-- Scalar (size-1) inputs take a fast path; batched inputs are averaged
+-- when sizeAverage is true.
+function MarginRankingCriterion:__init(margin)
+   parent.__init(self)
+   margin=margin or 1
+   self.margin = margin 
+   self.gradInput = {torch.Tensor(1), torch.Tensor(1)}
+   self.sizeAverage = true
+end 
+ 
+function MarginRankingCriterion:updateOutput(input,y)
+   if input[1]:size(1) == 1 then
+      -- single-pair fast path: plain Lua arithmetic
+      self.output=math.max(0, -y*(input[1][1]-input[2][1]) + self.margin  ) 
+   else
+      -- batched path: _output holds -y*(x1-x2)+margin, clamped at 0
+      self._output = self._output or input[1]:clone()
+      self._output:resizeAs(input[1])
+      self._output:copy(input[1])
+
+      self._output:add(-1, input[2])
+      self._output:mul(-1):cmul(y)
+      self._output:add(self.margin)
+
+      self._output:cmax(0)
+
+      self.output = self._output:sum()
+
+      if self.sizeAverage then
+         self.output = self.output/y:size(1)
+      end
+   end
+
+   return self.output
+end
+
+function MarginRankingCriterion:updateGradInput(input, y)
+   if input[1]:size(1) == 1 then
+      local dist = -y*(input[1][1]-input[2][1]) + self.margin
+      if dist < 0 then
+         -- hinge inactive: zero gradient
+         self.gradInput[1][1]=0;
+         self.gradInput[2][1]=0;
+      else	
+         self.gradInput[1][1]=-y
+         self.gradInput[2][1]=y
+      end
+   else
+      -- batched path: recompute the per-pair hinge argument
+      self.dist = self.dist or input[1].new()
+      self.dist = self.dist:resizeAs(input[1]):copy(input[1])
+      local dist = self.dist
+
+      dist:add(-1, input[2])
+      dist:mul(-1):cmul(y)
+      dist:add(self.margin)
+
+      -- mask selects the pairs whose hinge is active (dist >= 0)
+      self.mask = self.mask or input[1].new()
+      self.mask = self.mask:resizeAs(input[1]):copy(dist)
+      local mask = self.mask
+
+      mask:ge(dist, 0)
+
+      self.gradInput[1]:resize(dist:size())
+      self.gradInput[2]:resize(dist:size())
+
+      self.gradInput[1]:copy(mask)
+      self.gradInput[1]:mul(-1):cmul(y)
+      self.gradInput[2]:copy(mask)
+      self.gradInput[2]:cmul(y)
+
+      if self.sizeAverage then
+         self.gradInput[1]:div(y:size(1))
+         self.gradInput[2]:div(y:size(1))
+      end
+
+   end
+   return self.gradInput 
+end
diff --git a/MaskedSelect.lua b/MaskedSelect.lua
new file mode 100644
index 0000000..c3f7834
--- /dev/null
+++ b/MaskedSelect.lua
@@ -0,0 +1,71 @@
+-- Lua 5.1 compatibility: unpack moved to table.unpack in 5.2+.
+local unpack = unpack or table.unpack
+
+local MaskedSelect, parent = torch.class('nn.MaskedSelect', 'nn.Module')
+
+--[[ Sets the provided mask value for the module. ]]
+function MaskedSelect:__init()
+  parent.__init(self)
+  -- buffers reused across calls to avoid reallocating every forward/backward
+  self._maskIndices = torch.LongTensor()
+  self._maskIndexBuffer = torch.LongTensor()
+  self._maskIndexBufferCPU = torch.FloatTensor()
+  self._gradBuffer = torch.Tensor()
+  self._gradMask = torch.ByteTensor()
+end
+
+--[[ Performs maskedSelect operation.  Expects input = {tensor, mask};
+output is a 1D tensor of the elements of `tensor` where `mask` is 1. ]]
+function MaskedSelect:updateOutput(input)
+  local input, mask = unpack(input)
+  self.output:maskedSelect(input, mask)
+  return self.output
+end
+
+--[[ Reverse maps unmasked gradOutput back to gradInput. ]]
+function MaskedSelect:updateGradInput(input, gradOutput)
+  local input, mask = unpack(input)
+  if input:type() == 'torch.CudaTensor' then
+    -- CUDA path: build the linear index range in a CPU buffer first, then
+    -- copy it to the device buffer (NOTE(review): presumably works around
+    -- an op unavailable on the CUDA index type -- confirm)
+    self._maskIndexBufferCPU:range(1, mask:nElement()):resize(mask:size())
+    self._maskIndexBuffer:resize(
+      self._maskIndexBufferCPU:size()):copy(self._maskIndexBufferCPU)
+  else
+    self._maskIndexBuffer:range(1, mask:nElement()):resize(mask:size())
+  end
+  -- linear indices of the masked elements, aligned with gradOutput's order
+  self._maskIndices:maskedSelect(self._maskIndexBuffer, mask)
+  self._gradBuffer:resize(input:nElement()):zero()
+  self._gradBuffer:scatter(1, self._maskIndices, gradOutput)
+  self._gradBuffer:resize(input:size())
+  -- the gradient w.r.t. the mask itself is defined as zero
+  self.gradInput = {self._gradBuffer,
+                    self._gradMask:resize(mask:size()):fill(0)}
+  return self.gradInput
+end
+
+-- Converts the module to the given tensor type; with no argument, returns
+-- the current type string.
+function MaskedSelect:type(type, tensorCache)
+  if not type then
+    return self._type
+  end
+  self._gradBuffer = self._gradBuffer:type(type)
+  -- After a backward pass self.gradInput is the table {gradBuffer, gradMask},
+  -- which has no :type() method, so convert it recursively.
+  self.gradInput = nn.utils.recursiveType(self.gradInput, type)
+  self.output = self.output:type(type)
+
+  -- These casts apply when switching between cuda/non-cuda types
+  if type == 'torch.CudaTensor' then
+    self._maskIndexBuffer = self._maskIndexBuffer:cuda()
+    self._maskIndices = self._maskIndices:cuda()
+    self._gradMask = self._gradMask:cuda()
+  else
+    -- on CPU the index tensors must be LongTensors and the mask a ByteTensor
+    self._maskIndexBuffer = self._maskIndexBuffer:long()
+    self._maskIndices = self._maskIndices:long()
+    self._gradMask = self._gradMask:byte()
+  end
+  self._type = type
+  return self
+end
+
+-- Releases output, gradInput and every cached internal buffer.
+function MaskedSelect:clearState()
+  local buffers = {
+    'output', 'gradInput', '_maskIndexBuffer', '_maskIndexBufferCPU',
+    '_maskIndices', '_gradBuffer', '_gradMask',
+  }
+  return nn.utils.clear(self, buffers)
+end
diff --git a/Max.lua b/Max.lua
new file mode 100644
index 0000000..691fe9d
--- /dev/null
+++ b/Max.lua
@@ -0,0 +1,70 @@
+-- nn.Max: takes the maximum over dimension `dimension`, removing that
+-- dimension from the output.  nInputDims lets the module distinguish
+-- batched from non-batched input.
+local Max, parent = torch.class('nn.Max', 'nn.Module')
+
+function Max:__init(dimension, nInputDims)
+   parent.__init(self)
+   dimension = dimension or 1
+   self.dimension = dimension
+   -- do not assign default value to nInputDims or it will break backward compatibility
+   self.nInputDims = nInputDims
+end
+
+-- Resolves a negative dimension, and shifts the dimension by one when the
+-- input carries an extra batch dimension (input:dim() == nInputDims + 1).
+function Max:_getPositiveDimension(input)
+   local dimension = self.dimension
+   if dimension < 0 then
+      dimension = input:dim() + dimension + 1
+   elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+      dimension = dimension + 1
+   end
+   return dimension
+end
+
+-- Allocates the reduction buffers on first use: indices must be a
+-- CudaTensor on GPU and a LongTensor otherwise.
+function Max:_lazyInit()
+   self._output = self._output or self.output.new()
+   self._indices = self._indices or
+      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaTensor() or torch.LongTensor())
+end
+
+function Max:updateOutput(input)
+   self:_lazyInit()
+   local dimension = self:_getPositiveDimension(input)
+   torch.max(self._output, self._indices, input, dimension)
+   if input:dim() > 1 then
+     -- drop the reduced (now singleton) dimension from the result
+     self.output:set(self._output:select(dimension, 1))
+   else
+     self.output:set(self._output)
+   end
+   return self.output
+end
+
+-- Routes each gradient back to the argmax position recorded in _indices.
+function Max:updateGradInput(input, gradOutput)
+   self:_lazyInit()
+   local dimension = self:_getPositiveDimension(input)
+   local gradOutputView
+   if input:dim() > 1 then
+     gradOutputView = nn.utils.addSingletonDimension(gradOutput, dimension)
+   else
+     gradOutputView = gradOutput
+   end
+   self.gradInput:resizeAs(input):zero():scatter(dimension, self._indices, gradOutputView)
+   return self.gradInput
+end
+
+function Max:type(type, tensorCache)
+  -- torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
+  if type == 'torch.CudaTensor' then
+    parent.type(self, type, tensorCache)
+  else
+    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
+    -- unnecessary memory allocations.
+    local indices
+    indices, self._indices = self._indices, nil
+    parent.type(self, type, tensorCache)
+    self._indices = indices and indices:long() or nil
+  end
+  return self
+end
+
+function Max:clearState()
+   nn.utils.clear(self, '_indices', '_output')
+   return parent.clearState(self)
+end
diff --git a/Mean.lua b/Mean.lua
new file mode 100644
index 0000000..8087ac9
--- /dev/null
+++ b/Mean.lua
@@ -0,0 +1,14 @@
+local Mean, parent = torch.class('nn.Mean', 'nn.Sum')
+
+--[[
+
+This file is still here because of backward compatibility.
+
+Please use instead "nn.Sum(dimension, nInputDims, sizeAverage)"
+
+]]--
+
+
+-- nn.Mean is simply nn.Sum with sizeAverage forced to true.
+function Mean:__init(dimension, nInputDims)
+   parent.__init(self, dimension, nInputDims, true)
+end
diff --git a/Min.lua b/Min.lua
new file mode 100644
index 0000000..f1d2b45
--- /dev/null
+++ b/Min.lua
@@ -0,0 +1,70 @@
+-- nn.Min: takes the minimum over dimension `dimension`, removing that
+-- dimension from the output.  Mirror image of nn.Max.
+local Min, parent = torch.class('nn.Min', 'nn.Module')
+
+function Min:__init(dimension, nInputDims)
+   parent.__init(self)
+   dimension = dimension or 1
+   self.dimension = dimension
+   -- do not assign default value to nInputDims or it will break backward compatibility
+   self.nInputDims = nInputDims
+end
+
+-- Resolves a negative dimension, and shifts the dimension by one when the
+-- input carries an extra batch dimension (input:dim() == nInputDims + 1).
+function Min:_getPositiveDimension(input)
+   local dimension = self.dimension
+   if dimension < 0 then
+      dimension = input:dim() + dimension + 1
+   elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+      dimension = dimension + 1
+   end
+   return dimension
+end
+
+-- Allocates the reduction buffers on first use: indices must be a
+-- CudaTensor on GPU and a LongTensor otherwise.
+function Min:_lazyInit()
+   self._output = self._output or self.output.new()
+   self._indices = self._indices or
+      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaTensor() or torch.LongTensor())
+end
+
+function Min:updateOutput(input)
+   self:_lazyInit()
+   local dimension = self:_getPositiveDimension(input)
+   torch.min(self._output, self._indices, input, dimension)
+   if input:dim() > 1 then
+     -- drop the reduced (now singleton) dimension from the result
+     self.output:set(self._output:select(dimension, 1))
+   else
+     self.output:set(self._output)
+   end
+   return self.output
+end
+
+-- Routes each gradient back to the argmin position recorded in _indices.
+function Min:updateGradInput(input, gradOutput)
+   self:_lazyInit()
+   local dimension = self:_getPositiveDimension(input)
+   local gradOutputView
+   if input:dim() > 1 then
+     gradOutputView = nn.utils.addSingletonDimension(gradOutput, dimension)
+   else
+     gradOutputView = gradOutput
+   end
+   self.gradInput:resizeAs(input):zero():scatter(dimension, self._indices, gradOutputView)
+   return self.gradInput
+end
+
+function Min:type(type, tensorCache)
+  -- torch.min expects a LongTensor as indices, whereas cutorch.min expects a CudaTensor.
+  if type == 'torch.CudaTensor' then
+    parent.type(self, type, tensorCache)
+  else
+    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
+    -- unnecessary memory allocations.
+    local indices
+    indices, self._indices = self._indices, nil
+    parent.type(self, type, tensorCache)
+    self._indices = indices and indices:long() or nil
+  end
+  return self
+end
+
+function Min:clearState()
+   nn.utils.clear(self, '_indices', '_output')
+   return parent.clearState(self)
+end
diff --git a/MixtureTable.lua b/MixtureTable.lua
new file mode 100644
index 0000000..17c307e
--- /dev/null
+++ b/MixtureTable.lua
@@ -0,0 +1,170 @@
+-- nn.MixtureTable implements a mixture of experts: given input
+-- {gaterInput, expertInputs}, output = sum_i gater[i] * expert_i.
+-- expertInputs may be a table of tensors (one per expert) or a single
+-- tensor with the experts stacked along dimension self.dim.
+local MixtureTable, parent = torch.class('nn.MixtureTable', 'nn.Module')
+
+function MixtureTable:__init(dim)
+   parent.__init(self)
+   self.dim = dim
+   self.size = torch.LongStorage()
+   -- batchSize caches the last seen batch size so buffers are only
+   -- reshaped when it changes
+   self.batchSize = 0
+   self.size2 = torch.LongStorage()
+   self.backwardSetup = false
+   self.gradInput = {}
+end
+
+function MixtureTable:updateOutput(input) 
+   local gaterInput, expertInputs = table.unpack(input)
+   
+   -- buffers 
+   self._gaterView = self._gaterView or input[1].new()
+   self._expert = self._expert or input[1].new()
+   self._expertView = self._expertView or input[1].new()
+   
+   -- dimG: dimension of gaterInput that indexes the experts
+   -- (1 for non-batch input, 2 for batch input)
+   self.dimG = 2
+   local batchSize = gaterInput:size(1)
+   if gaterInput:dim() < 2 then
+      self.dimG = 1
+      self.dim = self.dim or 1
+      batchSize = 1
+   end
+   self.dim = self.dim or 2
+      
+   if self.table or torch.type(expertInputs) == 'table' then 
+      -- expertInputs is a Table :
+      self.table = true
+      if gaterInput:size(self.dimG) ~= #expertInputs then
+         error"Should be one gater output per expert"
+      end
+      local expertInput = expertInputs[1]
+      if self.batchSize ~= batchSize then
+         -- self.size broadcasts the gater weights over each expert's shape
+         self.size:resize(expertInput:dim()+1):fill(1)
+         if self.dimG > 1 then 
+            self.size[1] = gaterInput:size(1)
+         end
+         self.size[self.dim] = gaterInput:size(self.dimG)
+         self.output:resizeAs(expertInput)
+         self.backwardSetup = false
+         self.batchSize = batchSize
+      end
+      self._gaterView:view(gaterInput, self.size)
+      self.output:zero()
+      -- multiply accumulate gater outputs by their commensurate expert
+      for i,expertInput in ipairs(expertInputs) do
+         local gate = self._gaterView:select(self.dim,i):expandAs(expertInput)
+         self.output:addcmul(expertInput, gate)
+      end
+   else
+      -- expertInputs is a Tensor :
+      if self.batchSize ~= batchSize then
+         self.size:resize(expertInputs:dim()):fill(1)
+         if self.dimG > 1 then
+            self.size[1] = gaterInput:size(1)
+         end
+         self.size[self.dim] = gaterInput:size(self.dimG)
+         self.output:resizeAs(expertInputs:select(self.dim, 1))
+         self.batchSize = batchSize
+         self.backwardSetup = false
+      end
+      self._gaterView:view(gaterInput, self.size)
+      -- weight each expert slice by its gate, then sum over the expert dim
+      self._expert:cmul(self._gaterView:expandAs(expertInputs), expertInputs)
+      self.output:sum(self._expert, self.dim)
+      self.output:resizeAs(expertInputs:select(self.dim, 1))
+   end
+
+   return self.output
+end
+
+function MixtureTable:updateGradInput(input, gradOutput)
+   local gaterInput, expertInputs = table.unpack(input)
+   nn.utils.recursiveResizeAs(self.gradInput, input)
+   local gaterGradInput, expertGradInputs = table.unpack(self.gradInput)
+   
+   -- buffers
+   self._sum = self._sum or input[1].new()
+   self._expertView2 = self._expertView2 or input[1].new()
+   self._expert2 = self._expert2 or input[1].new()
+      
+   if self.table then
+      -- one-time (per batch size) allocation of the gradInput tensors
+      if not self.backwardSetup then
+         for i,expertInput in ipairs(expertInputs) do
+            local expertGradInput = expertGradInputs[i] or expertInput:clone()
+            expertGradInput:resizeAs(expertInput)
+            expertGradInputs[i] = expertGradInput
+         end
+         gaterGradInput:resizeAs(gaterInput)
+         self.backwardSetup = true
+      end
+      
+      -- like CMulTable, but with broadcasting
+      for i,expertGradInput in ipairs(expertGradInputs) do
+         -- gater updateGradInput: dL/dgate_i = <gradOutput, expert_i>
+         self._expert:cmul(gradOutput, expertInputs[i])
+         if self.dimG == 1 then
+            self._expertView:view(self._expert, -1)
+         else
+            self._expertView:view(self._expert, gradOutput:size(1), -1)
+         end
+         self._sum:sum(self._expertView, self.dimG)
+         if self.dimG == 1 then
+            gaterGradInput[i] = self._sum:select(self.dimG,1)
+         else
+            gaterGradInput:select(self.dimG,i):copy(self._sum:select(self.dimG,1))
+         end
+         
+         -- expert updateGradInput: dL/dexpert_i = gate_i * gradOutput
+         local gate = self._gaterView:select(self.dim,i):expandAs(expertGradInput)
+         expertGradInput:cmul(gate, gradOutput)     
+      end
+   else
+      if not self.backwardSetup then
+         self.size2:resize(expertInputs:dim())
+         self.size2:copy(expertInputs:size())
+         self.size2[self.dim] = 1
+         gaterGradInput:resizeAs(gaterInput)
+         self.backwardSetup = true
+      end
+      
+      -- gater updateGradInput
+      self._expertView:view(gradOutput, self.size2)
+      local gradOutput = self._expertView:expandAs(expertInputs)
+      self._expert:cmul(gradOutput, expertInputs)
+      -- transpose may produce a non-contiguous view; copy into _expert2
+      -- so the following view() is legal
+      local expert = self._expert:transpose(self.dim, self.dimG)
+      if not expert:isContiguous() then
+         self._expert2:resizeAs(expert)
+         self._expert2:copy(expert)
+         expert = self._expert2
+      end
+      if self.dimG == 1 then
+         self._expertView2:view(expert, gaterInput:size(1), -1)
+      else
+         self._expertView2:view(expert, gaterInput:size(1), gaterInput:size(2), -1)
+      end
+      gaterGradInput:sum(self._expertView2, self.dimG+1)
+      gaterGradInput:resizeAs(gaterInput)
+      
+      -- expert updateGradInput
+      expertGradInputs:cmul(self._gaterView:expandAs(expertInputs), gradOutput)
+   end
+
+   return self.gradInput
+end
+
+-- Drops the internal buffers before conversion; they are lazily re-created
+-- with the new tensor type on the next forward/backward.
+function MixtureTable:type(type, tensorCache)
+   for _, name in ipairs{'_gaterView', '_expert', '_expertView',
+                         '_sum', '_expert2', '_expertView2'} do
+      self[name] = nil
+   end
+   return parent.type(self, type, tensorCache)
+end
+
+-- Frees the cached intermediate buffers in addition to the base-class state.
+function MixtureTable:clearState()
+   local buffers = {'_gaterView', '_expert', '_expertView',
+                    '_sum', '_expert2', '_expertView2'}
+   nn.utils.clear(self, buffers)
+   return parent.clearState(self)
+end
diff --git a/Module.lua b/Module.lua
new file mode 100644
index 0000000..19e2416
--- /dev/null
+++ b/Module.lua
@@ -0,0 +1,395 @@
+-- Abstract base class of all nn modules.  Concrete modules override
+-- updateOutput / updateGradInput / accGradParameters as needed.
+local Module = torch.class('nn.Module')
+
+function Module:__init()
+   self.gradInput = torch.Tensor()
+   self.output = torch.Tensor()
+   self._type = self.output:type()
+end
+
+-- Returns {learnable parameter tensors}, {their gradient tensors},
+-- or nothing when the module has no parameters.
+function Module:parameters()
+   if self.weight and self.bias then
+      return {self.weight, self.bias}, {self.gradWeight, self.gradBias}
+   elseif self.weight then
+      return {self.weight}, {self.gradWeight}
+   elseif self.bias then
+      return {self.bias}, {self.gradBias}
+   else
+      return
+   end
+end
+
+-- Forward pass; must fill and return self.output.  Identity by default.
+function Module:updateOutput(input)
+   return self.output
+end
+
+function Module:forward(input)
+   return self:updateOutput(input)
+end
+
+-- Backward pass: computes gradInput and accumulates parameter gradients
+-- scaled by `scale` (default 1).
+function Module:backward(input, gradOutput, scale)
+   scale = scale or 1
+   self:updateGradInput(input, gradOutput)
+   self:accGradParameters(input, gradOutput, scale)
+   return self.gradInput
+end
+
+-- Like backward(), but applies the parameter update with rate lr directly
+-- instead of accumulating gradients.
+function Module:backwardUpdate(input, gradOutput, lr)
+   self:updateGradInput(input, gradOutput)
+   self:accUpdateGradParameters(input, gradOutput, lr)
+   return self.gradInput
+end
+
+function Module:updateGradInput(input, gradOutput)
+   return self.gradInput
+end
+
+function Module:accGradParameters(input, gradOutput, scale)
+end
+
+-- Updates the parameters in place by temporarily aliasing gradWeight/gradBias
+-- to weight/bias, so accGradParameters(..., -lr) writes the update straight
+-- into the parameters.
+function Module:accUpdateGradParameters(input, gradOutput, lr)
+   local gradWeight = self.gradWeight
+   local gradBias = self.gradBias
+   self.gradWeight = self.weight
+   self.gradBias = self.bias
+   self:accGradParameters(input, gradOutput, -lr)
+   self.gradWeight = gradWeight
+   self.gradBias = gradBias
+end
+
+-- Variant installed by share(): the aliasing trick above is incorrect when
+-- parameters are shared, so accumulate then update explicitly.
+function Module:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   if self:parameters() then
+      self:zeroGradParameters()
+      self:accGradParameters(input, gradOutput, 1)
+      self:updateParameters(lr)
+   end
+end
+
+function Module:zeroGradParameters()
+   local _,gradParams = self:parameters()
+   if gradParams then
+      for i=1,#gradParams do
+         gradParams[i]:zero()
+      end
+   end
+end
+
+-- Plain SGD step: params <- params - learningRate * gradParams.
+function Module:updateParameters(learningRate)
+   local params, gradParams = self:parameters()
+   if params then
+      for i=1,#params do
+         params[i]:add(-learningRate, gradParams[i])
+      end
+   end
+end
+
+-- Toggle train/evaluation mode (honored e.g. by Dropout, BatchNormalization).
+function Module:training()
+   self.train = true
+end
+
+function Module:evaluate()
+   self.train = false
+end
+
+-- Shares the named tensor fields (e.g. 'weight', 'gradWeight') with mlp and
+-- switches both modules to the share-safe parameter-update path.
+function Module:share(mlp, ...)
+   local arg = {...}
+   for i,v in ipairs(arg) do
+      if self[v] ~= nil then
+         self[v]:set(mlp[v])
+         self.accUpdateGradParameters = self.sharedAccUpdateGradParameters
+         mlp.accUpdateGradParameters = mlp.sharedAccUpdateGradParameters
+      end
+   end
+   return self
+end
+
+-- Deep copy via in-memory serialization; any extra arguments name fields
+-- the clone should share with the original instead of copying (see share()).
+function Module:clone(...)
+   local f = torch.MemoryFile("rw"):binary()
+   f:writeObject(self)
+   f:seek(1)
+   local clone = f:readObject()
+   f:close()
+   if select('#',...) > 0 then
+      clone:share(self,...)
+   end
+   return clone
+end
+
+-- Converts every tensor held by the module to the given type; with no
+-- argument, returns the current type string.
+function Module:type(type, tensorCache)
+   if not type then
+      return self._type
+   end
+
+   tensorCache = tensorCache or {}
+
+   -- find all tensors and convert them
+   for key,param in pairs(self) do
+      self[key] = nn.utils.recursiveType(param, type, tensorCache)
+   end
+
+   self._type = type
+   return self
+end
+
+function Module:float(...)
+   return self:type('torch.FloatTensor',...)
+end
+
+function Module:double(...)
+   return self:type('torch.DoubleTensor',...)
+end
+
+function Module:cuda(...)
+   return self:type('torch.CudaTensor',...)
+end
+
+-- Re-initializes parameters; no-op by default.
+function Module:reset()
+end
+
+function Module:write(file)
+  -- Write all values in the object as a table.
+  local object = {}
+  for k, v in pairs(self) do
+    object[k] = v
+  end
+  file:writeObject(object)
+end
+
+function Module:read(file)
+  local object = file:readObject()
+  for k, v in pairs(object) do
+    self[k] = v
+  end
+end
+
+-- This function is not easy to understand. It works as follows:
+--
+-- - gather all parameter tensors for this module (and children);
+--   count all parameter values (floats)
+-- - create one ginormous memory area (Storage object) with room for all
+--   parameters
+-- - remap each parameter tensor to point to an area within the ginormous
+--   Storage, and copy it there
+--
+-- It has the effect of making all parameters point to the same memory area,
+-- which is then returned.
+--
+-- The purpose is to allow operations over all parameters (such as momentum
+-- updates and serialization), but it assumes that all parameters are of
+-- the same type (and, in the case of CUDA, on the same device), which
+-- is not always true. Use for_each() to iterate over this module and
+-- children instead.
+--
+-- Module._flattenTensorBuffer can be used by other packages (e.g. cunn)
+-- to specify the type of temporary buffers. For example, the temporary
+-- buffers for CudaTensor could be FloatTensor, to avoid GPU memory usage.
+--
+-- TODO: This logically belongs to torch.Tensor, not nn.
+Module._flattenTensorBuffer = {}
+function Module.flatten(parameters)
+
+   -- returns true if tensor occupies a contiguous region of memory (no holes)
+   local function isCompact(tensor)
+      local sortedStride, perm = torch.sort(
+            torch.LongTensor(tensor:nDimension()):set(tensor:stride()), 1, true)
+      local sortedSize = torch.LongTensor(tensor:nDimension()):set(
+            tensor:size()):index(1, perm)
+      local nRealDim = torch.clamp(sortedStride, 0, 1):sum()
+      sortedStride = sortedStride:narrow(1, 1, nRealDim):clone()
+      sortedSize   = sortedSize:narrow(1, 1, nRealDim):clone()
+      local t = tensor.new():set(tensor:storage(), 1,
+                                 sortedSize:storage(),
+                                 sortedStride:storage())
+      return t:isContiguous()
+   end
+
+   if not parameters or #parameters == 0 then
+      return torch.Tensor()
+   end
+   local Tensor = parameters[1].new
+   local TmpTensor = Module._flattenTensorBuffer[torch.type(parameters[1])] or Tensor
+
+   -- 1. construct the set of all unique storages referenced by parameter tensors
+   local storages = {}
+   local nParameters = 0
+   local parameterMeta = {}
+   for k = 1,#parameters do
+      local param = parameters[k]
+      local storage = parameters[k]:storage()
+      local storageKey = torch.pointer(storage)
+
+      if not storages[storageKey] then
+         storages[storageKey] = {storage, nParameters}
+         nParameters = nParameters + storage:size()
+      end
+
+      parameterMeta[k] = {storageOffset = param:storageOffset() +
+                                          storages[storageKey][2],
+                          size          = param:size(),
+                          stride        = param:stride()}
+   end
+
+   -- 2. construct a single tensor that will hold all the parameters
+   local flatParameters = TmpTensor(nParameters):zero()
+
+   -- 3. determine if there are elements in the storage that none of the
+   --    parameter tensors reference ('holes')
+   local tensorsCompact = true
+   for k = 1,#parameters do
+      local meta = parameterMeta[k]
+      local tmp = TmpTensor():set(
+         flatParameters:storage(), meta.storageOffset, meta.size, meta.stride)
+      tmp:fill(1)
+      tensorsCompact = tensorsCompact and isCompact(tmp)
+   end
+
+   local maskParameters  = flatParameters:byte():clone()
+   local compactOffsets  = flatParameters:long():cumsum(1)
+   local nUsedParameters = compactOffsets[-1]
+
+   -- 4. copy storages into the flattened parameter tensor
+   for _, storageAndOffset in pairs(storages) do
+      local storage, offset = table.unpack(storageAndOffset)
+      flatParameters[{{offset+1,offset+storage:size()}}]:copy(Tensor():set(storage))
+   end
+
+   -- 5. allow garbage collection
+   storages = nil
+   for k = 1,#parameters do
+       parameters[k]:set(Tensor())
+   end
+
+   -- 6. compact the flattened parameters if there were holes
+   if nUsedParameters ~= nParameters then
+      assert(tensorsCompact,
+         "Cannot gather tensors that are not compact")
+
+      flatParameters = TmpTensor(nUsedParameters):copy(
+            flatParameters:maskedSelect(maskParameters))
+      for k = 1,#parameters do
+        parameterMeta[k].storageOffset =
+              compactOffsets[parameterMeta[k].storageOffset]
+      end
+   end
+
+   if TmpTensor ~= Tensor then
+      flatParameters = Tensor(flatParameters:nElement()):copy(flatParameters)
+   end
+
+   -- 7. fix up the parameter tensors to point at the flattened parameters
+   for k = 1,#parameters do
+      parameters[k]:set(flatParameters:storage(),
+          parameterMeta[k].storageOffset,
+          parameterMeta[k].size,
+          parameterMeta[k].stride)
+   end
+
+   return flatParameters
+end
+
+-- Returns flattened (parameters, gradParameters) views; see flatten() above.
+function Module:getParameters()
+   -- get parameters
+   local parameters,gradParameters = self:parameters()
+   local p, g = Module.flatten(parameters), Module.flatten(gradParameters)
+   assert(p:nElement() == g:nElement(),
+      'check that you are sharing parameters and gradParameters')
+   if parameters then
+      for i=1,#parameters do
+         assert(parameters[i]:storageOffset() == gradParameters[i]:storageOffset(),
+            'misaligned parameter at ' .. tostring(i))
+      end
+   end
+   return p, g
+end
+
+-- Calling a module like a function runs forward(), and also backward()
+-- when gradOutput is provided.
+function Module:__call__(input, gradOutput)
+   self:forward(input)
+   if gradOutput then
+      self:backward(input, gradOutput)
+      return self.output, self.gradInput
+   else
+      return self.output
+   end
+end
+
+-- Run a callback (called with the module as an argument) in preorder over this
+-- module and its children.
+--
+function Module:apply(callback)
+    callback(self)
+
+    if self.modules then
+        for _, module in ipairs(self.modules) do
+            module:apply(callback)
+        end
+    end
+end
+
+-- Returns a flat list of all modules of the given typename in this
+-- (sub)network, together with a parallel list of their containers.
+function Module:findModules(typename, container)
+  container = container or self
+  local nodes = {}
+  local containers = {}
+  local mod_type = torch.typename(self)
+  if mod_type == typename then
+    nodes[#nodes+1] = self
+    containers[#containers+1] = container
+  end
+  -- Recurse on nodes with 'modules'
+  if (self.modules ~= nil) then
+    if (torch.type(self.modules) == 'table') then
+      for i = 1, #self.modules do
+        local child = self.modules[i]
+        local cur_nodes, cur_containers =
+          child:findModules(typename, self)
+        assert(#cur_nodes == #cur_containers,
+          'Internal error: incorrect return length')  -- This shouldn't happen
+        -- add the list items from our child to our list (ie return a
+        -- flattened table of the return nodes).
+        for j = 1, #cur_nodes do
+          nodes[#nodes+1] = cur_nodes[j]
+          containers[#containers+1] = cur_containers[j]
+        end
+      end
+    end
+  end
+  return nodes, containers
+end
+
+-- returns a list of modules
+function Module:listModules()
+   local function tinsert(to, from)
+      if torch.type(from) == 'table' then
+         for i=1,#from do
+            tinsert(to,from[i])
+         end
+      else
+         table.insert(to,from)
+      end
+   end
+   -- include self first
+   local modules = {self}
+   if self.modules then
+      for i=1,#self.modules do
+         local modulas = self.modules[i]:listModules()
+         if modulas then
+            tinsert(modules,modulas)
+         end
+      end
+   end
+   return modules
+end
+
+function Module:clearState()
+   return nn.utils.clear(self, 'output', 'gradInput')
+end
+
+-- similar to apply, recursively goes over network and calls
+-- a callback function which returns a new module replacing the old one
+function nn.Module:replace(callback)
+   local out = callback(self)
+   if self.modules then
+      for i, module in ipairs(self.modules) do
+         self.modules[i] = module:replace(callback)
+      end
+   end
+   return out
+end
diff --git a/Mul.lua b/Mul.lua
new file mode 100644
index 0000000..efa1db6
--- /dev/null
+++ b/Mul.lua
@@ -0,0 +1,38 @@
+-- nn.Mul: multiplies its input by a single learnable scalar weight.
+local Mul, parent = torch.class('nn.Mul', 'nn.Module')
+
+function Mul:__init()
+   parent.__init(self)
+   self.weight = torch.Tensor(1)
+   self.gradWeight = torch.Tensor(1)
+   self:reset()
+end
+
+-- Re-initializes the scalar uniformly in [-stdv*sqrt(3), stdv*sqrt(3)],
+-- defaulting to the 1/sqrt(fanIn) scheme (fanIn = 1 here).
+function Mul:reset(stdv)
+   local range
+   if stdv then
+      range = stdv * math.sqrt(3)
+   else
+      range = 1./math.sqrt(self.weight:size(1))
+   end
+   self.weight:uniform(-range, range)
+end
+
+-- output = weight[1] * input
+function Mul:updateOutput(input)
+   self.output:resizeAs(input):copy(input):mul(self.weight[1])
+   return self.output
+end
+
+-- gradInput = weight[1] * gradOutput
+function Mul:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input):zero():add(self.weight[1], gradOutput)
+   return self.gradInput
+end
+
+-- gradWeight[1] += scale * <input, gradOutput>
+function Mul:accGradParameters(input, gradOutput, scale)
+   self.gradWeight[1] = self.gradWeight[1] + (scale or 1) * input:dot(gradOutput)
+end
diff --git a/MulConstant.lua b/MulConstant.lua
new file mode 100644
index 0000000..e8c473b
--- /dev/null
+++ b/MulConstant.lua
@@ -0,0 +1,41 @@
+-- nn.MulConstant: multiplies the input by a fixed scalar.  When ip (in-place)
+-- is true the input/gradOutput tensors themselves are modified.
+local MulConstant, parent = torch.class('nn.MulConstant', 'nn.Module')
+
+function MulConstant:__init(constant_scalar,ip)
+  parent.__init(self)
+  assert(type(constant_scalar) == 'number', 'input is not scalar!')
+  self.constant_scalar = constant_scalar
+
+  -- default for inplace is false
+   self.inplace = ip or false
+   if (ip and type(ip) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
+end
+
+function MulConstant:updateOutput(input)
+  if self.inplace then
+    input:mul(self.constant_scalar)
+    self.output:set(input)
+  else
+    self.output:resizeAs(input)
+    self.output:copy(input)
+    self.output:mul(self.constant_scalar)
+  end
+  return self.output
+end
+
+function MulConstant:updateGradInput(input, gradOutput)
+  if self.gradInput then
+    if self.inplace then
+      gradOutput:mul(self.constant_scalar)
+      self.gradInput:set(gradOutput)
+      -- restore previous input value
+      -- NOTE(review): the div-back can lose floating-point precision and is
+      -- undefined for constant_scalar == 0 -- confirm in-place use is safe
+      input:div(self.constant_scalar)
+    else
+      self.gradInput:resizeAs(gradOutput)
+      self.gradInput:copy(gradOutput)
+      self.gradInput:mul(self.constant_scalar)
+    end
+    return self.gradInput
+  end
+end
diff --git a/MultiCriterion.lua b/MultiCriterion.lua
new file mode 100644
index 0000000..9593177
--- /dev/null
+++ b/MultiCriterion.lua
@@ -0,0 +1,40 @@
+-- nn.MultiCriterion: a weighted sum of several criterions, all evaluated on
+-- the same (input, target) pair.
+local MultiCriterion, parent = torch.class('nn.MultiCriterion', 'nn.Criterion')
+
+function MultiCriterion:__init()
+   parent.__init(self)
+   self.criterions = {}
+   self.weights = torch.DoubleStorage()
+end
+
+-- Appends a criterion with an optional weight (default 1).
+-- Returns self so that add() calls can be chained.
+function MultiCriterion:add(criterion, weight)
+   assert(criterion, 'no criterion provided')
+   weight = weight or 1
+   table.insert(self.criterions, criterion)
+   self.weights:resize(#self.criterions, true)
+   self.weights[#self.criterions] = weight
+   return self
+end
+
+function MultiCriterion:updateOutput(input, target)
+   self.output = 0
+   for i=1,#self.criterions do
+      self.output = self.output + self.weights[i]*self.criterions[i]:updateOutput(input, target)
+   end
+   return self.output
+end
+
+-- Accumulates the weighted gradients of every sub-criterion; recursive
+-- helpers are used because input may be a nested table of tensors.
+function MultiCriterion:updateGradInput(input, target)
+   self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+   nn.utils.recursiveFill(self.gradInput, 0)
+   for i=1,#self.criterions do
+      nn.utils.recursiveAdd(self.gradInput, self.weights[i], self.criterions[i]:updateGradInput(input, target))
+   end
+   return self.gradInput
+end
+
+function MultiCriterion:type(type)
+   for i,criterion in ipairs(self.criterions) do
+      criterion:type(type)
+   end
+   return parent.type(self, type)
+end
diff --git a/MultiLabelMarginCriterion.lua b/MultiLabelMarginCriterion.lua
new file mode 100644
index 0000000..a0b2a9c
--- /dev/null
+++ b/MultiLabelMarginCriterion.lua
@@ -0,0 +1,31 @@
+-- nn.MultiLabelMarginCriterion: multi-label margin loss; the heavy lifting
+-- is delegated to the C THNN implementation.
+local MultiLabelMarginCriterion, parent = torch.class('nn.MultiLabelMarginCriterion', 'nn.Criterion')
+
+function MultiLabelMarginCriterion:__init()
+   parent.__init(self)
+   self.sizeAverage = true
+   -- workspace buffer filled in updateOutput and reused by updateGradInput
+   self.isTarget = torch.Tensor()
+end
+
+function MultiLabelMarginCriterion:updateOutput(input, target)
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.MultiLabelMarginCriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.isTarget:cdata(),
+      self.sizeAverage
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+-- NOTE: relies on isTarget computed by the preceding updateOutput call.
+function MultiLabelMarginCriterion:updateGradInput(input, target)
+   input.THNN.MultiLabelMarginCriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.isTarget:cdata(),
+      self.sizeAverage
+   )
+   return self.gradInput
+end
diff --git a/MultiLabelSoftMarginCriterion.lua b/MultiLabelSoftMarginCriterion.lua
new file mode 100644
index 0000000..a73ef38
--- /dev/null
+++ b/MultiLabelSoftMarginCriterion.lua
@@ -0,0 +1,44 @@
+--[[
+-- A MultiLabel multiclass criterion based on sigmoid:
+--
+-- the loss is:
+-- l(x,y) = - sum_i (y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i]))
+-- where p[i] = exp(x[i]) / (1 + exp(x[i]))
+--
+-- and with weights:
+-- l(x,y) = - sum_i weights[i] (y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i]))
+--
+-- Implemented as the composition Sigmoid -> BCECriterion.
+--]]
+
+
+local MultiLabelSoftMarginCriterion, parent =
+torch.class('nn.MultiLabelSoftMarginCriterion', 'nn.Criterion')
+
+
+function MultiLabelSoftMarginCriterion:__init(weights)
+    parent.__init(self)
+    -- NOTE(review): 'lsm' holds a Sigmoid despite the name suggesting
+    -- log-softmax
+    self.lsm = nn.Sigmoid()
+    self.nll = nn.BCECriterion(weights)
+end
+
+function MultiLabelSoftMarginCriterion:updateOutput(input, target)
+    -- squeeze singleton dimensions so BCECriterion sees matching shapes
+    input = input:nElement() == 1 and input or input:squeeze()
+    target = target:nElement() == 1 and target or target:squeeze()
+    self.lsm:updateOutput(input)
+    self.nll:updateOutput(self.lsm.output, target)
+    self.output = self.nll.output
+    return self.output
+end
+
+function MultiLabelSoftMarginCriterion:updateGradInput(input, target)
+    local size = input:size()
+    input = input:nElement() ==1 and input or input:squeeze()
+    target = target:nElement() == 1 and target or target:squeeze()
+    self.nll:updateGradInput(self.lsm.output, target)
+    self.lsm:updateGradInput(input, self.nll.gradInput)
+    -- reshape the gradient back to the caller's original input size
+    self.gradInput:view(self.lsm.gradInput, size)
+    return self.gradInput
+end
+
+ return nn.MultiLabelSoftMarginCriterion
diff --git a/MultiMarginCriterion.lua b/MultiMarginCriterion.lua
new file mode 100644
index 0000000..1a22bde
--- /dev/null
+++ b/MultiMarginCriterion.lua
@@ -0,0 +1,54 @@
+local THNN = require 'nn.THNN'
+local MultiMarginCriterion, parent = torch.class('nn.MultiMarginCriterion', 'nn.Criterion')
+
+-- Multi-class margin loss.
+-- p:       norm degree, only 1 or 2 are supported (default 1)
+-- weights: optional 1D tensor of per-class weights
+-- margin:  margin value (default 1.0)
+function MultiMarginCriterion:__init(p, weights, margin)
+   assert(p == nil or p == 1 or p == 2, 'only p=1 and p=2 supported')
+   self.p = p or 1
+   self.margin = margin or 1.0
+   parent.__init(self)
+   self.sizeAverage = true
+   if weights then
+       assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+       self.weights = weights
+   end
+end
+
+-- Forward pass. A plain Lua number target is accepted for backward
+-- compatibility and wrapped into a reusable 1-element tensor.
+function MultiMarginCriterion:updateOutput(input, target)
+   -- backward compatibility
+   if not torch.isTensor(target) then
+     self.target_tensor = self.target_tensor or input.new(1)
+     self.target_tensor[1] = target
+     target = self.target_tensor
+   end
+   self.p = self.p or 1 -- guard for instances serialized before `p` existed
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.MultiMarginCriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage,
+      self.p,
+      THNN.optionalTensor(self.weights),
+      self.margin
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+-- Backward pass; mirrors updateOutput's number-target compatibility shim.
+function MultiMarginCriterion:updateGradInput(input, target)
+   if not torch.isTensor(target) then
+     self.target_tensor = self.target_tensor or input.new(1)
+     self.target_tensor[1] = target
+     target = self.target_tensor
+   end
+   input.THNN.MultiMarginCriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage,
+      self.p,
+      THNN.optionalTensor(self.weights),
+      self.margin
+   )
+   return self.gradInput
+end
diff --git a/Narrow.lua b/Narrow.lua
new file mode 100644
index 0000000..07322d8
--- /dev/null
+++ b/Narrow.lua
@@ -0,0 +1,33 @@
+local Narrow, parent = torch.class('nn.Narrow', 'nn.Module')
+
+-- Narrows the input along `dimension`, starting at `offset`, taking
+-- `length` elements. A negative length counts from the end of the
+-- dimension (-1 means "through the last element").
+function Narrow:__init(dimension,offset,length)
+   parent.__init(self)
+   self.dimension=dimension
+   self.index=offset
+   self.length=length or 1
+   if not dimension or not offset then
+      error('nn.Narrow(dimension, offset, length)')
+   end
+end
+
+function Narrow:updateOutput(input)
+   local length = self.length
+   if length < 0 then
+      -- translate a negative length into an absolute count:
+      -- e.g. length == -1 selects up to and including the last element
+      length = input:size(self.dimension) - self.index + self.length + 2
+   end
+   local output=input:narrow(self.dimension,self.index,length)
+   self.output = self.output:typeAs(output)
+   self.output:resizeAs(output):copy(output)
+   return self.output
+end
+
+function Narrow:updateGradInput(input, gradOutput)
+   local length = self.length
+   if length < 0 then
+      length = input:size(self.dimension) - self.index + self.length + 2
+   end
+   -- the gradient is zero everywhere outside the narrowed window
+   self.gradInput = self.gradInput:typeAs(input)
+   self.gradInput:resizeAs(input):zero()
+   self.gradInput:narrow(self.dimension,self.index,length):copy(gradOutput)
+   return self.gradInput
+end
diff --git a/NarrowTable.lua b/NarrowTable.lua
new file mode 100644
index 0000000..17e06e1
--- /dev/null
+++ b/NarrowTable.lua
@@ -0,0 +1,43 @@
+local NarrowTable, parent = torch.class('nn.NarrowTable', 'nn.Module')
+
+-- Selects a contiguous window of `length` entries from the input table,
+-- starting at `offset` (table analogue of nn.Narrow).
+function NarrowTable:__init(offset, length)
+   parent.__init(self)
+   self.offset = offset
+   self.length = length or 1
+   if not offset then
+      error('nn.NarrowTable(offset, length)')
+   end
+   
+   self.output = {}
+   self.gradInput = {}
+end
+
+function NarrowTable:updateOutput(input)
+   -- clear stale entries before re-populating the output table
+   for k,v in ipairs(self.output) do self.output[k] = nil end
+   for i=1,self.length do
+      self.output[i] = input[self.offset+i-1]
+   end
+   return self.output
+end
+
+function NarrowTable:updateGradInput(input, gradOutput)
+   for i=1,#gradOutput do
+      self.gradInput[self.offset+i-1] = gradOutput[i]
+   end
+   -- entries outside the selected window receive zero gradients
+   for i=1,#input do
+      if (i < self.offset) or (i >= self.offset + self.length) then
+         self.gradInput[i] = nn.utils.recursiveResizeAs(self.gradInput[i], input[i])
+         nn.utils.recursiveFill(self.gradInput[i], 0)
+      end
+   end
+   -- drop gradient entries beyond the current input's length
+   for i=#input+1,#self.gradInput do self.gradInput[i] = nil end
+   return self.gradInput
+end 
+
+-- Reset table state on type conversion: the cached tensors inside would
+-- otherwise be stale references of the old type.
+function NarrowTable:type(type, tensorCache)
+   self.output = {}
+   self.gradInput = {}
+   return parent.type(self, type, tensorCache)
+end
+
+NarrowTable.clearState = nn.Identity.clearState
diff --git a/Normalize.lua b/Normalize.lua
new file mode 100644
index 0000000..24c1d07
--- /dev/null
+++ b/Normalize.lua
@@ -0,0 +1,155 @@
+local Normalize, parent = torch.class('nn.Normalize', 'nn.Module')
+
+-- Normalizes each input vector to unit L_p norm.
+-- p:   norm degree (> 0; math.huge selects the infinity norm)
+-- eps: small constant added to the norm to avoid division by zero
+function Normalize:__init(p,eps)
+  parent.__init(self)
+  assert(p,'p-norm not provided')
+  assert(p > 0, p..'-norm not supported')
+  self.p = p
+  self.eps = eps or 1e-10
+end
+
+-- Forward: divides every row by its L_p norm (plus eps).
+-- Accepts a 1D vector (treated as a single row) or a 2D batch.
+function Normalize:updateOutput(input)
+  assert(input:dim() <= 2, 'only 1d layer supported')
+  local input_size = input:size()
+  if input:dim() == 1 then
+    input = input:view(1,-1)
+  end
+
+  self._output = self._output or input.new()
+  self.norm = self.norm or input.new()
+  self.buffer = self.buffer or input.new()
+
+  self._output:resizeAs(input)
+
+  if self.p == math.huge then
+    -- specialization for the infinity norm
+    self._indices = self._indices or
+      (torch.type(self.output) == 'torch.CudaTensor' and
+       torch.CudaTensor() or torch.LongTensor())
+
+    self.buffer:abs(input)
+    torch.max(self.norm, self._indices, self.buffer, 2)
+    self.norm:add(self.eps)
+  else
+    self.normp = self.normp or input.new()
+    -- buffer = |x|^p; abs() is only needed when p is odd, since even
+    -- powers already discard the sign
+    if self.p % 2 ~= 0 then
+      self.buffer:abs(input):pow(self.p)
+    else
+      self.buffer:pow(input,self.p)
+    end
+    self.normp:sum(self.buffer,2):add(self.eps)
+    self.norm:pow(self.normp,1/self.p)
+  end
+  self._output:cdiv(input, self.norm:view(-1,1):expandAs(input))
+
+  -- present the result with the caller's original (possibly 1D) shape
+  self.output:view(self._output, input_size)
+  return self.output
+end
+
+-- Backward pass. gradInput is assembled from a "diagonal" term (gradOutput
+-- scaled by the norm) minus a "cross" term (the projection of gradOutput
+-- onto the input direction), then divided by the appropriate power of the
+-- norm. Buffers are reused across calls to avoid allocations.
+function Normalize:updateGradInput(input, gradOutput)
+  assert(input:dim() <= 2, 'only 1d layer supported')
+  assert(gradOutput:dim() <= 2, 'only 1d layer supported')
+
+  local input_size = input:size()
+  if input:dim() == 1 then
+    input = input:view(1,-1)
+  end
+
+  local n = input:size(1) -- batch size
+  local d = input:size(2) -- dimensionality of vectors
+
+  self._gradInput = self._gradInput or input.new()
+  self.cross = self.cross or input.new()
+  -- compute diagonal term with gradOutput
+  self._gradInput:resize(n,d)
+  if self.p == math.huge then
+    -- specialization for the inf case
+    self._gradInput:cmul(self.norm:view(n,1,1):expand(n,d,1),gradOutput)
+    self.buffer:resizeAs(input):zero()
+    self.cross:resize(n,1)
+    self.cross:gather(input,2,self._indices)
+    self.cross:cdiv(self.norm)
+    self.buffer:scatter(2,self._indices,self.cross)
+  else
+    self._gradInput:cmul(self.normp:view(n,1):expand(n,d), gradOutput)
+    -- small optimizations for different p
+    -- buffer = input*|input|^(p-2)
+    if self.p % 2 ~= 0 then
+      -- for non-even p, need to add absolute value
+      if self.p < 2 then
+        -- add eps to avoid possible division by 0
+        self.buffer:abs(input):add(self.eps):pow(self.p-2):cmul(input)
+      else
+        self.buffer:abs(input):pow(self.p-2):cmul(input)
+      end
+    elseif self.p == 2 then
+      -- special case for p == 2, pow(x,0) = 1
+      self.buffer:copy(input)
+    else
+      -- p is even and > 2, pow(x,p) is always positive
+      self.buffer:pow(input,self.p-2):cmul(input)
+    end
+  end
+  -- compute cross term in two steps
+  self.cross:resize(n,1)
+
+  -- instead of having a huge temporary matrix (b1*b2),
+  -- do the computations as b1*(b2*gradOutput). This avoids redundant
+  -- computation and also a huge buffer of size n*d^2
+  self.buffer2 = self.buffer2 or input.new() -- nxd
+  self.buffer2:cmul(input, gradOutput)
+  self.cross:sum(self.buffer2, 2)
+
+  self.buffer:cmul(self.cross:expandAs(self.buffer))
+  self._gradInput:add(-1, self.buffer)
+
+  -- reuse cross buffer for normalization
+  if self.p == math.huge then
+    self.cross:cmul(self.norm,self.norm)
+  else
+    self.cross:cmul(self.normp,self.norm)
+  end
+  self._gradInput:cdiv(self.cross:expand(n,d))
+
+  -- restore the caller's original shape
+  self.gradInput:view(self._gradInput, input_size)
+  return self.gradInput
+end
+
+-- Prints the norm degree; uses integer formatting when p is whole.
+function Normalize:__tostring__()
+  local s
+  -- different prints if the norm is integer
+  if self.p % 1 == 0 then
+    s = '%s(%d)'
+  else
+    s = '%s(%f)'
+  end
+  return string.format(s,torch.type(self),self.p)
+end
+
+function Normalize:type(type, tensorCache)
+  -- torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
+  if type == 'torch.CudaTensor' then
+    parent.type(self, type, tensorCache)
+  else
+    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
+    -- unnecessary memory allocations.
+    local indices
+    indices, self._indices = self._indices, nil
+    parent.type(self, type, tensorCache)
+    self._indices = indices and indices:long() or nil
+  end
+  return self
+end
+
+-- Release all cached intermediate buffers.
+function Normalize:clearState()
+   nn.utils.clear(self, {
+      '_output',
+      '_indices',
+      '_gradInput',
+      'buffer',
+      'norm',
+      'normp',
+      'cross',
+   })
+   return parent.clearState(self)
+end
diff --git a/PReLU.lua b/PReLU.lua
new file mode 100644
index 0000000..2e58fba
--- /dev/null
+++ b/PReLU.lua
@@ -0,0 +1,52 @@
+local PReLU, parent = torch.class('nn.PReLU','nn.Module')
+
+-- Parametric ReLU: f(x) = x for x > 0, a*x otherwise, where the slope a
+-- is learned. nOutputPlane == 0 means one shared slope for all inputs;
+-- otherwise one slope is learned per output plane.
+function PReLU:__init(nOutputPlane)
+   parent.__init(self)
+   -- if no argument provided, use shared model (weight is scalar)
+   self.nOutputPlane = nOutputPlane or 0
+   self.weight = torch.Tensor(nOutputPlane or 1):fill(0.25)
+   self.gradWeight = torch.Tensor(nOutputPlane or 1)
+end
+
+function PReLU:updateOutput(input)
+   input.THNN.PReLU_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      self.nOutputPlane
+   )
+   return self.output
+end
+
+function PReLU:updateGradInput(input, gradOutput)
+   input.THNN.PReLU_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.weight:cdata(),
+      self.nOutputPlane
+   )
+   return self.gradInput
+end
+
+function PReLU:accGradParameters(input, gradOutput, scale)
+   -- scratch buffers used by the THNN kernel for per-plane reductions
+   self.gradWeightBuf = self.gradWeightBuf or input.new()
+   self.gradWeightBuf2 = self.gradWeightBuf2 or input.new()
+   input.THNN.PReLU_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.weight:cdata(),
+      self.gradWeight:cdata(),
+      self.gradWeightBuf:cdata(),
+      self.gradWeightBuf2:cdata(),
+      self.nOutputPlane,
+      scale or 1
+   )
+   return self.gradWeight
+end
+
+function PReLU:clearState()
+   nn.utils.clear(self, 'gradWeightBuf', 'gradWeightBuf2')
+   return parent.clearState(self)
+end
diff --git a/Padding.lua b/Padding.lua
new file mode 100644
index 0000000..7ff0cc6
--- /dev/null
+++ b/Padding.lua
@@ -0,0 +1,65 @@
+local Padding, parent = torch.class('nn.Padding', 'nn.Module')
+
+-- pad puts in [pad] amount of [value] over dimension [dim], starting at index [index] in that dimension. If pad<0, index counts from the left.  If pad>0 index counts from the right
+-- index = 1 pads before index 1.  index = 2 pads starting before index 2 and after index 1 in dimension [dim]
+function Padding:__init(dim, pad, nInputDim, value, index)
+   self.value = value or 0
+   self.index = index or 1
+   self.dim = dim
+   self.pad = pad
+   self.nInputDim = nInputDim
+   self.outputSize = torch.LongStorage()
+   parent.__init(self)
+end
+
+function Padding:updateOutput(input)
+   self.outputSize:resize(input:dim())
+   self.outputSize:copy(input:size())
+   local dim = self.dim 
+   if self.nInputDim and input:dim() ~= self.nInputDim then
+      -- input carries an extra leading batch dimension: shift the pad dim
+      dim = dim + 1
+   end
+   self.outputSize[dim] = self.outputSize[dim] + math.abs(self.pad)
+   self.output:resize(self.outputSize)
+   self.output:fill(self.value)
+   local index = self.index
+   local pad = self.pad
+   if pad > 0 then
+      -- positive pad counts from the right: convert to a left-based index
+      index = input:size(dim) - index + 2
+   else
+      pad = -pad
+   end
+   if index == 1 then
+      self.output:narrow(dim, 1 + pad, input:size(dim)):copy(input)
+   elseif index == input:size(dim) + 1 then
+      self.output:narrow(dim, 1, input:size(dim)):copy(input)
+   else
+      -- padding in the middle: copy the two input pieces around the gap
+      self.output:narrow(dim, 1, index - 1):copy(input:narrow(dim, 1, index - 1))
+      self.output:narrow(dim, index + pad, input:size(dim) - (index - 1)):copy(input:narrow(dim, index, input:size(dim) - (index - 1)))
+   end
+   return self.output
+end
+
+-- Backward: drop the padded slice from gradOutput, mirroring updateOutput.
+function Padding:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input)
+   local dim = self.dim 
+   if self.nInputDim and input:dim() ~= self.nInputDim then
+      dim = dim + 1
+   end
+   local index = self.index
+   local pad = self.pad
+   if pad > 0 then
+      index = input:size(dim) - index + 2
+   else
+      pad = -pad
+   end
+   if index == 1 then
+      self.gradInput:copy(gradOutput:narrow(dim, 1 + pad, input:size(dim)))
+   elseif index == input:size(dim) + 1 then
+      self.gradInput:copy(gradOutput:narrow(dim, 1, input:size(dim)))
+   else
+      self.gradInput:narrow(dim, 1, index - 1):copy(gradOutput:narrow(dim, 1, index - 1))
+      self.gradInput:narrow(dim, index, input:size(dim) - (index - 1)):copy(gradOutput:narrow(dim, index + pad, input:size(dim) - (index - 1)))
+   end
+   return self.gradInput
+end
diff --git a/PairwiseDistance.lua b/PairwiseDistance.lua
new file mode 100644
index 0000000..d5022a7
--- /dev/null
+++ b/PairwiseDistance.lua
@@ -0,0 +1,91 @@
+local PairwiseDistance, parent = torch.class('nn.PairwiseDistance', 'nn.Module')
+
+-- Computes the p-norm distance between the two input vectors, or between
+-- corresponding rows of two input matrices (one distance per row).
+function PairwiseDistance:__init(p)
+   parent.__init(self)
+
+   -- state
+   self.gradInput = {}
+   self.diff = torch.Tensor()
+   self.norm = p
+end 
+  
+function PairwiseDistance:updateOutput(input)
+   self.output:resize(1)
+   if input[1]:dim() == 1 then
+      self.output:resize(1)
+      self.output[1]=input[1]:dist(input[2],self.norm)
+   elseif input[1]:dim() == 2 then
+      self.diff = self.diff or input[1].new()
+      self.diff:resizeAs(input[1])
+
+      local diff = self.diff:zero()
+      diff:add(input[1], -1, input[2])
+      diff:abs()
+
+      -- per-row p-norm: (sum_j |x1_j - x2_j|^p)^(1/p)
+      self.output:resize(input[1]:size(1))
+      self.output:zero()
+      self.output:add(diff:pow(self.norm):sum(2))
+      self.output:pow(1./self.norm)
+   else
+      error('input must be vector or matrix')
+   end
+ 
+   return self.output
+end
+
+-- sign(x); ties at zero are broken randomly so the L1 subgradient is unbiased
+local function mathsign(x) 
+   if x==0 then return  2*torch.random(2)-3; end
+   if x>0 then return 1; else return -1; end
+end
+
+-- Backward pass: gradInput[1] = d(dist)/d(x1) scaled by gradOutput;
+-- gradInput[2] is its negation by symmetry of the distance.
+function PairwiseDistance:updateGradInput(input, gradOutput)
+   if input[1]:dim() > 2 then
+      error('input must be vector or matrix')
+   end
+
+   self.gradInput[1] = (self.gradInput[1] or input[1].new()):resize(input[1]:size()) 
+   self.gradInput[2] = (self.gradInput[2] or input[2].new()):resize(input[2]:size())
+   self.gradInput[1]:copy(input[1])
+   self.gradInput[1]:add(-1, input[2]) 
+   
+   if self.norm==1 then
+     self.gradInput[1]:apply(mathsign)
+   else
+     -- Note: derivative of p-norm:
+     -- d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1)
+     if (self.norm > 2) then
+        self.gradInput[1]:cmul(self.gradInput[1]:clone():abs():pow(self.norm-2))
+     end
+
+     if (input[1]:dim() > 1) then
+        self.outExpand = self.outExpand or self.output.new()
+        self.outExpand:resize(self.output:size(1), 1)
+        self.outExpand:copy(self.output)
+        self.outExpand:add(1.0e-6)  -- Prevent divide by zero errors
+        self.outExpand:pow(-(self.norm-1))
+        self.gradInput[1]:cmul(self.outExpand:expand(self.gradInput[1]:size(1),
+           self.gradInput[1]:size(2)))
+     else
+        self.gradInput[1]:mul(math.pow(self.output[1] + 1e-6, -(self.norm-1)))
+     end
+   end
+   if input[1]:dim() == 1 then
+      self.gradInput[1]:mul(gradOutput[1])
+   else
+      -- batch mode: broadcast the per-row gradOutput across all columns
+      -- via an outer product with a ones vector
+      self.grad = self.grad or gradOutput.new()
+      self.ones = self.ones or gradOutput.new()
+
+      self.grad:resizeAs(input[1]):zero()
+      self.ones:resize(input[1]:size(2)):fill(1)
+
+      self.grad:addr(gradOutput, self.ones)
+      self.gradInput[1]:cmul(self.grad)
+   end
+   self.gradInput[2]:zero():add(-1, self.gradInput[1])
+   return self.gradInput
+end
+
+function PairwiseDistance:clearState()
+   nn.utils.clear(self, 'diff', 'outExpand', 'grad', 'ones')
+   return parent.clearState(self)
+end
diff --git a/Parallel.lua b/Parallel.lua
new file mode 100644
index 0000000..7d2b4f1
--- /dev/null
+++ b/Parallel.lua
@@ -0,0 +1,115 @@
+local Parallel, parent = torch.class('nn.Parallel', 'nn.Container')
+
+-- Splits the input along inputDimension, feeds slice i to module i, and
+-- concatenates the module outputs along outputDimension.
+function Parallel:__init(inputDimension,outputDimension)
+   parent.__init(self)
+   self.modules = {}
+   self.inputDimension = inputDimension
+   self.outputDimension = outputDimension
+end
+
+function Parallel:updateOutput(input)
+   local nModule=input:size(self.inputDimension)
+   local outputs = {}
+   self.totalOutputSize = self.totalOutputSize or torch.LongStorage()
+   local totalOutputSize = self.totalOutputSize
+
+   -- first pass: run every module and accumulate the concatenated size
+   for i=1,nModule do
+      local currentInput = input:select(self.inputDimension,i)
+      local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', currentInput)
+      table.insert(outputs, currentOutput)
+      local outputSize = currentOutput:size(self.outputDimension)
+
+      if i == 1 then
+         totalOutputSize:resize(currentOutput:dim()):copy(currentOutput:size())
+      else
+         totalOutputSize[self.outputDimension] = totalOutputSize[self.outputDimension] + outputSize
+      end
+
+   end
+   self.output:resize(totalOutputSize)
+
+   -- second pass: copy each module's output into its slice of self.output
+   local offset = 1
+   for i=1,nModule do
+      local currentOutput = outputs[i]
+      local outputSize = currentOutput:size(self.outputDimension)
+      self.output:narrow(self.outputDimension, offset, outputSize):copy(currentOutput)
+      offset = offset + currentOutput:size(self.outputDimension)
+   end
+   return self.output
+end
+
+-- Backward: each module receives the gradOutput slice matching its
+-- output's extent along outputDimension.
+function Parallel:updateGradInput(input, gradOutput)
+   local nModule=input:size(self.inputDimension)
+   self.gradInput:resizeAs(input)
+
+   local offset = 1
+   for i=1,nModule do
+      local module=self.modules[i]
+      local currentInput = input:select(self.inputDimension,i)
+      local currentOutput = module.output
+      local outputSize = currentOutput:size(self.outputDimension)
+      local currentGradOutput = gradOutput:narrow(self.outputDimension, offset, outputSize)
+
+      local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', currentInput, currentGradOutput)
+
+      self.gradInput:select(self.inputDimension,i):copy(currentGradInput)
+      offset = offset + outputSize
+   end
+   return self.gradInput
+end
+
+function Parallel:accGradParameters(input, gradOutput, scale)
+   local nModule=input:size(self.inputDimension)
+
+   local offset = 1
+   for i=1,nModule do
+      local module = self.modules[i]
+      local currentOutput = module.output
+      local outputSize = currentOutput:size(self.outputDimension)
+
+      self:rethrowErrors(module, i, 'accGradParameters',
+          input:select(self.inputDimension,i),
+          gradOutput:narrow(self.outputDimension, offset,outputSize),
+          scale)
+
+      offset = offset + outputSize
+   end
+end
+
+function Parallel:accUpdateGradParameters(input, gradOutput, lr)
+   local nModule=input:size(self.inputDimension)
+
+   local offset = 1
+   for i=1,nModule do
+      local module = self.modules[i];
+      local currentOutput = module.output
+      self:rethrowErrors(module, i, 'accUpdateGradParameters',
+          input:select(self.inputDimension,i),
+          gradOutput:narrow(self.outputDimension, offset,
+                            currentOutput:size(self.outputDimension)),
+          lr)
+
+      offset = offset + currentOutput:size(self.outputDimension)
+   end
+end
+
+-- Pretty-printer: draws the module list as a tree diagram; the last module
+-- uses the 'extlast' continuation prefix so the closing branch lines up.
+function Parallel:__tostring__()
+   local tab = '  '
+   local line = '\n'
+   local next = '  |`-> '
+   local ext = '  |    '
+   local extlast = '       '
+   local last = '   ... -> '
+   local str = torch.type(self)
+   str = str .. ' {' .. line .. tab .. 'input'
+   for i=1,#self.modules do
+      if i == #self.modules then
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+      else
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+      end
+   end
+   str = str .. line .. tab .. last .. 'output'
+   str = str .. line .. '}'
+   return str
+end
diff --git a/ParallelCriterion.lua b/ParallelCriterion.lua
new file mode 100644
index 0000000..45607d5
--- /dev/null
+++ b/ParallelCriterion.lua
@@ -0,0 +1,41 @@
+local ParallelCriterion, parent = torch.class('nn.ParallelCriterion', 'nn.Criterion')
+
+-- Weighted sum of several criterions: criterion i is applied to input[i]
+-- and target[i] (or to the same target for all, when repeatTarget is set).
+function ParallelCriterion:__init(repeatTarget)
+   parent.__init(self)
+   self.criterions = {}
+   self.weights = {}
+   self.gradInput = {}
+   self.repeatTarget = repeatTarget
+end
+
+-- Append a criterion with an optional weight (default 1). Returns self
+-- so calls can be chained.
+function ParallelCriterion:add(criterion, weight)
+   assert(criterion, 'no criterion provided')
+   weight = weight or 1
+   table.insert(self.criterions, criterion)
+   table.insert(self.weights, weight)
+   return self
+end
+
+function ParallelCriterion:updateOutput(input, target)
+   self.output = 0
+   for i,criterion in ipairs(self.criterions) do
+      local target = self.repeatTarget and target or target[i]
+      self.output = self.output + self.weights[i]*criterion:updateOutput(input[i],target)
+   end
+   return self.output
+end
+
+function ParallelCriterion:updateGradInput(input, target)
+   self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+   nn.utils.recursiveFill(self.gradInput, 0)
+   for i,criterion in ipairs(self.criterions) do
+      local target = self.repeatTarget and target or target[i]
+      nn.utils.recursiveAdd(self.gradInput[i], self.weights[i], criterion:updateGradInput(input[i], target))
+   end
+   return self.gradInput
+end
+
+-- Reset gradInput on type conversion; cached tensors would be stale.
+function ParallelCriterion:type(type, tensorCache)
+   self.gradInput = {}
+   return parent.type(self, type, tensorCache)
+end
diff --git a/ParallelTable.lua b/ParallelTable.lua
new file mode 100644
index 0000000..6de9534
--- /dev/null
+++ b/ParallelTable.lua
@@ -0,0 +1,57 @@
+local ParallelTable, parent = torch.class('nn.ParallelTable', 'nn.Container')
+
+-- Applies module i to the i-th entry of the input table; outputs a table.
+function ParallelTable:__init()
+   parent.__init(self)
+   self.modules = {}
+   self.output = {}
+   self.gradInput = {}
+end
+
+function ParallelTable:updateOutput(input)
+   for i=1,#self.modules do
+      self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input[i])
+   end
+   return self.output
+end
+
+function ParallelTable:updateGradInput(input, gradOutput)
+   for i,module in ipairs(self.modules) do
+      self.gradInput[i] = self:rethrowErrors(module, i, 'updateGradInput', input[i], gradOutput[i])
+   end
+   return self.gradInput
+end
+
+function ParallelTable:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   for i,module in ipairs(self.modules) do
+      self:rethrowErrors(module, i, 'accGradParameters', input[i], gradOutput[i], scale)
+   end
+end
+
+function ParallelTable:accUpdateGradParameters(input, gradOutput, lr)
+   lr = lr or 1
+   for i,module in ipairs(self.modules) do
+      self:rethrowErrors(module, i, 'accUpdateGradParameters', input[i], gradOutput[i], lr)
+   end
+end
+
+-- Pretty-printer: draws the module list as a tree diagram; the last module
+-- uses the 'extlast' continuation prefix so the closing branch lines up.
+function ParallelTable:__tostring__()
+   local tab = '  '
+   local line = '\n'
+   local next = '  |`-> '
+   local ext = '  |    '
+   local extlast = '       '
+   local last = '   ... -> '
+   local str = torch.type(self)
+   str = str .. ' {' .. line .. tab .. 'input'
+   for i=1,#self.modules do
+      -- BUGFIX: compare against the module count (#self.modules); the
+      -- original compared the integer i to the modules table itself, which
+      -- is always false, so the last-module branch was never taken
+      -- (cf. the identical, correct loop in Parallel:__tostring__).
+      if i == #self.modules then
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+      else
+         str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+      end
+   end
+   str = str .. line .. tab .. last .. 'output'
+   str = str .. line .. '}'
+   return str
+end
diff --git a/PartialLinear.lua b/PartialLinear.lua
new file mode 100644
index 0000000..d208f52
--- /dev/null
+++ b/PartialLinear.lua
@@ -0,0 +1,113 @@
+local PartialLinear, Module = torch.class('nn.PartialLinear', 'nn.Module')
+
+--[[
+
+PartialLinear is a Linear layer that allows the user to set a collection of
+column indices. When the column indices are set, the layer will behave like a
+Linear layer that only has those columns. Meanwhile, all parameters are
+preserved, so resetting the PartialLinear layer will result in a module that
+behaves just like a regular Linear layer.
+
+This module is useful, for instance, when you want to do forward-backward on
+only a subset of a Linear layer during training but use the full Linear layer
+at test time.
+
+]]--
+
+function PartialLinear:__init(inputsize, outputsize, bias)
+   local bias = ((bias == nil) and true) or bias
+   Module.__init(self)
+
+   -- define the layer as a small network:
+   -- the LookupTable selects the weight rows of the active partition and
+   -- MM(false, true) multiplies the input by those rows transposed
+   local pt = nn.ParallelTable()
+   pt:add(nn.Identity()):add(nn.LookupTable(outputsize, inputsize))
+   self.network = nn.Sequential():add(pt):add(nn.MM(false, true))
+   if bias then
+      self.bias     = torch.Tensor(1, outputsize):zero()
+      self.gradBias = torch.Tensor(1, outputsize):zero()
+   end
+
+   -- set partition:
+   self.inputsize  = inputsize
+   self.outputsize = outputsize
+   self.allcolumns = torch.range(1, self.outputsize)
+   self:resetPartition()
+end
+
+-- Restrict the layer to the given output columns (1D index tensor).
+function PartialLinear:setPartition(indices)
+   self.partition = indices:type(self.allcolumns:type())
+end
+
+-- Restore the full set of output columns.
+function PartialLinear:resetPartition()
+   self.partition = self.allcolumns
+end
+
+function PartialLinear:parameters()
+   return {self.network:get(1):get(2).weight,     self.bias},
+          {self.network:get(1):get(2).gradWeight, self.gradBias}
+end  -- should return only the relevant partition?
+
+function PartialLinear:updateOutput(input)
+   self.output:set(self.network:forward{input, self.partition})
+   if self.bias then
+      -- add the bias entries of the active columns only
+      self.output:add(
+         self.bias:index(2, self.partition:long()):expandAs(self.output)
+      )
+      -- addBuffer: ones vector (one per batch row), used by
+      -- accGradParameters to sum gradOutput over the batch
+      self.addBuffer = self.addBuffer or input.new()
+      if self.addBuffer:nElement() ~= input:size(1) then
+         self.addBuffer:resize(input:size(1)):fill(1)
+      end
+   end
+   return self.output
+end
+
+function PartialLinear:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      self.network:updateGradInput({input, self.partition}, gradOutput)
+      self.gradInput:set(self.network.gradInput[1])
+   end
+   return self.gradInput
+end
+
+function PartialLinear:accGradParameters(input, gradOutput, scale)
+   local scale = scale or 1
+   self.network:accGradParameters({input, self.partition}, gradOutput, scale)
+   if self.bias then
+      self.buffer = self.buffer or input.new()
+      self.buffer:resize(gradOutput:size(2))
+      -- sum gradOutput over the batch, then accumulate into the gradBias
+      -- slots addressed by the current partition
+      self.buffer:mv(gradOutput:t(), self.addBuffer):mul(scale)
+      self.gradBias:indexAdd(
+         2, self.partition:long(), self.buffer:view(1, self.buffer:nElement())
+      )
+   end
+end
+
+-- Fused update: temporarily aliases the gradient tensors onto the
+-- parameter tensors so the accGradParameters call applies a -lr step
+-- directly to the weights/bias, then restores the aliases.
+function PartialLinear:accUpdateGradParameters(input, gradOutput, lr)
+   local gradWeight = self.network:get(1):get(2).gradWeight
+   local gradBias = self.gradBias
+   self.network:get(1):get(2).gradWeight = self.network:get(1):get(2).weight
+   self.gradBias = self.bias
+   self:accGradParameters(input, gradOutput, -lr)
+   self.network:get(1):get(2).gradWeight = gradWeight
+   self.gradBias = gradBias
+end
+
+function PartialLinear:zeroGradParameters()
+   self.network:zeroGradParameters()
+   self.gradBias:zero()
+end
+
+function PartialLinear:updateParameters(learningRate)
+   self.network:updateParameters(learningRate)
+   self.bias:add(-learningRate, self.gradBias)
+end
+
+-- we do not need to accumulate parameters when sharing
+PartialLinear.sharedAccUpdateGradParameters =
+   PartialLinear.accUpdateGradParameters
+
+function PartialLinear:__tostring__()
+   return torch.type(self) ..
+      string.format('(%d -> %d)', self.inputsize, self.outputsize) ..
+      (self.bias == nil and ' without bias' or '')
+end
diff --git a/Power.lua b/Power.lua
new file mode 100644
index 0000000..771183c
--- /dev/null
+++ b/Power.lua
@@ -0,0 +1,22 @@
+local Power, parent = torch.class('nn.Power','nn.Module')
+
+-- Element-wise power module: output = input ^ p.
+-- p is mandatory; omitting it raises an error.
+function Power:__init(p)
+   parent.__init(self)
+   if not p then
+      error('nn.Power(power)')
+   end
+   self.pow = p
+end
+
+-- Forward: raise every element of the input to self.pow, in-place on
+-- the reusable self.output buffer.
+function Power:updateOutput(input)
+   self.output:resizeAs(input):copy(input):pow(self.pow)
+   return self.output
+end
+
+-- Backward: d/dx x^p = p * x^(p-1), multiplied element-wise by gradOutput.
+function Power:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input):copy(input):pow(self.pow - 1)
+   self.gradInput:cmul(gradOutput):mul(self.pow)
+   return self.gradInput
+end
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e848fd8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+[![Build Status](https://travis-ci.org/torch/nn.svg?branch=master)](https://travis-ci.org/torch/nn)
+<a name="nn.dok"></a>
+# Neural Network Package #
+
+This package provides an easy and modular way to build and train simple or complex neural networks using [Torch](https://github.com/torch/torch7/blob/master/README.md):
+ * Modules are the bricks used to build neural networks. Each are themselves neural networks, but can be combined with other networks using containers to create complex neural networks:
+   * [Module](doc/module.md#nn.Module): abstract class inherited by all modules;
+   * [Containers](doc/containers.md#nn.Containers): container classes like [`Sequential`](doc/containers.md#nn.Sequential), [`Parallel`](doc/containers.md#nn.Parallel) and [`Concat`](doc/containers.md#nn.Concat);
+   * [Transfer functions](doc/transfer.md#nn.transfer.dok): non-linear functions like [`Tanh`](doc/transfer.md#nn.Tanh) and [`Sigmoid`](doc/transfer.md#nn.Sigmoid);
+   * [Simple layers](doc/simple.md#nn.simplelayers.dok): like [`Linear`](doc/simple.md#nn.Linear), [`Mean`](doc/simple.md#nn.Mean), [`Max`](doc/simple.md#nn.Max) and [`Reshape`](doc/simple.md#nn.Reshape);
+   * [Table layers](doc/table.md#nn.TableLayers): layers for manipulating `table`s like [`SplitTable`](doc/table.md#nn.SplitTable), [`ConcatTable`](doc/table.md#nn.ConcatTable) and [`JoinTable`](doc/table.md#nn.JoinTable);
+   * [Convolution layers](doc/convolution.md#nn.convlayers.dok): [`Temporal`](doc/convolution.md#nn.TemporalModules),  [`Spatial`](doc/convolution.md#nn.SpatialModules) and [`Volumetric`](doc/convolution.md#nn.VolumetricModules) convolutions;
+ * Criterions compute a gradient according to a given loss function given an input and a target:
+   * [Criterions](doc/criterion.md#nn.Criterions): a list of all criterions, including [`Criterion`](doc/criterion.md#nn.Criterion), the abstract class;
+   * [`MSECriterion`](doc/criterion.md#nn.MSECriterion): the Mean Squared Error criterion used for regression;
+   * [`ClassNLLCriterion`](doc/criterion.md#nn.ClassNLLCriterion): the Negative Log Likelihood criterion used for classification;
+ * Additional documentation:
+   * [Overview](doc/overview.md#nn.overview.dok) of the package essentials including modules, containers and training;
+   * [Training](doc/training.md#nn.traningneuralnet.dok): how to train a neural network using [optim](https://github.com/torch/optim);
+   * [Testing](doc/testing.md): how to test your modules;
+   * [Experimental Modules](https://github.com/clementfarabet/lua---nnx/blob/master/README.md): a package containing experimental modules and criteria.
diff --git a/RReLU.lua b/RReLU.lua
new file mode 100644
index 0000000..843415f
--- /dev/null
+++ b/RReLU.lua
@@ -0,0 +1,50 @@
+local ffi = require 'ffi'
+
+-- nn.RReLU: randomized rectified linear unit.
+-- The negative-side slope is drawn from [lower, upper]; the sampled slopes
+-- are kept per-element in self.noise.  The actual sampling/eval behaviour
+-- is implemented in THNN -- NOTE(review): exact train vs. eval semantics
+-- live in the THNN kernel; confirm there.
+local RReLU, parent = torch.class('nn.RReLU', 'nn.Module')
+
+-- l:  lower bound of the slope (default 1/8)
+-- u:  upper bound of the slope (default 1/3)
+-- ip: compute in-place on the input tensor (default false)
+function RReLU:__init(l, u, ip)
+   parent.__init(self)
+   self.lower = l or 1/8
+   self.upper = u or 1/3
+   -- bounds must be non-negative and ordered
+   assert(self.lower <= self.upper and self.lower >= 0 and self.upper >= 0)
+   self.noise = torch.Tensor()   -- per-element sampled slopes
+   self.train = true
+   self.inplace = ip or false
+end
+
+function RReLU:updateOutput(input)
+   -- hand THNN the raw THGenerator pointer so slope sampling uses
+   -- torch's global random generator state
+   local gen = ffi.typeof('THGenerator**')(torch._gen)[0]
+   input.THNN.RReLU_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.noise:cdata(),
+      self.lower,
+      self.upper,
+      self.train,
+      self.inplace,
+      gen
+   )
+   return self.output
+end
+
+function RReLU:updateGradInput(input, gradOutput)
+   input.THNN.RReLU_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.noise:cdata(),
+      self.lower,
+      self.upper,
+      self.train,
+      self.inplace
+   )
+   return self.gradInput
+end
+
+function RReLU:__tostring__()
+  return string.format('%s (l:%f, u:%f)', torch.type(self), self.lower, self.upper)
+end
+
+-- Release the noise buffer (keeps the module serializable/small).
+function RReLU:clearState()
+   if self.noise then self.noise:set() end
+   return parent.clearState(self)
+end
diff --git a/ReLU.lua b/ReLU.lua
new file mode 100644
index 0000000..a6eb271
--- /dev/null
+++ b/ReLU.lua
@@ -0,0 +1,5 @@
+-- nn.ReLU is Threshold(0, 0): output = max(0, x).
+-- p (optional boolean) enables in-place computation, forwarded to Threshold.
+local ReLU, Parent = torch.class('nn.ReLU', 'nn.Threshold')
+
+function ReLU:__init(p)
+   Parent.__init(self,0,0,p)
+end
diff --git a/Replicate.lua b/Replicate.lua
new file mode 100644
index 0000000..c7dedd7
--- /dev/null
+++ b/Replicate.lua
@@ -0,0 +1,57 @@
+-- nn.Replicate: replicates the input nfeatures times along a new dimension
+-- `dim`.  No data is copied -- the output is a zero-stride view of the
+-- input storage.
+local Replicate, parent = torch.class('nn.Replicate','nn.Module')
+
+-- nf:   number of replications
+-- dim:  dimension to insert (default 1)
+-- ndim: expected non-batch dimensionality of the input; when the input has
+--       more dims than ndim the first dim is treated as a batch dim and
+--       `dim` is shifted by one
+function Replicate:__init(nf, dim, ndim)
+   parent.__init(self)
+   self.nfeatures = nf
+   self.dim = dim or 1
+   self.ndim = ndim
+   assert(self.dim > 0, "Can only replicate across positive integer dimensions.")
+end
+
+function Replicate:updateOutput(input)
+   self.dim = self.dim or 1 --backwards compatible
+   assert(
+      self.dim <= input:dim()+1,
+      "Not enough input dimensions to replicate along dimension " ..
+      tostring(self.dim) .. ".")
+   -- shift the replication dim past the batch dim if one is present
+   local batchOffset = self.ndim and input:dim() > self.ndim and 1 or 0
+   local rdim = self.dim + batchOffset
+   -- build size with nfeatures inserted at rdim
+   local sz = torch.LongStorage(input:dim()+1)
+   sz[rdim] = self.nfeatures
+   for i = 1,input:dim() do
+      local offset = 0
+      if i >= rdim then
+         offset = 1
+      end
+      sz[i+offset] = input:size(i)
+   end
+   -- build strides; stride 0 at rdim makes all replicas alias the same data
+   local st = torch.LongStorage(input:dim()+1)
+   st[rdim] = 0
+   for i = 1,input:dim() do
+      local offset = 0
+      if i >= rdim then
+         offset = 1
+      end
+      st[i+offset] = input:stride(i)
+   end
+   self.output:set(input:storage(),input:storageOffset(),sz,st)
+   return self.output
+end
+
+function Replicate:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input):zero()
+   local batchOffset = self.ndim and input:dim() > self.ndim and 1 or 0
+   local rdim = self.dim + batchOffset
+   -- view gradInput with a singleton dim at rdim so the replicas' gradients
+   -- can be summed into it along that dim
+   local sz = torch.LongStorage(input:dim()+1)
+   sz[rdim] = 1
+   for i = 1,input:dim() do
+      local offset = 0
+      if i >= rdim then
+         offset = 1
+      end
+      sz[i+offset] = input:size(i)
+   end
+   local gradInput = self.gradInput:view(sz)
+   gradInput:sum(gradOutput, rdim)
+   return self.gradInput
+end
diff --git a/Reshape.lua b/Reshape.lua
new file mode 100644
index 0000000..d508369
--- /dev/null
+++ b/Reshape.lua
@@ -0,0 +1,72 @@
+-- nn.Reshape: reshapes the input to a fixed size, optionally preserving a
+-- leading batch dimension.  Accepts either a list of dimension sizes or a
+-- single LongStorage, with an optional trailing boolean batchMode flag.
+local Reshape, parent = torch.class('nn.Reshape', 'nn.Module')
+
+function Reshape:__init(...)
+   parent.__init(self)
+   local arg = {...}
+
+   self.size = torch.LongStorage()
+   self.batchsize = torch.LongStorage()
+   -- optional trailing boolean: true forces batch mode, false forbids it,
+   -- absent (nil) means auto-detect per input in updateOutput
+   if torch.type(arg[#arg]) == 'boolean' then
+      self.batchMode = arg[#arg]
+      table.remove(arg, #arg)
+   end
+   local n = #arg
+   if n == 1 and torch.typename(arg[1]) == 'torch.LongStorage' then
+      self.size:resize(#arg[1]):copy(arg[1])
+   else
+      self.size:resize(n)
+      for i=1,n do
+         self.size[i] = arg[i]
+      end
+   end
+
+   -- precompute total element count and the batched size (slot 1 is filled
+   -- with the batch size at forward time)
+   self.nelement = 1
+   self.batchsize:resize(#self.size+1)
+   for i=1,#self.size do
+      self.nelement = self.nelement * self.size[i]
+      self.batchsize[i+1] = self.size[i]
+   end
+end
+
+function Reshape:updateOutput(input)
+   -- :view requires contiguous memory, so copy if needed
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input)
+      self._input:copy(input)
+      input = self._input
+   end
+
+   -- non-batch when batchMode == false, or when auto-detecting and the
+   -- whole input exactly matches the target element count (and dim 1 ~= 1,
+   -- which would be ambiguous with a batch of size 1)
+   if (self.batchMode == false) or (
+         (self.batchMode == nil) and
+         (input:nElement() == self.nelement and input:size(1) ~= 1)
+      ) then
+      self.output:view(input, self.size)
+   else
+      self.batchsize[1] = input:size(1)
+      self.output:view(input, self.batchsize)
+   end
+   return self.output
+end
+
+function Reshape:updateGradInput(input, gradOutput)
+   if not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput)
+      self._gradOutput:copy(gradOutput)
+      gradOutput = self._gradOutput
+   end
+
+   -- reshape is element-order preserving, so the gradient is just a view
+   self.gradInput:viewAs(gradOutput, input)
+   return self.gradInput
+end
+
+
+function Reshape:__tostring__()
+  return torch.type(self) .. '(' ..
+      table.concat(self.size:totable(), 'x') .. ')'
+end
+
+function Reshape:clearState()
+   nn.utils.clear(self, '_input', '_gradOutput')
+   return parent.clearState(self)
+end
diff --git a/Select.lua b/Select.lua
new file mode 100644
index 0000000..fccdf32
--- /dev/null
+++ b/Select.lua
@@ -0,0 +1,22 @@
+-- nn.Select: selects the `index`-th slice of the input along `dimension`.
+-- Negative indices count from the end (-1 is the last slice).
+local Select, parent = torch.class('nn.Select', 'nn.Module')
+
+function Select:__init(dimension,index)
+   parent.__init(self)
+   self.dimension = dimension
+   self.index = index 
+end
+
+function Select:updateOutput(input)
+   -- resolve negative index against the current input size
+   local index = self.index < 0 and input:size(self.dimension) + self.index + 1 or self.index
+   local output = input:select(self.dimension, index);
+   -- copy (rather than alias) so downstream modules can't mutate the input
+   self.output:resizeAs(output)
+   return self.output:copy(output)
+end
+
+function Select:updateGradInput(input, gradOutput)
+   local index = self.index < 0 and input:size(self.dimension) + self.index + 1 or self.index
+   -- gradient is zero everywhere except the selected slice
+   self.gradInput:resizeAs(input)  
+   self.gradInput:zero()
+   self.gradInput:select(self.dimension,index):copy(gradOutput) 
+   return self.gradInput
+end 
diff --git a/SelectTable.lua b/SelectTable.lua
new file mode 100644
index 0000000..8eba85e
--- /dev/null
+++ b/SelectTable.lua
@@ -0,0 +1,62 @@
+-- nn.SelectTable: forwards the `index`-th entry of an input table.
+-- Negative indices count from the end of the table.
+local SelectTable, parent = torch.class('nn.SelectTable', 'nn.Module')
+
+function SelectTable:__init(index)
+   parent.__init(self)
+   self.index = index
+   self.gradInput = {}
+end
+
+function SelectTable:updateOutput(input)
+   -- handle negative indices
+   local index = self.index < 0 and #input + self.index + 1 or self.index
+
+   assert(input[index], "index does not exist in the input table")
+   self.output = input[index]
+
+   return self.output
+end
+
+-- Make t1 a zeroed, same-shaped copy of t2 (recursing into nested tables),
+-- reusing t1's tensors where possible and dropping keys absent from t2.
+local function zeroTableCopy(t1, t2)
+   for k, v in pairs(t2) do
+      if (torch.type(v) == "table") then
+         t1[k] = zeroTableCopy(t1[k] or {}, t2[k])
+      else
+         if not t1[k] then
+            t1[k] = v:clone():zero()
+         else
+            t1[k]:resizeAs(v)
+            t1[k]:zero()
+         end
+      end
+   end
+   for k, v in pairs(t1) do
+      if not t2[k] then
+         t1[k] = nil
+      end
+   end
+   return t1
+end
+
+function SelectTable:updateGradInput(input, gradOutput)
+   -- make gradInput a zeroed copy of input
+   zeroTableCopy(self.gradInput, input)
+   -- handle negative indices
+   local index = self.index < 0 and #input + self.index + 1 or self.index
+   -- copy into gradInput[index] (necessary for variable sized inputs)
+   assert(self.gradInput[index])
+   nn.utils.recursiveCopy(self.gradInput[index], gradOutput)
+
+   return self.gradInput
+end
+
+-- Reset table state before type conversion; parent.type handles tensors.
+function SelectTable:type(type, tensorCache)
+   self.gradInput = {}
+   self.output = {}
+   return parent.type(self, type, tensorCache)
+end
+
+function SelectTable:__tostring__()
+  return torch.type(self) .. '(' .. self.index .. ')'
+end
+
+SelectTable.clearState = nn.Identity.clearState
diff --git a/Sequential.lua b/Sequential.lua
new file mode 100644
index 0000000..22b0886
--- /dev/null
+++ b/Sequential.lua
@@ -0,0 +1,122 @@
+-- nn.Sequential: container that chains modules end to end -- the output of
+-- module i is the input of module i+1.  self.output / self.gradInput alias
+-- the last module's output and the first module's gradInput respectively.
+local Sequential, _ = torch.class('nn.Sequential', 'nn.Container')
+
+function Sequential:__len()
+   return #self.modules
+end
+
+-- Append a module; returns self so calls can be chained.
+function Sequential:add(module)
+   if #self.modules == 0 then
+      self.gradInput = module.gradInput
+   end
+   table.insert(self.modules, module)
+   self.output = module.output
+   return self
+end
+
+-- Insert a module at `index` (default: append), then refresh the aliases.
+function Sequential:insert(module, index)
+   index = index or (#self.modules + 1)
+   if index > (#self.modules + 1) or index < 1 then
+      error"index should be contiguous to existing modules"
+   end
+   table.insert(self.modules, index, module)
+   self.output = self.modules[#self.modules].output
+   self.gradInput = self.modules[1].gradInput
+end
+
+-- Remove the module at `index` (default: last), then refresh the aliases.
+function Sequential:remove(index)
+   index = index or #self.modules
+   if index > #self.modules or index < 1 then
+      error"index out of range"
+   end
+   table.remove(self.modules, index)
+   if #self.modules > 0 then
+       self.output = self.modules[#self.modules].output
+       self.gradInput = self.modules[1].gradInput
+   else
+       self.output = torch.Tensor()
+       self.gradInput = torch.Tensor()
+   end
+end
+
+function Sequential:updateOutput(input)
+   local currentOutput = input
+   for i=1,#self.modules do
+      -- rethrowErrors (from nn.Container) annotates errors with the index
+      -- of the failing module
+      currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', currentOutput)
+   end
+   self.output = currentOutput
+   return currentOutput
+end
+
+-- Backpropagate gradients through the chain in reverse, using each
+-- module's cached output as the next module's "input".
+function Sequential:updateGradInput(input, gradOutput)
+   local currentGradOutput = gradOutput
+   local currentModule = self.modules[#self.modules]
+   for i=#self.modules-1,1,-1 do
+      local previousModule = self.modules[i]
+      currentGradOutput = self:rethrowErrors(currentModule, i+1, 'updateGradInput', previousModule.output, currentGradOutput)
+      currentModule = previousModule
+   end
+   currentGradOutput = self:rethrowErrors(currentModule, 1, 'updateGradInput', input, currentGradOutput)
+   self.gradInput = currentGradOutput
+   return currentGradOutput
+end
+
+function Sequential:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+
+   local currentGradOutput = gradOutput
+   local currentModule = self.modules[#self.modules]
+   for i=#self.modules-1,1,-1 do
+      local previousModule = self.modules[i]
+      self:rethrowErrors(currentModule, i+1, 'accGradParameters', previousModule.output, currentGradOutput, scale)
+      -- relies on gradInput computed by a prior updateGradInput pass
+      currentGradOutput = currentModule.gradInput
+      currentModule = previousModule
+   end
+
+   self:rethrowErrors(currentModule, 1, 'accGradParameters', input, currentGradOutput, scale)
+end
+
+-- Combined updateGradInput + accGradParameters in a single reverse pass.
+function Sequential:backward(input, gradOutput, scale)
+   scale = scale or 1
+   local currentGradOutput = gradOutput
+   local currentModule = self.modules[#self.modules]
+   for i=#self.modules-1,1,-1 do
+      local previousModule = self.modules[i]
+      currentGradOutput = self:rethrowErrors(currentModule, i+1, 'backward', previousModule.output, currentGradOutput, scale)
+      currentModule.gradInput = currentGradOutput
+      currentModule = previousModule
+   end
+   currentGradOutput = self:rethrowErrors(currentModule, 1, 'backward', input, currentGradOutput, scale)
+   self.gradInput = currentGradOutput
+   return currentGradOutput
+end
+
+function Sequential:accUpdateGradParameters(input, gradOutput, lr)
+   local currentGradOutput = gradOutput
+   local currentModule = self.modules[#self.modules]
+   for i=#self.modules-1,1,-1 do
+      local previousModule = self.modules[i]
+      self:rethrowErrors(currentModule, i+1, 'accUpdateGradParameters', previousModule.output, currentGradOutput, lr)
+      currentGradOutput = currentModule.gradInput
+      currentModule = previousModule
+   end
+
+   self:rethrowErrors(currentModule, 1, 'accUpdateGradParameters', input, currentGradOutput, lr)
+end
+
+
+-- Pretty-print the chain as [input -> (1) -> (2) -> ... -> output].
+function Sequential:__tostring__()
+   local tab = '  '
+   local line = '\n'
+   local next = ' -> '
+   local str = 'nn.Sequential'
+   str = str .. ' {' .. line .. tab .. '[input'
+   for i=1,#self.modules do
+      str = str .. next .. '(' .. i .. ')'
+   end
+   str = str .. next .. 'output]'
+   for i=1,#self.modules do
+      str = str .. line .. tab .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab)
+   end
+   str = str .. line .. '}'
+   return str
+end
diff --git a/Sigmoid.lua b/Sigmoid.lua
new file mode 100644
index 0000000..0126f6f
--- /dev/null
+++ b/Sigmoid.lua
@@ -0,0 +1,19 @@
+-- nn.Sigmoid: thin wrapper over the THNN sigmoid kernels.
+local Sigmoid = torch.class('nn.Sigmoid', 'nn.Module')
+
+function Sigmoid:updateOutput(input)
+   input.THNN.Sigmoid_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+function Sigmoid:updateGradInput(input, gradOutput)
+   -- the kernel takes the cached forward output (last arg), not the input
+   input.THNN.Sigmoid_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/SmoothL1Criterion.lua b/SmoothL1Criterion.lua
new file mode 100644
index 0000000..be636a9
--- /dev/null
+++ b/SmoothL1Criterion.lua
@@ -0,0 +1,32 @@
+-- nn.SmoothL1Criterion: THNN-backed smooth L1 (Huber-style) loss.
+local SmoothL1Criterion, parent = torch.class('nn.SmoothL1Criterion', 'nn.Criterion')
+
+-- sizeAverage: divide the loss by the number of elements (default true);
+-- explicit nil-check so a caller can pass `false`
+function SmoothL1Criterion:__init(sizeAverage)
+   parent.__init(self)
+   if sizeAverage ~= nil then
+     self.sizeAverage = sizeAverage
+   else
+     self.sizeAverage = true
+   end
+end
+
+function SmoothL1Criterion:updateOutput(input, target)
+   -- THNN writes the scalar loss into a 1-element tensor
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.SmoothL1Criterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function SmoothL1Criterion:updateGradInput(input, target)
+   input.THNN.SmoothL1Criterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage
+   )
+   return self.gradInput
+end
diff --git a/SoftMarginCriterion.lua b/SoftMarginCriterion.lua
new file mode 100644
index 0000000..96ccda8
--- /dev/null
+++ b/SoftMarginCriterion.lua
@@ -0,0 +1,24 @@
+-- nn.SoftMarginCriterion: THNN-backed soft-margin loss; loss is averaged
+-- over elements (sizeAverage is always true here).
+local SoftMarginCriterion, parent = torch.class('nn.SoftMarginCriterion', 'nn.Criterion')
+
+function SoftMarginCriterion:__init()
+   parent.__init(self)
+   self.sizeAverage = true
+end
+
+function SoftMarginCriterion:updateOutput(input, target)
+   -- THNN writes the scalar loss into a 1-element tensor
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.SoftMarginCriterion_updateOutput(
+      input:cdata(), target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage)
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+function SoftMarginCriterion:updateGradInput(input, target)
+   input.THNN.SoftMarginCriterion_updateGradInput(
+      input:cdata(), target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage)
+   return self.gradInput
+end
diff --git a/SoftMax.lua b/SoftMax.lua
new file mode 100644
index 0000000..23a444c
--- /dev/null
+++ b/SoftMax.lua
@@ -0,0 +1,19 @@
+-- nn.SoftMax: thin wrapper over the THNN softmax kernels.
+local SoftMax, _ = torch.class('nn.SoftMax', 'nn.Module')
+
+function SoftMax:updateOutput(input)
+   input.THNN.SoftMax_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+function SoftMax:updateGradInput(input, gradOutput)
+   -- the kernel takes the cached forward output (last arg), not the input
+   input.THNN.SoftMax_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/SoftMin.lua b/SoftMin.lua
new file mode 100644
index 0000000..7da2a65
--- /dev/null
+++ b/SoftMin.lua
@@ -0,0 +1,31 @@
+-- nn.SoftMin: softmin(x) = softmax(-x).  Implemented by negating the input
+-- into a scratch buffer and reusing the THNN softmax kernels.
+local SoftMin, parent = torch.class('nn.SoftMin', 'nn.Module')
+
+function SoftMin:updateOutput(input)
+   self.mininput = self.mininput or input.new()
+   self.mininput:resizeAs(input):copy(input):mul(-1)
+   input.THNN.SoftMax_updateOutput(
+      self.mininput:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+function SoftMin:updateGradInput(input, gradOutput)
+   self.mininput = self.mininput or input.new()
+   self.mininput:resizeAs(input):copy(input):mul(-1)
+
+   input.THNN.SoftMax_updateGradInput(
+      self.mininput:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+
+   -- chain rule for the input negation: d(-x)/dx = -1
+   self.gradInput:mul(-1)
+   return self.gradInput
+end
+
+function SoftMin:clearState()
+   if self.mininput then self.mininput:set() end
+   return parent.clearState(self)
+end
diff --git a/SoftPlus.lua b/SoftPlus.lua
new file mode 100644
index 0000000..f77b253
--- /dev/null
+++ b/SoftPlus.lua
@@ -0,0 +1,35 @@
+-- nn.SoftPlus: smooth approximation of ReLU, f(x) = (1/beta)*log(1+exp(beta*x)).
+local SoftPlus, parent = torch.class('nn.SoftPlus', 'nn.Module')
+
+function SoftPlus:__init(beta)
+   parent.__init(self)
+   self.beta = beta or 1  -- Beta controls sharpness of transfer function
+   self.threshold = 20    -- Avoid floating point issues with exp(x), x>20
+end
+
+function SoftPlus:updateOutput(input)
+   -- f(x) = 1/beta * log(1 + exp(beta * x))
+   input.THNN.SoftPlus_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.beta,
+      self.threshold
+   )
+   return self.output
+end
+
+function SoftPlus:updateGradInput(input, gradOutput)
+   -- d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+   -- SINCE
+   -- y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+   -- THEREFORE:
+   -- d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+   -- i.e. the derivative is recovered from the cached output y alone
+   input.THNN.SoftPlus_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata(),
+      self.beta,
+      self.threshold
+   )
+   return self.gradInput
+end
diff --git a/SoftShrink.lua b/SoftShrink.lua
new file mode 100644
index 0000000..67af15a
--- /dev/null
+++ b/SoftShrink.lua
@@ -0,0 +1,25 @@
+-- nn.SoftShrink: THNN-backed soft-shrinkage with threshold `lambda`.
+local SoftShrink, parent = torch.class('nn.SoftShrink', 'nn.Module')
+
+function SoftShrink:__init(lam)
+   parent.__init(self)
+   self.lambda = lam or 0.5
+end
+
+function SoftShrink:updateOutput(input)
+   input.THNN.SoftShrink_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.lambda
+   )
+   return self.output
+end
+
+function SoftShrink:updateGradInput(input, gradOutput)
+   input.THNN.SoftShrink_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.lambda
+   )
+   return self.gradInput
+end
diff --git a/SoftSign.lua b/SoftSign.lua
new file mode 100644
index 0000000..ee72011
--- /dev/null
+++ b/SoftSign.lua
@@ -0,0 +1,20 @@
+-- nn.SoftSign: f(x) = x / (1 + |x|), computed in Lua with scratch buffers.
+local SoftSign, parent = torch.class('nn.SoftSign', 'nn.Module')
+
+function SoftSign:updateOutput(input)
+   self.temp = self.temp or input.new()
+   -- temp = 1 + |x|
+   self.temp:resizeAs(input):copy(input):abs():add(1)
+   self.output:resizeAs(input):copy(input):cdiv(self.temp)
+   return self.output
+end
+
+function SoftSign:updateGradInput(input, gradOutput)
+   self.tempgrad = self.tempgrad or input.new()
+   -- tempgrad = (1 + |x|)^2 -- the trailing cmul squares the buffer in place
+   self.tempgrad:resizeAs(self.output):copy(input):abs():add(1):cmul(self.tempgrad)
+   -- f'(x) = 1 / (1 + |x|)^2
+   self.gradInput:resizeAs(input):copy(gradOutput):cdiv(self.tempgrad)
+   return self.gradInput
+end
+
+function SoftSign:clearState()
+   nn.utils.clear(self, 'temp', 'tempgrad')
+   return parent.clearState(self)
+end
diff --git a/SparseJacobian.lua b/SparseJacobian.lua
new file mode 100644
index 0000000..19334d1
--- /dev/null
+++ b/SparseJacobian.lua
@@ -0,0 +1,277 @@
+-- Jacobian test utilities for modules that take sparse inputs shaped
+-- (n, 2) = {index, value} pairs: column 2 holds the values that are
+-- perturbed (forward differences) or differentiated (backprop).
+nn.SparseJacobian = {}
+
+-- Jacobian via backpropagation.  With param/dparam, the Jacobian is taken
+-- w.r.t. the parameters; otherwise w.r.t. the input values (column 2).
+function nn.SparseJacobian.backward (module, input, param, dparam)
+   local doparam = 0
+   if param then
+      doparam = 1
+   end
+   
+   -- output deriv
+   module:forward(input)
+   local dout = module.output.new():resizeAs(module.output)
+   -- 1D view
+   local sdout = module.output.new(dout:storage(), 1, dout:nElement())
+   -- jacobian matrix to calculate
+   local jacobian
+   if doparam == 1 then
+      jacobian = torch.Tensor(param:nElement(), dout:nElement()):zero()
+   else
+      jacobian = torch.Tensor(input:size(1), dout:nElement()):zero()
+   end
+
+   -- backprop a one-hot gradOutput per output element to get one Jacobian
+   -- column at a time
+   for i=1,sdout:nElement() do
+      dout:zero()
+      sdout[i] = 1
+      module:zeroGradParameters()
+      local din = module:updateGradInput(input, dout)
+      module:accGradParameters(input, dout)
+      if doparam == 1 then
+         jacobian:select(2,i):copy(dparam)
+      else
+         jacobian:select(2,i):copy(din:select(2,2))
+      end
+   end
+
+   return jacobian
+end
+
+
+-- Like backward, but exercises accUpdateGradParameters (lr = 1): each
+-- column records the parameter values after a single update.
+function nn.SparseJacobian.backwardUpdate (module, input, param)
+
+   -- output deriv
+   module:forward(input)
+   local dout = module.output.new():resizeAs(module.output)
+   -- 1D view
+   local sdout = module.output.new(dout:storage(),1,dout:nElement())
+   -- jacobian matrix to calculate
+   local jacobian = torch.Tensor(param:nElement(),dout:nElement()):zero()
+
+   -- original param
+   local params = module:parameters()
+   local origparams = {}
+   for j=1,#params do
+      table.insert(origparams, params[j]:clone())
+   end
+
+   for i=1,sdout:nElement() do
+      -- Reset parameters
+      for j=1,#params do
+         params[j]:copy(origparams[j])
+      end
+      dout:zero()
+      sdout[i] = 1
+      module:zeroGradParameters()
+      module:updateGradInput(input, dout)
+      module:accUpdateGradParameters(input, dout, 1)
+      jacobian:select(2,i):copy(param)
+   end
+
+   -- restore the module's parameters before returning
+   for j=1,#params do
+      params[j]:copy(origparams[j])
+   end
+
+   return jacobian
+end
+
+-- Jacobian via central finite differences, perturbing each value by
+-- +/- `small` and differencing the outputs.
+function nn.SparseJacobian.forward(module, input, param)
+   local doparam = 0
+   if param then
+      doparam = 1
+   end
+   param = param or input
+
+   -- perturbation amount
+   local small = 1e-6
+   -- 1D view of input
+   --local tst = param:storage()
+   local sin 
+   if doparam == 1 then
+      sin = param.new(param):resize(param:nElement())
+   else
+      sin = input.new(input):select(2,2)
+   end
+   
+   local out = module:forward(input)
+   -- jacobian matrix to calculate
+   local jacobian 
+   if doparam == 1 then
+      jacobian = torch.Tensor():resize(param:nElement(),
+                                       out:nElement())
+   else
+      jacobian = torch.Tensor():resize(input:size(1),
+                                       out:nElement())
+   end
+
+   local outa = torch.Tensor(jacobian:size(2))
+   local outb = torch.Tensor(jacobian:size(2))
+   
+   for i=1,sin:nElement() do      
+      sin[i] = sin[i] - small
+      outa:copy(module:forward(input))
+      sin[i] = sin[i] + 2*small
+      outb:copy(module:forward(input))
+      sin[i] = sin[i] - small
+
+      -- central difference: (f(x+h) - f(x-h)) / 2h
+      outb:add(-1,outa):div(2*small)
+      jacobian:select(1,i):copy(outb)
+   end
+
+   return jacobian
+end
+
+-- Finite-difference counterpart of backwardUpdate: rows record
+-- param - d(output)/d(param), matching a lr=1 update.
+function nn.SparseJacobian.forwardUpdate(module, input, param)
+   -- perturbation amount
+   local small = 1e-6
+   -- 1D view of input
+   --local tst = param:storage()
+   local sin =  param.new(param):resize(param:nElement())--param.new(tst,1,tst:size())
+   -- jacobian matrix to calculate
+   local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())
+   
+   local outa = torch.Tensor(jacobian:size(2))
+   local outb = torch.Tensor(jacobian:size(2))
+   
+   for i=1,sin:nElement() do      
+      sin[i] = sin[i] - small
+      outa:copy(module:forward(input))
+      sin[i] = sin[i] + 2*small
+      outb:copy(module:forward(input))
+      sin[i] = sin[i] - small
+
+      outb:add(-1,outa):div(2*small)
+      jacobian:select(1,i):copy(outb)
+      -- row = sin[i] - derivative, i.e. the expected post-update value
+      jacobian:select(1,i):mul(-1)
+      jacobian:select(1,i):add(sin[i])
+   end
+   return jacobian
+end
+
+-- Max abs difference between finite-difference and backprop Jacobians
+-- w.r.t. the input values, on a random input in [minval, maxval].
+function nn.SparseJacobian.testJacobian (module, input, minval, maxval)
+   minval = minval or -2
+   maxval = maxval or 2
+   local inrange = maxval - minval
+   input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+   local jac_fprop = nn.SparseJacobian.forward(module,input)
+   local jac_bprop = nn.SparseJacobian.backward(module,input)
+   local error = jac_fprop-jac_bprop
+   return error:abs():max()
+end
+
+-- Same comparison, but w.r.t. the given parameter tensor.
+function nn.SparseJacobian.testJacobianParameters (module, input, param, dparam, minval, maxval)
+   minval = minval or -2
+   maxval = maxval or 2
+   local inrange = maxval - minval
+   input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+   param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+   local jac_bprop = nn.SparseJacobian.backward(module, input, param, dparam)
+   local jac_fprop = nn.SparseJacobian.forward(module, input, param)
+   local error = jac_fprop - jac_bprop
+   return error:abs():max()
+end
+
+-- Same comparison for the accUpdateGradParameters path.
+function nn.SparseJacobian.testJacobianUpdateParameters (module, input, param, minval, maxval)
+   minval = minval or -2
+   maxval = maxval or 2
+   local inrange = maxval - minval
+   input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+   param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+   local params_bprop = nn.SparseJacobian.backwardUpdate(module, input, param)
+   local params_fprop = nn.SparseJacobian.forwardUpdate(module, input, param)
+
+   local error = params_fprop - params_bprop
+   return error:abs():max()
+end
+
+-- Serialize the module to disk, reload it, and verify forward/backward
+-- results match.  Returns (max forward error, max backward error).
+function nn.SparseJacobian.testIO(module,input, minval, maxval)
+   minval = minval or -2
+   maxval = maxval or 2
+   local inrange = maxval - minval
+
+   -- run module
+   module:forward(input)
+   local go = module.output:clone():copy(torch.rand(module.output:nElement()):mul(inrange):add(minval))
+   module:zeroGradParameters()
+   module:updateGradInput(input,go)
+   module:accGradParameters(input,go)
+
+   local fo = module.output:clone()
+   local bo = module.gradInput:clone()
+
+   -- write module
+   local f = torch.DiskFile('tmp.bin','w'):binary()
+   f:writeObject(module)
+   f:close()
+   -- read module
+   local m = torch.DiskFile('tmp.bin'):binary():readObject()
+   m:forward(input)
+   m:zeroGradParameters()
+   m:updateGradInput(input,go)
+   m:accGradParameters(input,go)
+   -- cleanup
+   os.remove('tmp.bin')
+
+   local fo2 = m.output:clone()
+   local bo2 = m.gradInput:clone()
+
+   local errf = fo - fo2
+   local errb = bo - bo2
+   return errf:abs():max(), errb:abs():max()
+end
+
+-- Cross-check the four parameter-update code paths (accGradParameters,
+-- accUpdateGradParameters, and both with shared weights) against each
+-- other; returns a table of error norms keyed by path name.
+function nn.SparseJacobian.testAllUpdate(module, input, weight, gradWeight)
+   local gradOutput
+   local lr = torch.uniform(0.1, 1)
+   local errors = {}
+
+   -- accGradParameters
+   local maccgp = module:clone()
+   local weightc = maccgp[weight]:clone()
+   maccgp:forward(input)
+   gradOutput = torch.rand(maccgp.output:size())
+   maccgp:zeroGradParameters()
+   maccgp:updateGradInput(input, gradOutput)
+   maccgp:accGradParameters(input, gradOutput)
+   maccgp:updateParameters(lr)
+   errors["accGradParameters"] = (weightc-maccgp[gradWeight]*lr-maccgp[weight]):norm()
+   
+   -- accUpdateGradParameters
+   local maccugp = module:clone()
+   maccugp:forward(input)
+   maccugp:updateGradInput(input, gradOutput)
+   maccugp:accUpdateGradParameters(input, gradOutput, lr)
+   errors["accUpdateGradParameters"] = (maccugp[weight]-maccgp[weight]):norm()
+
+   -- shared, accGradParameters
+   -- NOTE(review): the shared-path checks reuse weightc/maccgp from the
+   -- first section as the reference; the lr*2 factor reflects the two
+   -- modules accumulating into shared weights
+   local macsh1 = module:clone()
+   local macsh2 = module:clone()
+   macsh2:share(macsh1, weight)
+   macsh1:forward(input)
+   macsh2:forward(input)
+   macsh1:zeroGradParameters()
+   macsh2:zeroGradParameters()
+   macsh1:updateGradInput(input, gradOutput)
+   macsh2:updateGradInput(input, gradOutput)
+   macsh1:accGradParameters(input, gradOutput)
+   macsh2:accGradParameters(input, gradOutput)
+   macsh1:updateParameters(lr)
+   macsh2:updateParameters(lr)
+   local err = (weightc-maccgp[gradWeight]*(lr*2)-macsh1[weight]):norm()
+   err = err + (weightc-maccgp[gradWeight]*(lr*2)-macsh2[weight]):norm()
+   errors["accGradParameters [shared]"] = err
+   
+   -- shared, accUpdateGradParameters
+   local macshu1 = module:clone()
+   local macshu2 = module:clone()
+   macshu2:share(macshu1, weight)
+   macshu1:forward(input)
+   macshu2:forward(input)
+   macshu1:updateGradInput(input, gradOutput)
+   macshu2:updateGradInput(input, gradOutput)
+   macshu1:accUpdateGradParameters(input, gradOutput, lr)
+   macshu2:accUpdateGradParameters(input, gradOutput, lr)
+   err = (weightc-maccgp[gradWeight]*(lr*2)-macshu1[weight]):norm()
+   err = err + (weightc-maccgp[gradWeight]*(lr*2)-macshu2[weight]):norm()
+   errors["accUpdateGradParameters [shared]"] = err
+
+   return errors
+end
diff --git a/SparseLinear.lua b/SparseLinear.lua
new file mode 100644
index 0000000..7c3edad
--- /dev/null
+++ b/SparseLinear.lua
@@ -0,0 +1,242 @@
local THNN = require 'nn.THNN'
local SparseLinear, parent = torch.class('nn.SparseLinear', 'nn.Module')

-- Bookkeeping states for sparse parameter updates (see accGradParameters):
-- gradients from exactly ONE input can be applied/zeroed sparsely; after
-- multiple accumulations the dense fallback in nn.Module must be used.
local NO_LAST_INPUT = 0
local ONE_LAST_INPUT = 1
local ACC_MULTIPLE_TIMES = 2
+
--- Sparse linear layer: y = Wx + b for sparse inputs x.
-- @param inputSize   number of input features
-- @param outputSize  number of output features
-- @param doGradInput compute (dense) gradInput when true; defaults to false
function SparseLinear:__init(inputSize, outputSize, doGradInput)
   parent.__init(self)

   -- learnable parameters and their gradient accumulators
   self.weight     = torch.Tensor(outputSize, inputSize):zero()
   self.bias       = torch.Tensor(outputSize):zero()
   self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
   self.gradBias   = torch.Tensor(outputSize):zero()
   self.weightDecay = 0

   self.doGradInput = doGradInput or false
   assert(type(self.doGradInput) == 'boolean')

   -- bookkeeping that enables sparse parameter updates
   self.lastInput = nil
   self.sparseUpdate = NO_LAST_INPUT
   self.formatted_input = nil

   -- state
   self.gradInput = {}
   self.output:resize(outputSize)

   self:reset()
end
+
--- Re-initialize parameters uniformly in [-stdv, stdv].
-- Default stdv is 1/sqrt(inputSize); the bias is scaled close to zero.
function SparseLinear:reset(stdv)
   stdv = stdv and stdv * math.sqrt(3)
               or 1. / math.sqrt(self.weight:size(2))
   self.weight:uniform(-stdv, stdv)
   self.bias:uniform(-stdv, stdv):mul(0.000001)
end
+
--- Normalize input into the table-of-tensors form used internally.
-- @return input (possibly wrapped), batchMode flag, legacyMode flag
function SparseLinear:reshapeInput(input)
   if type(input) == 'table' then
      return input, true, false
   elseif input:dim() == 2 then
      -- single 2D sparse sample: wrap as a one-element batch
      return {input}, false, false
   end
   -- anything else is the legacy tensor layout
   return input, true, true
end
+
--- Forward pass. Packs the batch into one (N,3) tensor of
-- (batchIndex, featureIndex, value) rows and calls the THNN kernel.
function SparseLinear:updateOutput(input)
   -- a forward between two accumulations invalidates the sparse-update path
   if self.sparseUpdate == ONE_LAST_INPUT then
      self.sparseUpdate = ACC_MULTIPLE_TIMES
   end
   local input, batchMode, legacyMode = self:reshapeInput(input)
   self.legacyMode = legacyMode

   if legacyMode then
      input.THNN.SparseLinear_legacyUpdateOutput(
         input:cdata(),
         self.output:cdata(),
         self.weight:cdata(),
         self.bias:cdata()
      )
      return self.output
   end

   local nbatches = #input
   if nbatches == 0 then
      -- empty batch: output is just the bias
      self.output:copy(self.bias)
      return self.output
   end

   -- flatten the table of per-sample sparse tensors into one (total, 3) tensor
   self.formatted_input = self.formatted_input or input[1].new()
   local total = 0
   for _, sample in ipairs(input) do total = total + sample:size(1) end
   self.formatted_input:resize(total, 3)

   local offset = 1
   for i, sample in ipairs(input) do
      local rows = sample:size(1)
      local chunk = self.formatted_input:narrow(1, offset, rows)
      chunk:narrow(2, 2, 2):copy(sample)   -- columns 2..3: (index, value)
      chunk:select(2, 1):fill(i)           -- column 1: batch index
      offset = offset + rows
   end

   self.output:resize(nbatches, self.weight:size(1))
   input[1].THNN.SparseLinear_updateOutput(
      self.formatted_input:cdata(),
      self.output:cdata(),
      self.weight:cdata(),
      self.bias:cdata()
   )

   -- fix output size for batchSize = 1
   if not batchMode then
      self.output = self.output[1]
   end
   return self.output
end
+
--- Accumulate parameter gradients (scaled by `scale`, default 1).
-- Also records the formatted input the FIRST time gradients are accumulated,
-- so updateParameters/zeroGradParameters can later touch only the rows that
-- were actually active. A second accumulation disables that shortcut.
function SparseLinear:accGradParameters(input, gradOutput, scale)
   local input, batchMode, legacyMode = self:reshapeInput(input)
   self.legacyMode = legacyMode
   self.lastInput = self.lastInput or gradOutput.new()
   if self.sparseUpdate == NO_LAST_INPUT then
      -- first accumulation since the last zeroGradParameters: remember input
      local v = self.formatted_input
      if self.legacyMode then v = input end
      self.lastInput:resizeAs(v):copy(v)
      self.sparseUpdate = ONE_LAST_INPUT
   elseif self.sparseUpdate == ONE_LAST_INPUT then
      -- accumulated more than once: sparse update no longer valid
      self.sparseUpdate = ACC_MULTIPLE_TIMES
   end

   if legacyMode then
      input.THNN.SparseLinear_legacyAccGradParameters(
         input:cdata(),
         gradOutput:cdata(),
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.weight:cdata(),
         self.bias:cdata(),
         self.weightDecay or 0,
         scale or 1
      )
   else
      if not batchMode then
         -- kernel expects a 2D gradOutput; add the singleton batch dimension
         gradOutput:resize(1, gradOutput:size(1))
      end

      -- sort the (batch, feature) rows by feature-major order; the kernel
      -- relies on entries for the same column being adjacent
      local rows = self.formatted_input:select(2, 1)
      local cols = self.formatted_input:select(2, 2)
      local sortinds = cols * gradOutput:size(1) + rows
      local _, inds = sortinds:sort(1, false)
      local newinput = self.formatted_input:index(1, inds)
      input[1].THNN.SparseLinear_accGradParameters(
         newinput:cdata(),
         gradOutput:cdata(),
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.weight:cdata(),
         self.bias:cdata(),
         self.weightDecay or 0,
         scale or 1
      )
   end
end
+
--- Compute gradInput when doGradInput is enabled (it is dense internally).
-- Legacy mode fills a (batch, inputSize, 2) tensor of (index, grad) pairs;
-- otherwise gradInput is a table of per-sample (inputSize, 2) tensors.
-- Fix: removed an unused (and misspelled) local `indicies` that allocated a
-- torch.range tensor on every call for no effect.
function SparseLinear:updateGradInput(input, gradOutput)
   if self.legacyMode then
      if type(self.gradInput) ~= type(gradOutput) then self.gradInput = gradOutput.new() end
      self.gradInput:resizeAs(input)
   else
      self.gradInput = {}
   end
   if self.doGradInput then
      -- GradInput should be dense anyway
      local gi
      local batchMode = true
      if gradOutput:dim() == 1 then
         gi = self.weight:t()*gradOutput
         batchMode = false
      elseif gradOutput:dim() == 2 then
         gi = gradOutput*self.weight
      end
      local ini = self.weight:size(2)

      if self.legacyMode then
         local batches = self.gradInput:size(1)
         self.gradInput:resize(batches, ini, 2)
         -- column 1: feature indices 1..ini, column 2: the dense gradients
         self.gradInput:select(3,1):copy(torch.repeatTensor(torch.range(1, ini), batches, 1))
         self.gradInput:select(3,2):copy(gi)
      else
         if not batchMode then gi:resize(1, ini) end
         for i = 1,gi:size(1) do
            self.gradInput[i] = gradOutput.new(ini, 2)
            self.gradInput[i]:select(2, 2):copy(gi[i])
            self.gradInput[i]:select(2, 1):range(1, ini)
         end
      end
   end
   return self.gradInput
end
+
-- These functions do sparse updates / zeros. However, if we accumulated
-- gradients multiple times, we can't depend on the last input to do sparse
-- updates.
function SparseLinear:updateParameters(learningRate)
   if self.lastInput and self.sparseUpdate == ONE_LAST_INPUT then
      -- fast path: only the rows touched by the last input are updated
      local kernel = self.legacyMode
         and self.lastInput.THNN.SparseLinear_legacyUpdateParameters
         or  self.lastInput.THNN.SparseLinear_updateParameters
      kernel(
         self.weight:cdata(),
         self.bias:cdata(),
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.lastInput:cdata(),
         learningRate
      )
   else
      -- dense fallback
      parent.updateParameters(self, learningRate)
   end
end
+
--- Zero accumulated gradients, sparsely when possible, and reset the
-- sparse-update bookkeeping.
function SparseLinear:zeroGradParameters()
   if self.lastInput and self.sparseUpdate == ONE_LAST_INPUT then
      -- only the rows touched by the last input need zeroing
      local kernel = self.legacyMode
         and self.lastInput.THNN.SparseLinear_legacyZeroGradParameters
         or  self.lastInput.THNN.SparseLinear_zeroGradParameters
      kernel(
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.lastInput:cdata()
      )
   else
      parent.zeroGradParameters(self)
   end
   self.sparseUpdate = NO_LAST_INPUT
end
+
--- Release cached state.
-- Fix: the original indexed an undefined global `input`
-- (`input.THNN.SparseLinear_cudaClearState()`), which crashed with
-- "attempt to index a nil value" whenever clearState was called. The CUDA
-- scratch-state clear is now reached through a tensor we actually hold, and
-- only when that backend provides it.
function SparseLinear:clearState()
   if self.lastInput then
      self.lastInput:set()
      local backend = self.lastInput.THNN
      if backend and backend.SparseLinear_cudaClearState then
         backend.SparseLinear_cudaClearState()
      end
   end
   return parent.clearState(self)
end
diff --git a/SpatialAdaptiveMaxPooling.lua b/SpatialAdaptiveMaxPooling.lua
new file mode 100644
index 0000000..74d4cd6
--- /dev/null
+++ b/SpatialAdaptiveMaxPooling.lua
@@ -0,0 +1,41 @@
local SpatialAdaptiveMaxPooling, parent = torch.class('nn.SpatialAdaptiveMaxPooling', 'nn.Module')

--- Max pooling to a FIXED output size W x H; the pooling regions are derived
-- from the input size by the THNN kernel.
function SpatialAdaptiveMaxPooling:__init(W, H)
   parent.__init(self)

   -- target output width and height
   self.W = W
   self.H = H
end
+
--- Forward pass; fills self.indices with the argmax positions used by backward.
function SpatialAdaptiveMaxPooling:updateOutput(input)
   -- lazily allocate the index buffer with the same tensor type as the input
   self.indices = self.indices or input.new()
   input.THNN.SpatialAdaptiveMaxPooling_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.indices:cdata(),
      self.W, self.H
   )
   return self.output
end
+
--- Backward pass: routes each output gradient to the recorded argmax position.
function SpatialAdaptiveMaxPooling:updateGradInput(input, gradOutput)
   input.THNN.SpatialAdaptiveMaxPooling_updateGradInput(
      input:cdata(),
      gradOutput:cdata(),
      self.gradInput:cdata(),
      self.indices:cdata()
   )
   return self.gradInput
end
+
-- for backward compat: old code called :empty(); it is now an alias of clearState
function SpatialAdaptiveMaxPooling:empty()
   self:clearState()
end
+
--- Free the cached argmax indices, then let the parent clear its buffers.
function SpatialAdaptiveMaxPooling:clearState()
   local indices = self.indices
   if indices then indices:set() end
   return parent.clearState(self)
end
diff --git a/SpatialAveragePooling.lua b/SpatialAveragePooling.lua
new file mode 100644
index 0000000..1e76668
--- /dev/null
+++ b/SpatialAveragePooling.lua
@@ -0,0 +1,93 @@
local SpatialAveragePooling, parent = torch.class('nn.SpatialAveragePooling', 'nn.Module')

--- Average pooling over kW x kH windows, stride (dW,dH), zero padding.
function SpatialAveragePooling:__init(kW, kH, dW, dH, padW, padH)
   parent.__init(self)

   self.kW, self.kH = kW, kH
   self.dW, self.dH = dW or 1, dH or 1
   self.padW = padW or 0
   self.padH = padH or 0
   self.ceil_mode = false          -- output size is floor-rounded by default
   self.count_include_pad = true   -- padded cells count toward the divisor
   self.divide = true              -- legacy flag; false turns this into sum pooling
end
+
--- Use ceil rounding for the output size; returns self for chaining.
function SpatialAveragePooling:ceil()
   self.ceil_mode = true
   return self
end

--- Use floor rounding for the output size (the default); returns self.
function SpatialAveragePooling:floor()
   self.ceil_mode = false
   return self
end

--- Count padded zeros in the averaging divisor (the default); returns self.
function SpatialAveragePooling:setCountIncludePad()
   self.count_include_pad = true
   return self
end

--- Exclude padded zeros from the averaging divisor; returns self.
function SpatialAveragePooling:setCountExcludePad()
   self.count_include_pad = false
   return self
end
+
-- Fill in fields that did not exist when older serialized models were saved;
-- ceil_mode == nil identifies such models.
local function backwardCompatible(self)
   if self.ceil_mode == nil then
      self.ceil_mode = false
      self.count_include_pad = true
      self.padH = 0
      self.padW = 0
   end
end
+
--- Forward pass via the THNN average-pooling kernel.
function SpatialAveragePooling:updateOutput(input)
   backwardCompatible(self)
   input.THNN.SpatialAveragePooling_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH,
      self.ceil_mode,
      self.count_include_pad
   )
   -- for backward compatibility with saved models
   -- which are not supposed to have "divide" field
   if not self.divide then
     -- undo the averaging division: behaves as sum pooling
     self.output:mul(self.kW*self.kH)
   end
   return self.output
end
+
--- Backward pass; returns nothing when gradInput has been disabled (nil).
function SpatialAveragePooling:updateGradInput(input, gradOutput)
   if self.gradInput then
      input.THNN.SpatialAveragePooling_updateGradInput(
         input:cdata(),
         gradOutput:cdata(),
         self.gradInput:cdata(),
         self.kW, self.kH,
         self.dW, self.dH,
         self.padW, self.padH,
         self.ceil_mode,
         self.count_include_pad
      )
      -- for backward compatibility: mirror the sum-pooling scaling of forward
      if not self.divide then
         self.gradInput:mul(self.kW*self.kH)
      end
      return self.gradInput
   end
end
+
--- Human-readable description, e.g. "nn.SpatialAveragePooling(2x2, 2,2)".
function SpatialAveragePooling:__tostring__()
   local s = string.format('%s(%dx%d, %d,%d', torch.type(self),
                            self.kW, self.kH, self.dW, self.dH)
   -- Fix: the original guard `(self.padW or self.padH)` was vacuous for
   -- numbers (0 is truthy in Lua) and would concatenate nil if only one pad
   -- field existed on an old checkpoint; require both pads to be present.
   if self.padW and self.padH and (self.padW ~= 0 or self.padH ~= 0) then
      s = s .. ', ' .. self.padW .. ','.. self.padH
   end
   s = s .. ')'
   return s
end
diff --git a/SpatialBatchNormalization.lua b/SpatialBatchNormalization.lua
new file mode 100644
index 0000000..c5004ce
--- /dev/null
+++ b/SpatialBatchNormalization.lua
@@ -0,0 +1,35 @@
+--[[
+   This file implements Batch Normalization as described in the paper:
+   "Batch Normalization: Accelerating Deep Network Training
+                         by Reducing Internal Covariate Shift"
+                by Sergey Ioffe, Christian Szegedy
+
+   This implementation is useful for inputs coming from convolution layers.
+   For non-convolutional layers, see BatchNormalization.lua
+
+   The operation implemented is:
+   y =     ( x - mean(x) )
+        -------------------- * gamma + beta
+        standard-deviation(x)
+   where gamma and beta are learnable parameters.
+
+   The learning of gamma and beta is optional.
+
+   Usage:
+   with    learnable parameters: nn.SpatialBatchNormalization(N [,eps] [,momentum])
+                                 where N = dimensionality of input
+   without learnable parameters: nn.SpatialBatchNormalization(N [,eps] [,momentum], false)
+
+   eps is a small value added to the variance to avoid divide-by-zero.
+       Defaults to 1e-5
+
+   In training time, this layer keeps a running estimate of it's computed mean and std.
+   The running sum is kept with a default momentum of 0.1 (unless over-ridden)
+   In test time, this running mean/std is used to normalize.
+]]--
local BN, parent = torch.class('nn.SpatialBatchNormalization', 'nn.BatchNormalization')

-- serialization format version of this class
BN.__version = 2

-- expected dimension of input (4D, i.e. batch x plane x height x width)
BN.nDim = 4
diff --git a/SpatialClassNLLCriterion.lua b/SpatialClassNLLCriterion.lua
new file mode 100644
index 0000000..8652e88
--- /dev/null
+++ b/SpatialClassNLLCriterion.lua
@@ -0,0 +1,74 @@
+local THNN = require 'nn.THNN'
+local SpatialClassNLLCriterion, parent = torch.class('nn.SpatialClassNLLCriterion', 'nn.Criterion')
+
--- Negative log-likelihood criterion over spatial class-label maps.
-- @param weights     optional 1D tensor of per-class rescaling weights
-- @param sizeAverage average the loss over elements (default true)
function SpatialClassNLLCriterion:__init(weights, sizeAverage)
    parent.__init(self)
    -- explicit nil-check: `false` is a legal value for sizeAverage
    self.sizeAverage = (sizeAverage == nil) and true or sizeAverage
    if weights then
       assert(weights:dim() == 1, "weights input should be 1-D Tensor")
       self.weights = weights
    end

    -- reusable scratch buffers for the THNN kernels
    self.output_tensor = torch.zeros(1)
    self.total_weight_tensor = torch.ones(1)
    self.target = torch.zeros(1):long()
end
+
--- Length: the size of the class-weight vector, or 0 when unweighted.
function SpatialClassNLLCriterion:__len()
   return self.weights and #self.weights or 0
end
+
--- Compute the loss.
-- Coerces targets: scalar numbers go into the reusable 1-element buffer,
-- CUDA targets are used as-is, CPU targets are converted to LongTensor.
-- @return loss, totalWeight (as produced by the THNN kernel)
function SpatialClassNLLCriterion:updateOutput(input, target)
   if type(target) == 'number' then
      if input:type() ~= 'torch.CudaTensor' then
         self.target = self.target:long()
      end
      self.target[1] = target
   elseif target:type() == 'torch.CudaTensor' then
      self.target = target
   else
      self.target = target:long()
   end

   input.THNN.SpatialClassNLLCriterion_updateOutput(
      input:cdata(),
      self.target:cdata(),
      self.output_tensor:cdata(),
      self.sizeAverage,
      THNN.optionalTensor(self.weights),
      self.total_weight_tensor:cdata()
   )
   -- unwrap the scalar result from its 1-element buffer
   self.output = self.output_tensor[1]
   return self.output, self.total_weight_tensor[1]
end
+
--- Compute the gradient w.r.t. input (same target coercion as updateOutput).
function SpatialClassNLLCriterion:updateGradInput(input, target)
   if type(target) == 'number' then
      self.target[1] = target
   elseif target:type() == 'torch.CudaTensor' then
      self.target = target
   else
      self.target = target:long()
   end

   -- the kernel writes only at target positions, so clear the buffer first
   self.gradInput:resizeAs(input):zero()

   input.THNN.SpatialClassNLLCriterion_updateGradInput(
      input:cdata(),
      self.target:cdata(),
      self.gradInput:cdata(),
      self.sizeAverage,
      THNN.optionalTensor(self.weights),
      self.total_weight_tensor:cdata()
   )

   return self.gradInput
end
diff --git a/SpatialContrastiveNormalization.lua b/SpatialContrastiveNormalization.lua
new file mode 100644
index 0000000..0ad251a
--- /dev/null
+++ b/SpatialContrastiveNormalization.lua
@@ -0,0 +1,36 @@
local SpatialContrastiveNormalization, parent = torch.class('nn.SpatialContrastiveNormalization','nn.Module')

--- Local contrastive normalization: subtractive followed by divisive
-- normalization with a shared averaging kernel.
function SpatialContrastiveNormalization:__init(nInputPlane, kernel, threshold, thresval)
   parent.__init(self)

   -- defaults
   self.nInputPlane = nInputPlane or 1
   self.kernel = kernel or torch.Tensor(9,9):fill(1)
   self.threshold = threshold or 1e-4
   self.thresval = thresval or threshold or 1e-4

   -- the averaging kernel must be 1D or 2D with odd sides (so it has a center)
   local kdim = self.kernel:nDimension()
   if kdim ~= 2 and kdim ~= 1 then
      error('<SpatialContrastiveNormalization> averaging kernel must be 2D or 1D')
   end
   local evenRows = (self.kernel:size(1) % 2) == 0
   local evenCols = kdim == 2 and (self.kernel:size(2) % 2) == 0
   if evenRows or evenCols then
      error('<SpatialContrastiveNormalization> averaging kernel must have ODD dimensions')
   end

   -- instantiate sub+div normalization pipeline
   self.normalizer = nn.Sequential()
   self.normalizer:add(nn.SpatialSubtractiveNormalization(self.nInputPlane, self.kernel))
   self.normalizer:add(nn.SpatialDivisiveNormalization(self.nInputPlane, self.kernel,
                                                       self.threshold, self.thresval))
end
+
--- Forward: delegate to the internal sub+div normalization pipeline.
function SpatialContrastiveNormalization:updateOutput(input)
   self.output = self.normalizer:forward(input)
   return self.output
end

--- Backward: delegate to the internal sub+div normalization pipeline.
function SpatialContrastiveNormalization:updateGradInput(input, gradOutput)
   self.gradInput = self.normalizer:backward(input, gradOutput)
   return self.gradInput
end
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
new file mode 100644
index 0000000..8324f95
--- /dev/null
+++ b/SpatialConvolution.lua
@@ -0,0 +1,195 @@
+local THNN = require 'nn.THNN'
+local SpatialConvolution, parent = torch.class('nn.SpatialConvolution', 'nn.Module')
+
--- 2D convolution: nInputPlane -> nOutputPlane with kW x kH kernels,
-- stride (dW,dH) and zero padding (padW,padH).
function SpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
   parent.__init(self)

   self.nInputPlane = nInputPlane
   self.nOutputPlane = nOutputPlane
   self.kW, self.kH = kW, kH
   self.dW, self.dH = dW or 1, dH or 1
   self.padW = padW or 0
   self.padH = padH or self.padW   -- default: symmetric padding

   -- parameters and gradient accumulators
   self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW)
   self.bias = torch.Tensor(nOutputPlane)
   self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW)
   self.gradBias = torch.Tensor(nOutputPlane)

   self:reset()
end
+
--- Remove the bias term and its gradient; returns self for chaining.
function SpatialConvolution:noBias()
   self.bias, self.gradBias = nil, nil
   return self
end
+
--- (Re)initialize parameters uniformly; default stdv is 1/sqrt(fan-in).
-- Skips the bias when it was removed via :noBias().
function SpatialConvolution:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
   end
   local function init(t)
      if not t then return end
      if nn.oldSeed then
         -- legacy RNG path: draw each value individually for reproducibility
         t:apply(function() return torch.uniform(-stdv, stdv) end)
      else
         t:uniform(-stdv, stdv)
      end
   end
   init(self.weight)
   init(self.bias)
end
+
-- Upgrade fields of modules serialized by older versions of nn: allocate
-- missing scratch buffers, split the single legacy 'padding' field into
-- padW/padH, and re-view flattened 2D weights back to 4D.
local function backCompatibility(self)
   self.finput = self.finput or self.weight.new()
   self.fgradInput = self.fgradInput or self.weight.new()
   if self.padding then
      self.padW = self.padding
      self.padH = self.padding
      self.padding = nil
   else
      self.padW = self.padW or 0
      self.padH = self.padH or 0
   end
   if self.weight:dim() == 2 then
      self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
   if self.gradWeight and self.gradWeight:dim() == 2 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
end
+
-- Return contiguous versions of input/gradOutput, copying into buffers
-- cached on self when the originals are non-contiguous.
local function makeContiguous(self, input, gradOutput)
   if not input:isContiguous() then
      self._input = self._input or input.new()
      input = self._input:resizeAs(input):copy(input)
   end
   if gradOutput and not gradOutput:isContiguous() then
      self._gradOutput = self._gradOutput or gradOutput.new()
      gradOutput = self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
   end
   return input, gradOutput
end
+
-- Re-view the 4D weight (and gradWeight) as 2D so the MM kernels can use it.
local function viewWeight(self)
   local flat = self.nInputPlane * self.kH * self.kW
   self.weight = self.weight:view(self.nOutputPlane, flat)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, flat)
   end
end
+
-- Restore the canonical 4D (out x in x kH x kW) view of weight/gradWeight.
local function unviewWeight(self)
   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
end
+
--- Forward pass: flattens the weight to 2D, runs the THNN MM-based
-- convolution, then restores the 4D weight view.
function SpatialConvolution:updateOutput(input)
   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
   backCompatibility(self)
   viewWeight(self)
   input = makeContiguous(self, input)
   input.THNN.SpatialConvolutionMM_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.weight:cdata(),
      THNN.optionalTensor(self.bias),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH
   )
   unviewWeight(self)
   return self.output
end
+
--- Backward pass w.r.t. input; returns nothing when gradInput is disabled.
function SpatialConvolution:updateGradInput(input, gradOutput)
   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
   if self.gradInput then
      backCompatibility(self)
      viewWeight(self)
      input, gradOutput = makeContiguous(self, input, gradOutput)
      input.THNN.SpatialConvolutionMM_updateGradInput(
         input:cdata(),
         gradOutput:cdata(),
         self.gradInput:cdata(),
         self.weight:cdata(),
         self.finput:cdata(),
         self.fgradInput:cdata(),
         self.kW, self.kH,
         self.dW, self.dH,
         self.padW, self.padH
      )
      unviewWeight(self)
      return self.gradInput
   end
end
+
--- Accumulate weight/bias gradients, scaled by `scale` (default 1).
function SpatialConvolution:accGradParameters(input, gradOutput, scale)
   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
   scale = scale or 1
   backCompatibility(self)
   input, gradOutput = makeContiguous(self, input, gradOutput)
   viewWeight(self)
   input.THNN.SpatialConvolutionMM_accGradParameters(
      input:cdata(),
      gradOutput:cdata(),
      self.gradWeight:cdata(),
      THNN.optionalTensor(self.gradBias),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH,
      scale
   )
   unviewWeight(self)
end
+
--- Cast the module to another tensor type. The scratch buffers are replaced
-- with fresh empty Tensors (not converted) because their contents are
-- recomputed on the next forward pass.
function SpatialConvolution:type(type,tensorCache)
   self.finput = self.finput and torch.Tensor()
   self.fgradInput = self.fgradInput and torch.Tensor()
   return parent.type(self,type,tensorCache)
end
+
--- Human-readable description, e.g. "nn.SpatialConvolution(3 -> 16, 5x5)".
function SpatialConvolution:__tostring__()
   local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
         self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
   if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
     s = s .. string.format(', %d,%d', self.dW, self.dH)
   end
   -- Fix: the original guard `(self.padW or self.padH)` was vacuous for
   -- numbers (0 is truthy in Lua) and could concatenate nil if only one pad
   -- field existed; require both pads before printing them.
   if self.padW and self.padH and (self.padW ~= 0 or self.padH ~= 0) then
     s = s .. ', ' .. self.padW .. ',' .. self.padH
   end
   if self.bias then
      return s .. ')'
   else
      return s .. ') without bias'
   end
end
+
--- Drop cached scratch tensors (e.g. before serialization).
function SpatialConvolution:clearState()
   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
   return parent.clearState(self)
end
diff --git a/SpatialConvolutionLocal.lua b/SpatialConvolutionLocal.lua
new file mode 100644
index 0000000..3abc46b
--- /dev/null
+++ b/SpatialConvolutionLocal.lua
@@ -0,0 +1,207 @@
local SpatialConvolutionLocal, parent = torch.class('nn.SpatialConvolutionLocal', 'nn.Module')

--- Locally connected 2D layer: same sliding-window connectivity as
-- SpatialConvolution, but the weight tensor holds an independent filter bank
-- per output location, so the input size (iW, iH) is fixed at construction.
function SpatialConvolutionLocal:__init(nInputPlane, nOutputPlane, iW, iH ,kW, kH, dW, dH, padW, padH)
   parent.__init(self)

   dW = dW or 1
   dH = dH or 1

   self.nInputPlane = nInputPlane
   self.nOutputPlane = nOutputPlane
   self.kW = kW
   self.kH = kH
   self.iW = iW
   self.iH = iH

   self.dW = dW
   self.dH = dH
   self.padW = padW or 0
   self.padH = padH or self.padW
   -- output extent from the usual (floor-mode) convolution arithmetic
   self.oW = math.floor((self.padW * 2 + iW - self.kW) / self.dW) + 1
   self.oH = math.floor((self.padH * 2 + iH - self.kH) / self.dH) + 1
   assert(1 <= self.oW and 1 <= self.oH, 'illegal configuration: output width or height less than 1')

   -- one (nOutputPlane x nInputPlane x kH x kW) filter bank per (oH, oW) cell
   self.weight = torch.Tensor(self.oH, self.oW, nOutputPlane, nInputPlane, kH, kW)
   self.bias = torch.Tensor(nOutputPlane, self.oH, self.oW)
   self.gradWeight = torch.Tensor():resizeAs(self.weight)
   self.gradBias = torch.Tensor():resizeAs(self.bias)

   self:reset()
end
+
--- (Re)initialize parameters uniformly; default stdv is 1/sqrt(fan-in).
function SpatialConvolutionLocal:reset(stdv)
   stdv = stdv and stdv * math.sqrt(3)
               or 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
   if nn.oldSeed then
      -- legacy RNG path: draw each value individually for reproducibility
      local draw = function() return torch.uniform(-stdv, stdv) end
      self.weight:apply(draw)
      self.bias:apply(draw)
   else
      self.weight:uniform(-stdv, stdv)
      self.bias:uniform(-stdv, stdv)
   end
end
+
-- Return contiguous versions of input/gradOutput, copying into buffers
-- cached on self when the originals are non-contiguous.
local function makeContiguous(self, input, gradOutput)
   if not input:isContiguous() then
      self._input = self._input or input.new()
      input = self._input:resizeAs(input):copy(input)
   end
   if gradOutput and not gradOutput:isContiguous() then
      self._gradOutput = self._gradOutput or gradOutput.new()
      gradOutput = self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
   end
   return input, gradOutput
end
+
-- Re-view weight/gradWeight as 3D (locations x out x flattened-kernel) for
-- the THNN kernels.
local function viewWeight(self)
   local nloc = self.oH * self.oW
   local flat = self.nInputPlane * self.kH * self.kW
   self.weight = self.weight:view(nloc, self.nOutputPlane, flat)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(nloc, self.nOutputPlane, flat)
   end
end
+
-- Restore the canonical 6D (oH x oW x out x in x kH x kW) view of the weights.
local function unviewWeight(self)
   self.weight = self.weight:view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
   end
end
+
-- Validate that input is (nInputPlane, iH, iW), optionally with a leading
-- batch dimension; raise a descriptive error otherwise.
local function checkInputSize(self, input)
   local nd = input:nDimension()
   if nd ~= 3 and nd ~= 4 then
      error('3D or 4D(batch mode) tensor expected')
   end
   local off = nd - 3   -- 1 when a batch dimension is present
   local ok = input:size(off + 1) == self.nInputPlane
          and input:size(off + 2) == self.iH
          and input:size(off + 3) == self.iW
   if not ok then
      if nd == 3 then
         error(string.format('Given input size: (%dx%dx%d) inconsistent with expected input size: (%dx%dx%d).',
                             input:size(1), input:size(2), input:size(3), self.nInputPlane, self.iH, self.iW))
      else
         error(string.format('Given input size: (%dx%dx%dx%d) inconsistent with expected input size: (batchsize x%dx%dx%d).',
                              input:size(1), input:size(2), input:size(3), input:size(4), self.nInputPlane, self.iH, self.iW))
      end
   end
end
+
-- Validate that output/gradOutput matches input's rank and the expected
-- (nOutputPlane, oH, oW) extents; raise a descriptive error otherwise.
local function checkOutputSize(self, input, output)
   local nd = output:nDimension()
   if nd ~= input:nDimension() then
      error('inconsistent dimension between output and input.')
   end
   if nd ~= 3 and nd ~= 4 then
      error('3D or 4D(batch mode) tensor expected')
   end
   local off = nd - 3   -- 1 when a batch dimension is present
   local ok = output:size(off + 1) == self.nOutputPlane
          and output:size(off + 2) == self.oH
          and output:size(off + 3) == self.oW
   if not ok then
      if nd == 3 then
         error(string.format('Given output size: (%dx%dx%d) inconsistent with expected output size: (%dx%dx%d).',
                             output:size(1), output:size(2), output:size(3), self.nOutputPlane, self.oH, self.oW))
      else
         error(string.format('Given output size: (%dx%dx%dx%d) inconsistent with expected output size: (batchsize x%dx%dx%d).',
                              output:size(1), output:size(2), output:size(3), output:size(4), self.nOutputPlane, self.oH, self.oW))
      end
   end
end
+
--- Forward pass via the THNN locally-connected kernel (weights viewed 3D).
function SpatialConvolutionLocal:updateOutput(input)
   -- lazily allocate scratch buffers of the input's tensor type
   self.finput = self.finput or input.new()
   self.fgradInput = self.fgradInput or input.new()
   checkInputSize(self, input)
   viewWeight(self)
   input = makeContiguous(self, input)
   input.THNN.SpatialConvolutionLocal_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.weight:cdata(),
      self.bias:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH,
      self.iW, self.iH,
      self.oW, self.oH
   )
   unviewWeight(self)
   return self.output
end
+
--- Backward pass w.r.t. input; returns nothing when gradInput is disabled.
function SpatialConvolutionLocal:updateGradInput(input, gradOutput)
   checkInputSize(self, input)
   checkOutputSize(self, input, gradOutput)
   if self.gradInput then
      viewWeight(self)
      input, gradOutput = makeContiguous(self, input, gradOutput)
      input.THNN.SpatialConvolutionLocal_updateGradInput(
         input:cdata(),
         gradOutput:cdata(),
         self.gradInput:cdata(),
         self.weight:cdata(),
         self.finput:cdata(),
         self.fgradInput:cdata(),
         self.kW, self.kH,
         self.dW, self.dH,
         self.padW, self.padH,
         self.iW, self.iH,
         self.oW, self.oH
      )
      unviewWeight(self)
      return self.gradInput
   end
end
+
--- Accumulate weight/bias gradients, scaled by `scale` (default 1).
function SpatialConvolutionLocal:accGradParameters(input, gradOutput, scale)
   scale = scale or 1
   checkInputSize(self, input)
   checkOutputSize(self, input, gradOutput)
   input, gradOutput = makeContiguous(self, input, gradOutput)
   viewWeight(self)
   input.THNN.SpatialConvolutionLocal_accGradParameters(
      input:cdata(),
      gradOutput:cdata(),
      self.gradWeight:cdata(),
      self.gradBias:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.kW, self.kH,
      self.dW, self.dH,
      self.padW, self.padH,
      self.iW, self.iH,
      self.oW, self.oH,
      scale
   )
   unviewWeight(self)
end
+
--- Cast to another tensor type; scratch buffers are replaced with fresh
-- empty Tensors because their contents are recomputed on the next forward.
function SpatialConvolutionLocal:type(type,tensorCache)
   self.finput = self.finput and torch.Tensor()
   self.fgradInput = self.fgradInput and torch.Tensor()
   return parent.type(self,type,tensorCache)
end
+
--- Human-readable description including input size, kernel, stride, pads.
function SpatialConvolutionLocal:__tostring__()
   local s = string.format('%s(%d -> %d, %dx%d, %dx%d', torch.type(self),
         self.nInputPlane, self.nOutputPlane, self.iW, self.iH, self.kW, self.kH)
   if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
     s = s .. string.format(', %d,%d', self.dW, self.dH)
   end
   -- Fix: the original guard `(self.padW or self.padH)` was vacuous for
   -- numbers (0 is truthy in Lua) and could concatenate nil if only one pad
   -- field existed; require both pads before printing them.
   if self.padW and self.padH and (self.padW ~= 0 or self.padH ~= 0) then
     s = s .. ', ' .. self.padW .. ',' .. self.padH
   end
   return s .. ')'
end
+
--- Drop cached scratch tensors (e.g. before serialization).
function SpatialConvolutionLocal:clearState()
   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
   return parent.clearState(self)
end
diff --git a/SpatialConvolutionMM.lua b/SpatialConvolutionMM.lua
new file mode 100644
index 0000000..f3e5293
--- /dev/null
+++ b/SpatialConvolutionMM.lua
@@ -0,0 +1,158 @@
+local THNN = require 'nn.THNN'
+local SpatialConvolutionMM, parent = torch.class('nn.SpatialConvolutionMM', 'nn.Module')
+
+--- 2D convolution implemented via matrix multiplication (im2col).
+-- Weight is stored flattened: (nOutputPlane) x (nInputPlane*kH*kW).
+-- kW/kH: kernel size; dW/dH: stride (default 1);
+-- padW: zero padding (default 0); padH defaults to padW.
+function SpatialConvolutionMM:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.nInputPlane = nInputPlane
+   self.nOutputPlane = nOutputPlane
+   self.kW = kW
+   self.kH = kH
+
+   self.dW = dW
+   self.dH = dH
+   self.padW = padW or 0
+   self.padH = padH or self.padW
+
+   self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+   self.bias = torch.Tensor(nOutputPlane)
+   self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+   self.gradBias = torch.Tensor(nOutputPlane)
+
+   self:reset()
+end
+
+--- Removes the bias term (and its gradient) from the module.
+-- Returns self so calls can be chained.
+function SpatialConvolutionMM:noBias()
+   self.bias, self.gradBias = nil, nil
+   return self
+end
+
+--- Re-initializes weight and bias from a uniform distribution.
+-- stdv: optional half-width; when given it is scaled by sqrt(3), otherwise
+-- the fan-in based default 1/sqrt(kW*kH*nInputPlane) is used.
+-- Fix: self.bias may be nil after :noBias(), so guard before touching it.
+function SpatialConvolutionMM:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+   end
+   if nn.oldSeed then
+      self.weight:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+      if self.bias then
+         self.bias:apply(function()
+            return torch.uniform(-stdv, stdv)
+         end)
+      end
+   else
+      self.weight:uniform(-stdv, stdv)
+      if self.bias then
+         self.bias:uniform(-stdv, stdv)
+      end
+   end
+end
+
+-- Returns contiguous versions of input (and gradOutput when given), copying
+-- into reusable buffers self._input / self._gradOutput only when needed.
+local function makeContiguous(self, input, gradOutput)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input):copy(input)
+      input = self._input
+   end
+   if gradOutput and not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      gradOutput = self._gradOutput
+   end
+   return input, gradOutput
+end
+
+-- Forward pass: output = weight * im2col(input) (+ bias), via THNN.
+-- Migrates the legacy single 'padding' field to padW/padH on first use.
+function SpatialConvolutionMM:updateOutput(input)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   self.finput = self.finput or input.new()
+   self.fgradInput = self.fgradInput or input.new()
+   -- backward compatibility
+   if self.padding then
+      self.padW = self.padding
+      self.padH = self.padding
+      self.padding = nil
+   end
+   input = makeContiguous(self, input)
+   input.THNN.SpatialConvolutionMM_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      THNN.optionalTensor(self.bias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH
+   )
+   return self.output
+end
+
+-- Backward pass w.r.t. the input. Returns nil when self.gradInput has been
+-- disabled (set to nil/false) by the caller.
+function SpatialConvolutionMM:updateGradInput(input, gradOutput)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   if self.gradInput then
+      input, gradOutput = makeContiguous(self, input, gradOutput)
+      input.THNN.SpatialConvolutionMM_updateGradInput(
+         input:cdata(),
+         gradOutput:cdata(),
+         self.gradInput:cdata(),
+         self.weight:cdata(),
+         self.finput:cdata(),
+         self.fgradInput:cdata(),
+         self.kW, self.kH,
+         self.dW, self.dH,
+         self.padW, self.padH
+      )
+      return self.gradInput
+   end
+end
+
+-- Accumulates weight/bias gradients, scaled by 'scale' (default 1).
+-- The assert enforces that bias and gradBias are removed together (:noBias()).
+function SpatialConvolutionMM:accGradParameters(input, gradOutput, scale)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   scale = scale or 1
+   input, gradOutput = makeContiguous(self, input, gradOutput)
+   assert((self.bias and self.gradBias) or (self.bias == nil and self.gradBias == nil))
+   input.THNN.SpatialConvolutionMM_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      THNN.optionalTensor(self.gradBias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      scale
+   )
+end
+
+-- Converts the module to the given tensor type; work buffers are reset so
+-- they are re-allocated in the target type on the next forward call.
+function SpatialConvolutionMM:type(type,tensorCache)
+   self.finput = self.finput and torch.Tensor()
+   self.fgradInput = self.fgradInput and torch.Tensor()
+   return parent.type(self,type,tensorCache)
+end
+
+-- Pretty-prints geometry; appends " without bias" after :noBias().
+function SpatialConvolutionMM:__tostring__()
+   local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+         self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+   if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+     s = s .. string.format(', %d,%d', self.dW, self.dH)
+   end
+   if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+     s = s .. ', ' .. self.padW .. ',' .. self.padH
+   end
+   if self.bias then
+      return s .. ')'
+   else
+      return s .. ') without bias'
+   end
+end
+
+-- Frees intermediate buffers; the module remains fully usable afterwards.
+function SpatialConvolutionMM:clearState()
+   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+   return parent.clearState(self)
+end
+
diff --git a/SpatialConvolutionMap.lua b/SpatialConvolutionMap.lua
new file mode 100644
index 0000000..9051c11
--- /dev/null
+++ b/SpatialConvolutionMap.lua
@@ -0,0 +1,154 @@
+local SpatialConvolutionMap, parent = torch.class('nn.SpatialConvolutionMap', 'nn.Module')
+
+nn.tables = nn.tables or {}
+
+--- Builds a full connection table: every input plane feeds every output
+-- plane. Row k of the returned (nin*nout)x2 tensor is {inputIdx, outputIdx}.
+function nn.tables.full(nin, nout)
+   local conn = torch.Tensor(nin*nout, 2)
+   local row = 0
+   for out = 1, nout do
+      for inp = 1, nin do
+         row = row + 1
+         conn[row][1] = inp
+         conn[row][2] = out
+      end
+   end
+   return conn
+end
+
+--- Builds a one-to-one connection table: input plane i maps to output
+-- plane i, for i = 1..nfeat.
+function nn.tables.oneToOne(nfeat)
+   local conn = torch.Tensor(nfeat, 2)
+   for i = 1, nfeat do
+      conn[i][1] = i
+      conn[i][2] = i
+   end
+   return conn
+end
+
+--- Builds a random connection table: each of the nout output maps is fed by
+-- nto input maps, taken as successive chunks of a random permutation of the
+-- inputs; the permutation is redrawn whenever its chunks are exhausted.
+function nn.tables.random(nin, nout, nto)
+   local tbl = torch.Tensor(nto * nout, 2)
+   local fromCol = tbl:select(2, 1):unfold(1, nto, nto)
+   local toCol   = tbl:select(2, 2):unfold(1, nto, nto)
+   local perm = torch.randperm(nin)
+   local nchunks = math.floor(nin / nto) -- number of distinct nto-sized chunks
+   -- view over the part of perm covering whole chunks
+   local chunks = perm:narrow(1, 1, nchunks * nto):unfold(1, nto, nto)
+   local chunk = 1
+   for i = 1, nout do -- fill the "from" column, one chunk per output unit
+      fromCol:select(1, i):copy(chunks:select(1, chunk))
+      chunk = chunk + 1
+      if chunk > nchunks then
+         perm:copy(torch.randperm(nin)) -- reshuffle in place and restart
+         chunk = 1
+      end
+   end
+   for i = 1, toCol:size(1) do
+      toCol:select(1, i):fill(i)
+   end
+   return tbl
+end
+
+--- Convolution with an explicit connection table.
+-- conMatrix: Kx2 tensor of {inputPlane, outputPlane} pairs; one kHxkW kernel
+-- is allocated per connection. nInput/nOutputPlane are inferred as the
+-- column maxima of the table.
+function SpatialConvolutionMap:__init(conMatrix, kW, kH, dW, dH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+   self.connTable = conMatrix
+   self.nInputPlane = self.connTable:select(2,1):max()
+   self.nOutputPlane = self.connTable:select(2,2):max()
+   self.weight = torch.Tensor(self.connTable:size(1), kH, kW)
+   self.bias = torch.Tensor(self.nOutputPlane)
+   self.gradWeight = torch.Tensor(self.connTable:size(1), kH, kW)
+   self.gradBias = torch.Tensor(self.nOutputPlane)
+
+   self:reset()
+end
+
+-- Re-initializes parameters. When stdv is given, a single uniform range
+-- (scaled by sqrt(3)) is used for everything; otherwise each kernel and bias
+-- gets a fan-in based range derived from how many connections feed its
+-- output plane.
+function SpatialConvolutionMap:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+      if nn.oldSeed then
+         self.weight:apply(function()
+            return torch.uniform(-stdv, stdv)
+         end)
+         self.bias:apply(function()
+            return torch.uniform(-stdv, stdv)
+         end)
+      else
+         self.weight:uniform(-stdv, stdv)
+         self.bias:uniform(-stdv, stdv)
+      end
+   else
+      -- ninp[j] = number of input connections feeding output plane j (fan-in)
+      local ninp = torch.Tensor(self.nOutputPlane):zero()
+      for i=1,self.connTable:size(1) do ninp[self.connTable[i][2]] =  ninp[self.connTable[i][2]]+1 end
+      for k=1,self.connTable:size(1) do
+         stdv = 1/math.sqrt(self.kW*self.kH*ninp[self.connTable[k][2]])
+         if nn.oldSeed then
+            self.weight:select(1,k):apply(function() return torch.uniform(-stdv,stdv) end)
+         else
+            self.weight:select(1,k):uniform(-stdv,stdv)
+         end
+      end
+      for k=1,self.bias:size(1) do
+         stdv = 1/math.sqrt(self.kW*self.kH*ninp[k])
+         self.bias[k] = torch.uniform(-stdv,stdv)
+      end
+   end
+end
+
+-- Forward pass via the THNN SpatialConvolutionMap kernel.
+function SpatialConvolutionMap:updateOutput(input)
+   input.THNN.SpatialConvolutionMap_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH
+   )
+   return self.output
+end
+
+-- Gradient w.r.t. the input, via THNN.
+function SpatialConvolutionMap:updateGradInput(input, gradOutput)
+   input.THNN.SpatialConvolutionMap_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH
+   )
+   return self.gradInput
+end
+
+-- Accumulates weight/bias gradients, scaled by 'scale' (default 1).
+function SpatialConvolutionMap:accGradParameters(input, gradOutput, scale)
+   input.THNN.SpatialConvolutionMap_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      self.gradBias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH,
+      scale or 1
+   )
+end
+
+-- In-place weight decay: p <- p - decay*p, i.e. p <- (1 - decay) * p.
+function SpatialConvolutionMap:decayParameters(decay)
+   self.weight:add(-decay, self.weight)
+   self.bias:add(-decay, self.bias)
+end
diff --git a/SpatialCrossMapLRN.lua b/SpatialCrossMapLRN.lua
new file mode 100644
index 0000000..9758c79
--- /dev/null
+++ b/SpatialCrossMapLRN.lua
@@ -0,0 +1,153 @@
+local SpatialCrossMapLRN, parent = torch.class('nn.SpatialCrossMapLRN', 'nn.Module')
+
+--- Local Response Normalization across nearby feature maps.
+-- size: number of neighboring channels in the normalization window;
+-- alpha, beta, k: LRN parameters (defaults 0.0001, 0.75, 1).
+function SpatialCrossMapLRN:__init(size, alpha, beta, k)
+  parent.__init(self)
+  self.size  = size
+  self.alpha = alpha or 0.0001
+  self.beta  = beta or 0.75
+  self.k     = k or 1
+end
+
+-- Forward pass. CUDA tensors use the fused THNN kernel; on CPU the
+-- denominator 'scale' is built with a sliding window over channels, then
+-- output = input * scale^(-beta).
+function SpatialCrossMapLRN:updateOutput(input)
+  assert(input:dim() == 3 or input:dim() == 4,
+         'Input must be 3D or 4D')
+
+  self.scale = self.scale or input.new()
+
+  if torch.type(input) == 'torch.CudaTensor' then
+     input.THNN.SpatialCrossMapLRN_updateOutput(
+        input:cdata(),
+        self.output:cdata(),
+        self.scale:cdata(),
+        self.size,
+        self.alpha,
+        self.beta,
+        self.k
+     )
+  else
+     local isBatch = true
+     if input:dim() == 3 then
+       -- promote CHW to 1xCHW so one code path handles both
+       input = nn.utils.addSingletonDimension(input)
+       isBatch = false
+     end
+
+     local batchSize   = input:size(1)
+     local channels    = input:size(2) 
+     local inputHeight = input:size(3) 
+     local inputWidth  = input:size(4) 
+
+     self.output:resizeAs(input)
+     self.scale:resizeAs(input)
+
+     -- use output storage as temporary buffer
+     local inputSquare = self.output
+     inputSquare:pow(input, 2)
+       
+     local prePad = (self.size - 1)/2 + 1
+     local prePadCrop = prePad > channels and channels or prePad
+
+     local scaleFirst = self.scale:select(2,1)
+     scaleFirst:zero()
+     -- compute first feature map normalization
+     for c = 1, prePadCrop do
+       scaleFirst:add(inputSquare:select(2, c))
+     end
+
+     -- reuse computations for next feature maps normalization
+     -- by adding the next feature map and removing the previous
+     for c = 2, channels do
+       local scalePrevious = self.scale:select(2, c -1)
+       local scaleCurrent  = self.scale:select(2, c)
+       scaleCurrent:copy(scalePrevious)
+       if c < channels - prePad + 2 then
+	 local squareNext   = inputSquare:select(2, c + prePad - 1)
+	 scaleCurrent:add(1, squareNext)
+       end
+       if c > prePad  then
+	 local squarePrevious = inputSquare:select(2, c - prePad )
+	 scaleCurrent:add(-1, squarePrevious)
+       end
+     end
+
+     -- scale = k + (alpha/size) * sum of squared neighbors
+     self.scale:mul(self.alpha/self.size):add(self.k)
+
+     self.output:pow(self.scale,-self.beta)
+     self.output:cmul(input)
+
+     if not isBatch then
+       self.output = self.output[1]
+     end
+  end
+
+  return self.output
+end
+
+-- Backward pass. CUDA uses the fused THNN kernel; on CPU the gradient is
+-- gradOutput * scale^(-beta) minus a cross-channel correction accumulated
+-- through the paddedRatio/accumRatio running sums.
+function SpatialCrossMapLRN:updateGradInput(input, gradOutput)
+  assert(input:dim() == 3 or input:dim() == 4,
+         'Input must be 3D or 4D')
+ 
+  if torch.type(input) == 'torch.CudaTensor' then
+     input.THNN.SpatialCrossMapLRN_updateGradInput(
+        input:cdata(),
+        gradOutput:cdata(),
+        self.gradInput:cdata(),
+        self.scale:cdata(),
+        self.output:cdata(),
+        self.size,
+        self.alpha,
+        self.beta,
+        self.k
+     )
+  else
+     local isBatch = true
+     if input:dim() == 3 then
+       input = nn.utils.addSingletonDimension(input)
+       gradOutput = nn.utils.addSingletonDimension(gradOutput)
+       self.output = nn.utils.addSingletonDimension(self.output)
+       isBatch = false
+     end
+
+     local batchSize   = input:size(1)
+     local channels    = input:size(2) 
+     local inputHeight = input:size(3) 
+     local inputWidth  = input:size(4) 
+
+     self.paddedRatio = self.paddedRatio or input.new()
+     self.accumRatio = self.accumRatio or input.new()
+     self.paddedRatio:resize(channels + self.size - 1, inputHeight, inputWidth)
+     self.accumRatio:resize(inputHeight,inputWidth)
+
+     local cacheRatioValue = 2*self.alpha*self.beta/self.size
+     local inversePrePad = self.size - (self.size - 1) / 2
+
+     self.gradInput:resizeAs(input)
+     self.gradInput:pow(self.scale,-self.beta):cmul(gradOutput)
+
+     self.paddedRatio:zero()
+     local paddedRatioCenter = self.paddedRatio:narrow(1, inversePrePad, channels)
+     for n = 1, batchSize do
+       -- ratio = gradOutput * output / scale, padded so the sliding window
+       -- below never indexes out of bounds
+       paddedRatioCenter:cmul(gradOutput[n],self.output[n])
+       paddedRatioCenter:cdiv(self.scale[n])
+       self.accumRatio:sum(self.paddedRatio:narrow(1,1,self.size-1), 1)
+       for c = 1, channels do
+	 self.accumRatio:add(self.paddedRatio[c+self.size-1])
+	 self.gradInput[n][c]:addcmul(-cacheRatioValue, input[n][c], self.accumRatio)
+	 self.accumRatio:add(-1, self.paddedRatio[c])
+       end
+     end
+
+     if not isBatch then
+       self.gradInput = self.gradInput[1]
+       self.output = self.output[1]
+     end
+  end
+
+  return self.gradInput
+end
+
+
+-- Frees the temporary buffers; the module stays usable.
+function SpatialCrossMapLRN:clearState()
+   nn.utils.clear(self, 'scale', 'paddedRatio', 'accumRatio')
+  return parent.clearState(self)
+end
diff --git a/SpatialDilatedConvolution.lua b/SpatialDilatedConvolution.lua
new file mode 100644
index 0000000..8611ee9
--- /dev/null
+++ b/SpatialDilatedConvolution.lua
@@ -0,0 +1,99 @@
+local THNN = require 'nn.THNN'
+local SpatialDilatedConvolution, parent = torch.class('nn.SpatialDilatedConvolution', 'nn.SpatialConvolution')
+
+--- Dilated (atrous) convolution; inherits everything else from
+-- nn.SpatialConvolution and only adds the dilation factors (default 1).
+-- NOTE(review): the trailing ctor args are ordered (dilationH, dilationW) --
+-- height first -- and the THNN calls below forward them in that same order;
+-- confirm against callers before changing.
+function SpatialDilatedConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, dilationH, dilationW)
+   parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+
+   self.dilationH = dilationH or 1
+   self.dilationW = dilationW or 1
+end
+
+-- Returns contiguous versions of input (and gradOutput when given), copying
+-- into reusable buffers self._input / self._gradOutput only when needed.
+local function makeContiguous(self, input, gradOutput)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input):copy(input)
+      input = self._input
+   end
+   if gradOutput and not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      gradOutput = self._gradOutput
+   end
+   return input, gradOutput
+end
+
+-- Forward pass via THNN; note dilation is passed as (dilationH, dilationW).
+function SpatialDilatedConvolution:updateOutput(input)
+   self.finput = self.finput or self.weight.new()
+   self.fgradInput = self.fgradInput or self.weight.new()
+   input = makeContiguous(self, input)
+   input.THNN.SpatialDilatedConvolution_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      THNN.optionalTensor(self.bias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      self.dilationH, self.dilationW
+   )
+   return self.output
+end
+
+-- Gradient w.r.t. the input; returns nil when self.gradInput is disabled.
+function SpatialDilatedConvolution:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      input, gradOutput = makeContiguous(self, input, gradOutput)
+      self.fgradInput = self.fgradInput or self.weight.new()
+      input.THNN.SpatialDilatedConvolution_updateGradInput(
+         input:cdata(),
+         gradOutput:cdata(),
+         self.gradInput:cdata(),
+         self.weight:cdata(),
+         self.finput:cdata(),
+         self.kW, self.kH,
+         self.dW, self.dH,
+         self.padW, self.padH,
+         self.dilationH, self.dilationW
+      )
+      return self.gradInput
+   end
+end
+
+-- Accumulates weight/bias gradients, scaled by 'scale' (default 1).
+function SpatialDilatedConvolution:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   input, gradOutput = makeContiguous(self, input, gradOutput)
+   self.fgradInput = self.fgradInput or self.weight.new()
+   input.THNN.SpatialDilatedConvolution_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      THNN.optionalTensor(self.gradBias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      self.dilationH, self.dilationW,
+      scale
+   )
+end
+
+-- Pretty-prints geometry including the dilation factors (printed W,H);
+-- appends " without bias" after :noBias().
+function SpatialDilatedConvolution:__tostring__()
+   local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+         self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+   if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+     s = s .. string.format(', %d,%d', self.dW, self.dH)
+   end
+   if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+     s = s .. ', ' .. self.padW .. ',' .. self.padH
+   end
+   s = s .. ', ' .. self.dilationW .. ',' .. self.dilationH
+   if self.bias then
+      return s .. ')'
+   else
+      return s .. ') without bias'
+   end
+end
diff --git a/SpatialDivisiveNormalization.lua b/SpatialDivisiveNormalization.lua
new file mode 100644
index 0000000..bdc7bac
--- /dev/null
+++ b/SpatialDivisiveNormalization.lua
@@ -0,0 +1,136 @@
+local SpatialDivisiveNormalization, parent = torch.class('nn.SpatialDivisiveNormalization','nn.Module')
+
+--- Divisive normalization: divides each location by a local std-dev
+-- estimate computed with the given averaging kernel (2D, or 1D applied
+-- separably). threshold/thresval clamp the denominator from below.
+function SpatialDivisiveNormalization:__init(nInputPlane, kernel, threshold, thresval)
+   parent.__init(self)
+
+   -- get args
+   self.nInputPlane = nInputPlane or 1
+   self.kernel = kernel or torch.Tensor(9,9):fill(1)
+   self.threshold = threshold or 1e-4
+   self.thresval = thresval or threshold or 1e-4
+   local kdim = self.kernel:nDimension()
+
+   -- check args
+   if kdim ~= 2 and kdim ~= 1 then
+      error('<SpatialDivisiveNormalization> averaging kernel must be 2D or 1D')
+   end
+   if (self.kernel:size(1) % 2) == 0 or (kdim == 2 and (self.kernel:size(2) % 2) == 0) then
+      error('<SpatialDivisiveNormalization> averaging kernel must have ODD dimensions')
+   end
+
+   -- padding values
+   local padH = math.floor(self.kernel:size(1)/2)
+   local padW = padH
+   if kdim == 2 then
+      padW = math.floor(self.kernel:size(2)/2)
+   end
+
+   -- create convolutional mean estimator
+   self.meanestimator = nn.Sequential()
+   self.meanestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+   if kdim == 2 then
+      self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+   else
+      -- 1D kernel applied separably: horizontal pass then vertical pass
+      self.meanestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+      self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+   end
+   self.meanestimator:add(nn.Replicate(self.nInputPlane,1,3))
+
+   -- create convolutional std estimator
+   self.stdestimator = nn.Sequential()
+   self.stdestimator:add(nn.Square())
+   self.stdestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+   if kdim == 2 then
+      self.stdestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+   else
+      self.stdestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+      self.stdestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+   end
+   self.stdestimator:add(nn.Replicate(self.nInputPlane,1,3))
+   self.stdestimator:add(nn.Sqrt())
+
+   -- set kernel and bias
+   if kdim == 2 then
+      self.kernel:div(self.kernel:sum() * self.nInputPlane)
+      for i = 1,self.nInputPlane do 
+         self.meanestimator.modules[2].weight[1][i] = self.kernel
+         self.stdestimator.modules[3].weight[1][i] = self.kernel
+      end
+      self.meanestimator.modules[2].bias:zero()
+      self.stdestimator.modules[3].bias:zero()
+   else
+      self.kernel:div(self.kernel:sum() * math.sqrt(self.nInputPlane))
+      for i = 1,self.nInputPlane do 
+         self.meanestimator.modules[2].weight[i]:copy(self.kernel)
+         self.meanestimator.modules[3].weight[1][i]:copy(self.kernel)
+         self.stdestimator.modules[3].weight[i]:copy(self.kernel)
+         self.stdestimator.modules[4].weight[1][i]:copy(self.kernel)
+      end
+      self.meanestimator.modules[2].bias:zero()
+      self.meanestimator.modules[3].bias:zero()
+      self.stdestimator.modules[3].bias:zero()
+      self.stdestimator.modules[4].bias:zero()
+   end
+
+   -- other operation
+   self.normalizer = nn.CDivTable()
+   self.divider = nn.CDivTable()
+   self.thresholder = nn.Threshold(self.threshold, self.thresval)
+
+   -- coefficient array, to adjust side effects
+   self.coef = torch.Tensor(1,1,1)
+end
+
+-- Forward pass: output = input / threshold(localstd / coef), where coef
+-- compensates for the border attenuation of the averaging kernel and is
+-- recomputed only when the spatial size changes.
+function SpatialDivisiveNormalization:updateOutput(input)
+   
+   self.localstds = self.stdestimator:updateOutput(input)
+
+   -- compute side coefficients
+   local dim = input:dim()
+   if self.localstds:dim() ~= self.coef:dim() or (input:size(dim) ~= self.coef:size(dim)) or (input:size(dim-1) ~= self.coef:size(dim-1)) then
+      self.ones = self.ones or input.new()
+      if dim == 4 then
+         -- batch mode
+         self.ones:resizeAs(input[1]):fill(1)
+         local coef = self.meanestimator:updateOutput(self.ones)
+         self._coef = self._coef or input.new()
+         self._coef:resizeAs(coef):copy(coef) -- make contiguous for view
+         self.coef = self._coef:view(1,table.unpack(self._coef:size():totable())):expandAs(self.localstds)
+      else
+         self.ones:resizeAs(input):fill(1)
+         self.coef = self.meanestimator:updateOutput(self.ones)
+      end
+      
+   end
+
+   -- normalize std dev
+   self.adjustedstds = self.divider:updateOutput{self.localstds, self.coef}
+   self.thresholdedstds = self.thresholder:updateOutput(self.adjustedstds)
+   self.output = self.normalizer:updateOutput{input, self.thresholdedstds}
+
+   -- done
+   return self.output
+end
+
+-- Backward pass: chains gradients back through normalizer -> thresholder ->
+-- divider -> stdestimator, summing the direct and std-path contributions.
+function SpatialDivisiveNormalization:updateGradInput(input, gradOutput)
+   -- resize grad
+   self.gradInput:resizeAs(input):zero()
+
+   -- backprop through all modules
+   local gradnorm = self.normalizer:updateGradInput({input, self.thresholdedstds}, gradOutput)
+   local gradadj = self.thresholder:updateGradInput(self.adjustedstds, gradnorm[2])
+   local graddiv = self.divider:updateGradInput({self.localstds, self.coef}, gradadj)
+   self.gradInput:add(self.stdestimator:updateGradInput(input, graddiv[1]))
+   self.gradInput:add(gradnorm[1])
+
+   -- done
+   return self.gradInput
+end
+
+-- Releases cached buffers and clears the sub-estimators.
+function SpatialDivisiveNormalization:clearState()
+   if self.ones then self.ones:set() end
+   if self._coef then self._coef:set() end
+   self.meanestimator:clearState()
+   self.stdestimator:clearState()
+   return parent.clearState(self)
+end
diff --git a/SpatialDropout.lua b/SpatialDropout.lua
new file mode 100644
index 0000000..35daa18
--- /dev/null
+++ b/SpatialDropout.lua
@@ -0,0 +1,54 @@
+local SpatialDropout, Parent = torch.class('nn.SpatialDropout', 'nn.Module')
+
+--- Spatial dropout: zeroes whole feature maps instead of single units.
+-- p: drop probability during training (default 0.5).
+function SpatialDropout:__init(p)
+   Parent.__init(self)
+   self.p = p or 0.5
+   self.train = true
+   -- per-feature-map Bernoulli mask, resized lazily in updateOutput
+   self.noise = torch.Tensor()
+end
+
+-- Training: multiplies each feature map by an independent Bernoulli(1-p)
+-- draw. Evaluation: scales by (1-p) to keep expectations matched.
+function SpatialDropout:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if self.train then
+      if input:dim() == 4 then
+        self.noise:resize(input:size(1), input:size(2), 1, 1)
+      elseif input:dim() == 3 then
+        self.noise:resize(input:size(1), 1, 1)
+      else
+        error('Input must be 4D (nbatch, nfeat, h, w) or 3D (nfeat, h, w)')
+      end
+      self.noise:bernoulli(1-self.p)
+      -- We expand the random dropouts to the entire feature map because the
+      -- features are likely correlated accross the map and so the dropout
+      -- should also be correlated.
+      self.output:cmul(torch.expandAs(self.noise, input))
+   else
+      self.output:mul(1-self.p)
+   end
+   return self.output
+end
+
+-- Backward pass: re-applies the noise mask drawn in updateOutput.
+-- Only valid in training mode (eval-mode backprop is deliberately an error).
+function SpatialDropout:updateGradInput(input, gradOutput)
+   if self.train then
+      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+      self.gradInput:cmul(torch.expandAs(self.noise, input)) -- simply mask the gradients with the noise vector
+   else
+      error('backprop only defined while training')
+   end
+   return self.gradInput
+end
+
+--- Updates the drop probability used by subsequent forward calls.
+function SpatialDropout:setp(p)
+   self.p = p
+end
+
+--- Returns e.g. "nn.SpatialDropout(0.500000)".
+function SpatialDropout:__tostring__()
+  return ('%s(%f)'):format(torch.type(self), self.p)
+end
+
+--- Releases the noise buffer's storage; the module stays usable.
+function SpatialDropout:clearState()
+  if self.noise then
+    self.noise:set()
+  end
+  return Parent.clearState(self)
+end
diff --git a/SpatialFractionalMaxPooling.lua b/SpatialFractionalMaxPooling.lua
new file mode 100644
index 0000000..f5d8076
--- /dev/null
+++ b/SpatialFractionalMaxPooling.lua
@@ -0,0 +1,160 @@
+local SpatialFractionalMaxPooling, parent =
+   torch.class('nn.SpatialFractionalMaxPooling', 'nn.Module')
+
+-- Usage:
+-- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+--   the output should be the exact size (outH x outW)
+-- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
+--   the output should be the size (floor(inH x ratioH) x floor(inW x ratioW))
+--   ratios are numbers between (0, 1) exclusive
+-- arg1/arg2 are either an exact output size (outW, outH; both >= 1) or
+-- reduction ratios (ratioW, ratioH; both strictly in (0, 1)).
+function SpatialFractionalMaxPooling:__init(poolSizeW, poolSizeH, arg1, arg2)
+   parent.__init(self)
+   assert(poolSizeW >= 2)
+   assert(poolSizeH >= 2)
+
+   -- Pool size (how wide the pooling for each output unit is)
+   self.poolSizeW = poolSizeW
+   self.poolSizeH = poolSizeH
+
+   -- Random samples are drawn for all
+   -- batch * plane * (height, width; i.e., 2) points. This determines
+   -- the 2d "pseudorandom" overlapping pooling regions for each
+   -- (batch element x input plane). A new set of random samples is
+   -- drawn every updateOutput call, unless we disable it via
+   -- :fixPoolingRegions().
+   self.randomSamples = nil
+
+   -- Flag to disable re-generation of random samples for producing
+   -- a new pooling. For testing purposes
+   self.newRandomPool = false
+
+   if arg1 >= 1 and arg2 >= 1 then
+      -- Desired output size: the input tensor will determine the reduction
+      -- ratio
+      self.outW = arg1
+      self.outH = arg2
+   else
+      -- Reduction ratio specified per each input
+      -- This is the reduction ratio that we use
+      self.ratioW = arg1
+      self.ratioH = arg2
+
+      -- The reduction ratio must be between 0 and 1
+      assert(self.ratioW > 0 and self.ratioW < 1)
+      assert(self.ratioH > 0 and self.ratioH < 1)
+   end
+end
+
+--- Size of the random-sample buffer: {batchSize, planeSize, 2}.
+-- Accepts 3D (CHW, treated as batch of 1) or 4D (NCHW) input.
+function SpatialFractionalMaxPooling:getBufferSize_(input)
+   local dims = input:nDimension()
+   if dims == 3 then
+      return torch.LongStorage({1, input:size(1), 2})
+   elseif dims == 4 then
+      return torch.LongStorage({input:size(1), input:size(2), 2})
+   else
+      error('input must be dim 3 or 4')
+   end
+end
+
+-- (Re)creates the uniform random-sample buffer sized for 'input'. Unless
+-- regions were fixed via :fixPoolingRegions(), fresh samples are drawn on
+-- every call.
+function SpatialFractionalMaxPooling:initSampleBuffer_(input)
+   local sampleBufferSize = self:getBufferSize_(input)
+
+   if self.randomSamples == nil then
+      self.randomSamples = input.new():resize(sampleBufferSize):uniform()
+   elseif (self.randomSamples:size(1) ~= sampleBufferSize[1] or
+           self.randomSamples:size(2) ~= sampleBufferSize[2]) then
+      self.randomSamples:resize(sampleBufferSize):uniform()
+   else
+      if not self.newRandomPool then
+         -- Create new pooling windows, since this is a subsequent call
+         self.randomSamples:uniform()
+      end
+   end
+end
+
+-- Resolves the output size: either the fixed (outW, outH), or
+-- floor(inputSize * ratio) when ratios were given at construction.
+function SpatialFractionalMaxPooling:getOutputSizes_(input)
+   local outW = self.outW
+   local outH = self.outH
+   if self.ratioW ~= nil and self.ratioH ~= nil then
+      if input:nDimension() == 4 then
+         outW = math.floor(input:size(4) * self.ratioW)
+         outH = math.floor(input:size(3) * self.ratioH)
+      elseif input:nDimension() == 3 then
+         outW = math.floor(input:size(3) * self.ratioW)
+         outH = math.floor(input:size(2) * self.ratioH)
+      else
+         error('input must be dim 3 or 4')
+      end
+
+      -- Neither can be smaller than 1
+      assert(outW > 0, 'reduction ratio or input width too small')
+      assert(outH > 0, 'reduction ratio or input height too small')
+   else
+      assert(outW ~= nil and outH ~= nil)
+   end
+
+   return outW, outH
+end
+
+-- Call this to turn off regeneration of random pooling regions each
+-- updateOutput call.
+--- Freezes (val=true or nil) or unfreezes (val=false) the random pooling
+-- regions across updateOutput calls. Returns self for chaining.
+function SpatialFractionalMaxPooling:fixPoolingRegions(val)
+   if val == nil then val = true end
+   self.newRandomPool = val
+   return self
+end
+
+-- Forward pass: draws/reuses random samples, then delegates to THNN.
+function SpatialFractionalMaxPooling:updateOutput(input)
+   self.indices = self.indices or input.new()
+   self:initSampleBuffer_(input)
+   local outW, outH = self:getOutputSizes_(input)
+
+   input.THNN.SpatialFractionalMaxPooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      outW, outH, self.poolSizeW, self.poolSizeH,
+      self.indices:cdata(), self.randomSamples:cdata())
+   return self.output
+end
+
+-- Backward pass: routes gradients through the max indices recorded in
+-- updateOutput (hence the forward-first assertion).
+function SpatialFractionalMaxPooling:updateGradInput(input, gradOutput)
+   assert(self.randomSamples ~= nil,
+          'must call updateOutput/forward first')
+
+   local outW, outH = self:getOutputSizes_(input)
+
+   input.THNN.SpatialFractionalMaxPooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      outW, outH, self.poolSizeW, self.poolSizeH,
+      self.indices:cdata())
+   return self.gradInput
+end
+
+-- backward compat
+function SpatialFractionalMaxPooling:empty()
+   self:clearState()
+end
+
+-- Drops the index and random-sample buffers; fresh ones are created on the
+-- next forward call.
+function SpatialFractionalMaxPooling:clearState()
+   self.indices = nil
+   self.randomSamples = nil
+   return parent.clearState(self)
+end
+
+-- Prints either the fixed output size or the reduction ratios, whichever
+-- mode was chosen at construction.
+function SpatialFractionalMaxPooling:__tostring__()
+   return string.format('%s(%dx%d, %d,%d)', torch.type(self),
+                        self.outW and self.outW or self.ratioW,
+                        self.outH and self.outH or self.ratioH,
+                        self.poolSizeW, self.poolSizeH)
+end
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
new file mode 100644
index 0000000..40fcd3d
--- /dev/null
+++ b/SpatialFullConvolution.lua
@@ -0,0 +1,225 @@
+local SpatialFullConvolution, parent = torch.class('nn.SpatialFullConvolution','nn.Module')
+
+-- 2D "full" (transposed) convolution.
+-- nInputPlane/nOutputPlane: channel counts; kW,kH: kernel size;
+-- dW,dH: stride (default 1); padW,padH: zero padding (default 0);
+-- adjW,adjH: extra size added to the output (default 0), each of which
+-- must be strictly smaller than the corresponding stride.
+function SpatialFullConvolution:__init(nInputPlane, nOutputPlane,
+                                       kW, kH, dW, dH, padW, padH, adjW, adjH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.nInputPlane = nInputPlane
+   self.nOutputPlane = nOutputPlane
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+   self.padW = padW or 0
+   self.padH = padH or 0
+   self.adjW = adjW or 0
+   self.adjH = adjH or 0
+
+   if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 then
+      -- The guard enforces adj <= stride - 1, i.e. adj < stride; the old
+      -- message wrongly said "smaller than self.dW - 1".
+      error('adjW and adjH must be smaller than self.dW' ..
+            ' and self.dH respectively')
+   end
+
+   -- Weight layout is (nInputPlane, nOutputPlane, kH, kW) -- the transpose
+   -- of nn.SpatialConvolution's layout.
+   self.weight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW)
+   self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW)
+   self.bias = torch.Tensor(self.nOutputPlane)
+   self.gradBias = torch.Tensor(self.nOutputPlane)
+
+   self.ones = torch.Tensor()
+
+   self:reset()
+end
+
+-- Reinitialize weight and bias uniformly in [-stdv, stdv]. An explicit
+-- stdv is scaled by sqrt(3); otherwise the fan-in heuristic
+-- 1/sqrt(kW*kH*nInputPlane) is used.
+function SpatialFullConvolution:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+   end
+   self.weight:uniform(-stdv, stdv)
+   self.bias:uniform(-stdv, stdv)
+end
+
+-- Return contiguous versions of input (and gradOutput, when given), copying
+-- into reusable scratch buffers (_input/_gradOutput) only when necessary;
+-- the THNN kernels below require contiguous storage.
+local function makeContiguous(self, input, gradOutput)
+  if not input:isContiguous() then
+    self._input = self._input or input.new()
+    self._input:resizeAs(input):copy(input)
+    input = self._input
+  end
+  if gradOutput then
+    if not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      gradOutput = self._gradOutput
+    end
+  end
+  return input, gradOutput
+end
+
+-- Extra output adjustment needed so a transposed convolution with this
+-- kernel, padding and stride can exactly reach targetSize.
+local function calculateAdj(targetSize, ker, pad, stride)
+   return (targetSize + 2 * pad - ker) % stride
+end
+
+-- Fill in fields that may be absent on modules serialized before the
+-- adjW/adjH parameters were introduced.
+function SpatialFullConvolution:backCompatibility()
+  self.adjW = self.adjW or 0
+  self.adjH = self.adjH or 0
+end
+
+-- Forward pass. input is either a tensor, or a table {input, target} where
+-- the target tensor's last two dimensions define the desired output height
+-- and width; in the table case adjW/adjH are derived automatically via
+-- calculateAdj instead of using the constructor values.
+function SpatialFullConvolution:updateOutput(input)
+  self:backCompatibility()
+
+  local inputTensor = input
+  local adjW, adjH = self.adjW, self.adjH
+
+  -- The input can be a table where the second element indicates the target
+  -- output size, in which case the adj factors are computed automatically
+  if type(inputTensor) == 'table' then
+    inputTensor = input[1]
+    local targetTensor = input[2]
+    local tDims = targetTensor:dim()
+    local tH = targetTensor:size(tDims-1)
+    local tW = targetTensor:size(tDims)
+    adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+    adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+    self.finput = self.finput or input[1].new()
+    self.fgradInput = self.fgradInput or input[1].new()
+  else
+    self.finput = self.finput or input.new()
+    self.fgradInput = self.fgradInput or input.new()
+  end
+
+  -- THNN requires contiguous input
+  inputTensor = makeContiguous(self, inputTensor)
+  inputTensor.THNN.SpatialFullConvolution_updateOutput(
+    inputTensor:cdata(),
+    self.output:cdata(),
+    self.weight:cdata(),
+    self.bias:cdata(),
+    self.finput:cdata(),
+    self.fgradInput:cdata(),
+    self.kW, self.kH,
+    self.dW, self.dH,
+    self.padW, self.padH,
+    adjW, adjH
+  )
+
+  return self.output
+end
+
+-- Backward pass w.r.t. the input. Mirrors updateOutput's handling of the
+-- {input, target} table form; in that case the returned gradInput is a
+-- table {gradWrtInput, zeroTensor} since no gradient flows to the target.
+function SpatialFullConvolution:updateGradInput(input, gradOutput)
+  self:backCompatibility()
+
+  if self.gradInput then
+
+    local inputTensor = input
+    local adjW, adjH = self.adjW, self.adjH
+
+    -- The input can be a table where the second element indicates the target
+    -- output size, in which case the adj factors are computed automatically
+    if type(inputTensor) == 'table' then
+      inputTensor = input[1]
+      local targetTensor = input[2]
+      local tDims = targetTensor:dim()
+      local tH = targetTensor:size(tDims-1)
+      local tW = targetTensor:size(tDims)
+      adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+      adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+      -- Momentarily extract the gradInput tensor
+      if type(self.gradInput) == 'table' then
+        self.gradInput = self.gradInput[1]
+      end
+    end
+
+    inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
+    inputTensor.THNN.SpatialFullConvolution_updateGradInput(
+      inputTensor:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.weight:cdata(),
+      self.finput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      adjW, adjH
+    )
+
+    if type(input) == 'table' then
+     -- Create a zero tensor to be expanded and used as gradInput[2].
+      self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
+      self.ones:resize(input[2]:dim()):fill(1)
+      -- view as all-ones shape then expand: a zero tensor without allocation
+      local zeroTensor =  self.zeroScalar
+          :view(table.unpack(self.ones:totable()))
+          :expandAs(input[2])
+      self.gradInput = {self.gradInput, zeroTensor}
+    end
+
+    return self.gradInput
+  end
+end
+
+-- Accumulate parameter gradients (gradWeight/gradBias) scaled by `scale`
+-- (default 1). Handles the {input, target} table form the same way as
+-- updateOutput.
+function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
+  scale = scale or 1
+  self:backCompatibility()
+
+  local inputTensor = input
+  local adjW, adjH = self.adjW, self.adjH
+
+  -- The input can be a table where the second element indicates the target
+  -- output size, in which case the adj factors are computed automatically
+  if type(inputTensor) == 'table' then
+    inputTensor = input[1]
+    local targetTensor = input[2]
+    local tDims = targetTensor:dim()
+    local tH = targetTensor:size(tDims-1)
+    local tW = targetTensor:size(tDims)
+    adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+    adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+  end
+
+  inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
+  inputTensor.THNN.SpatialFullConvolution_accGradParameters(
+    inputTensor:cdata(),
+    gradOutput:cdata(),
+    self.gradWeight:cdata(),
+    self.gradBias:cdata(),
+    self.finput:cdata(),
+    self.fgradInput:cdata(),
+    self.kW, self.kH,
+    self.dW, self.dH,
+    self.padW, self.padH,
+    adjW, adjH,
+    scale
+  )
+end
+
+-- Type conversion: replace the finput/fgradInput scratch buffers with fresh
+-- empty tensors first (when they exist) so stale workspace data is not
+-- converted along with the parameters.
+function SpatialFullConvolution:type(type, tensorCache)
+  self.finput = self.finput and torch.Tensor()
+  self.fgradInput = self.fgradInput and torch.Tensor()
+  return parent.type(self, type, tensorCache)
+end
+
+-- Pretty-printer: always shows planes and kernel; appends stride, padding
+-- and adjustment only when they differ from their defaults.
+-- NOTE(review): padW/padH/adjW/adjH are numbers here, so the truthiness
+-- checks '(self.padW or self.padH)' are always true; the ~= 0 comparisons
+-- alone decide what is printed.
+function SpatialFullConvolution:__tostring__()
+  local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+  self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+  if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+    s = s .. string.format(', %d,%d', self.dW, self.dH)
+  end
+  if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+    s = s .. ', ' .. self.padW .. ',' .. self.padH
+  end
+  if (self.adjW or self.adjH) and (self.adjW ~= 0 or self.adjH ~= 0) then
+    s = s .. ', ' .. self.adjW .. ',' .. self.adjH
+  end
+  return s .. ')'
+end
+
+-- Drop workspace and contiguity scratch buffers before the parent clears
+-- the remaining state.
+function SpatialFullConvolution:clearState()
+   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+   return parent.clearState(self)
+end
+
diff --git a/SpatialFullConvolutionMap.lua b/SpatialFullConvolutionMap.lua
new file mode 100644
index 0000000..5dfc068
--- /dev/null
+++ b/SpatialFullConvolutionMap.lua
@@ -0,0 +1,91 @@
+local SpatialFullConvolutionMap, parent = torch.class('nn.SpatialFullConvolutionMap', 'nn.Module')
+
+-- Full convolution with an explicit connection table: conMatrix is an Nx2
+-- tensor whose rows are (inputPlane, outputPlane) pairs, one per kernel.
+-- kW,kH: kernel size; dW,dH: stride (default 1). Plane counts are inferred
+-- from the column maxima of the connection table.
+function SpatialFullConvolutionMap:__init(conMatrix, kW, kH, dW, dH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+   self.connTable = conMatrix
+   self.nInputPlane = self.connTable:select(2,1):max()
+   self.nOutputPlane = self.connTable:select(2,2):max()
+
+   -- one kH x kW kernel per row of the connection table
+   self.weight = torch.Tensor(self.connTable:size(1), kH, kW)
+   self.gradWeight = torch.Tensor(self.connTable:size(1), kH, kW)
+
+   self.bias = torch.Tensor(self.nOutputPlane)
+   self.gradBias = torch.Tensor(self.nOutputPlane)
+   
+   self:reset()
+end
+
+-- Reinitialize parameters. With an explicit stdv every weight and bias is
+-- drawn from [-stdv*sqrt(3), stdv*sqrt(3)]; otherwise each kernel/bias uses
+-- a fan-in stdv based on how many inputs feed its output plane (ninp).
+function SpatialFullConvolutionMap:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+      self.weight:apply(function()
+			   return torch.uniform(-stdv, stdv)
+			end)
+      self.bias:apply(function()
+			 return torch.uniform(-stdv, stdv)
+		      end)
+   else
+      -- count incoming connections per output plane
+      local ninp = torch.Tensor(self.nOutputPlane):zero()
+      for i=1,self.connTable:size(1) do ninp[self.connTable[i][2]] =  ninp[self.connTable[i][2]]+1 end
+      for k=1,self.connTable:size(1) do
+         stdv = 1/math.sqrt(self.kW*self.kH*ninp[self.connTable[k][2]])
+         self.weight:select(1,k):apply(function() return torch.uniform(-stdv,stdv) end)
+      end
+      for k=1,self.bias:size(1) do
+         stdv = 1/math.sqrt(self.kW*self.kH*ninp[k])
+         self.bias[k] = torch.uniform(-stdv,stdv)
+      end
+
+   end
+end
+
+-- Forward pass: thin wrapper around the THNN kernel, passing the connection
+-- table so only the listed plane pairs are convolved.
+function SpatialFullConvolutionMap:updateOutput(input)
+   input.THNN.SpatialFullConvolutionMap_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH
+   )
+   return self.output
+end
+
+-- Backward pass w.r.t. the input, restricted to the connected plane pairs.
+function SpatialFullConvolutionMap:updateGradInput(input, gradOutput)
+   input.THNN.SpatialFullConvolutionMap_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH
+   )
+   return self.gradInput
+end
+
+-- Accumulate gradWeight/gradBias for the connected plane pairs, scaled by
+-- `scale` (default 1).
+function SpatialFullConvolutionMap:accGradParameters(input, gradOutput, scale)
+   input.THNN.SpatialFullConvolutionMap_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      self.gradBias:cdata(),
+      self.connTable:cdata(),
+      self.nInputPlane,
+      self.nOutputPlane,
+      self.dW, self.dH,
+      scale or 1
+   )
+end
diff --git a/SpatialLPPooling.lua b/SpatialLPPooling.lua
new file mode 100644
index 0000000..fc56296
--- /dev/null
+++ b/SpatialLPPooling.lua
@@ -0,0 +1,43 @@
+local SpatialLPPooling, parent = torch.class('nn.SpatialLPPooling', 'nn.Sequential')
+
+-- Lp-norm pooling built as a Sequential pipeline:
+--   x^p -> average pool -> * (kW*kH)  (turns the average into a sum)
+--   -> ^(1/p)
+-- pnorm == 2 uses the cheaper Square/Sqrt modules instead of Power.
+function SpatialLPPooling:__init(nInputPlane, pnorm, kW, kH, dW, dH)
+   parent.__init(self)
+
+   dW = dW or kW
+   dH = dH or kH
+   
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+
+   if pnorm == 2 then
+      self:add(nn.Square())
+   else
+      self:add(nn.Power(pnorm))
+   end
+   self:add(nn.SpatialAveragePooling(kW, kH, dW, dH))
+   self:add(nn.MulConstant(kW*kH))
+   if pnorm == 2 then
+      self:add(nn.Sqrt())
+   else
+      self:add(nn.Power(1/pnorm))
+   end
+end
+
+-- the module is a Sequential: by default, it'll try to learn the parameters
+-- of the sub sampler: we avoid that by redefining its methods.
+-- All five overrides below are intentionally empty no-ops.
+function SpatialLPPooling:reset()
+end
+
+function SpatialLPPooling:accGradParameters()
+end
+
+function SpatialLPPooling:accUpdateGradParameters()
+end
+
+function SpatialLPPooling:zeroGradParameters()
+end
+
+function SpatialLPPooling:updateParameters()
+end
diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua
new file mode 100644
index 0000000..8475b13
--- /dev/null
+++ b/SpatialMaxPooling.lua
@@ -0,0 +1,89 @@
+local SpatialMaxPooling, parent = torch.class('nn.SpatialMaxPooling', 'nn.Module')
+
+-- 2D max pooling. kW,kH: window size; dW,dH: stride (defaults to the window
+-- size, i.e. non-overlapping); padW,padH: zero padding (default 0).
+-- Output size rounding defaults to floor; see :ceil()/:floor().
+function SpatialMaxPooling:__init(kW, kH, dW, dH, padW, padH)
+   parent.__init(self)
+
+   dW = dW or kW
+   dH = dH or kH
+
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+
+   self.padW = padW or 0
+   self.padH = padH or 0
+
+   self.ceil_mode = false
+   self.indices = torch.Tensor()
+end
+
+-- Select ceil rounding for the output size; returns self for chaining.
+function SpatialMaxPooling:ceil()
+  self.ceil_mode = true
+  return self
+end
+
+-- Select floor rounding (the default); returns self for chaining.
+function SpatialMaxPooling:floor()
+  self.ceil_mode = false
+  return self
+end
+
+-- Forward pass: records the input height/width (used by
+-- nn.SpatialMaxUnpooling), fills in defaults for fields missing on modules
+-- serialized by older versions, then dispatches to the THNN kernel, which
+-- also fills self.indices with the argmax locations.
+function SpatialMaxPooling:updateOutput(input)
+   self.indices = self.indices or input.new()
+
+   local dims = input:dim()
+   self.iheight = input:size(dims-1)
+   self.iwidth = input:size(dims)
+
+   -- backward compatibility
+   self.ceil_mode = self.ceil_mode or false
+   self.padW = self.padW or 0
+   self.padH = self.padH or 0
+   input.THNN.SpatialMaxPooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.indices:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      self.ceil_mode
+   )
+   return self.output
+end
+
+-- Backward pass: scatters gradOutput back to the argmax positions stored in
+-- self.indices during the last forward call.
+function SpatialMaxPooling:updateGradInput(input, gradOutput)
+   input.THNN.SpatialMaxPooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.indices:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      self.ceil_mode
+   )
+   return self.gradInput
+end
+
+-- for backward compat
+-- Deprecated alias kept for old callers; forwards to clearState().
+function SpatialMaxPooling:empty()
+   self:clearState()
+end
+
+-- Pretty-printer: window and stride always shown; padding appended only
+-- when nonzero (the truthiness check guards nil pads on old modules).
+function SpatialMaxPooling:__tostring__()
+   local s =  string.format('%s(%dx%d, %d,%d', torch.type(self),
+                            self.kW, self.kH, self.dW, self.dH)
+   if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+      s = s .. ', ' .. self.padW .. ','.. self.padH
+   end
+   s = s .. ')'
+
+   return s
+end
+
+-- Release the indices storage (keeps the tensor object, drops its data)
+-- before the parent clears the remaining state.
+function SpatialMaxPooling:clearState()
+   if self.indices then
+      self.indices:set()
+   end
+   return parent.clearState(self)
+end
diff --git a/SpatialMaxUnpooling.lua b/SpatialMaxUnpooling.lua
new file mode 100644
index 0000000..408bcc0
--- /dev/null
+++ b/SpatialMaxUnpooling.lua
@@ -0,0 +1,45 @@
+local SpatialMaxUnpooling, parent = torch.class('nn.SpatialMaxUnpooling', 'nn.Module')
+
+-- Inverse of a paired nn.SpatialMaxPooling: places each pooled value back at
+-- the argmax location the pooling module recorded. The pooling module must
+-- be non-overlapping (kernel == stride), which is asserted here.
+function SpatialMaxUnpooling:__init(poolingModule)
+   parent.__init(self)
+   assert(torch.type(poolingModule)=='nn.SpatialMaxPooling', 'Argument must be a nn.SpatialMaxPooling module')
+   assert(poolingModule.kH==poolingModule.dH and poolingModule.kW==poolingModule.dW, "The size of pooling module's kernel must be equal to its stride")
+   self.pooling = poolingModule
+end
+
+-- Pull the current indices and pre-pooling spatial size from the paired
+-- pooling module (they change on every pooling forward).
+function SpatialMaxUnpooling:setParams()
+   self.indices = self.pooling.indices
+   self.oheight = self.pooling.iheight
+   self.owidth = self.pooling.iwidth
+end
+
+-- Forward pass: scatter input values to the recorded argmax positions,
+-- producing a tensor of the pooling module's original input size.
+function SpatialMaxUnpooling:updateOutput(input)
+   self:setParams()
+   input.THNN.SpatialMaxUnpooling_updateOutput(
+   input:cdata(),
+   self.output:cdata(),
+   self.indices:cdata(),
+   self.owidth, self.oheight
+   )
+   return self.output
+end
+
+-- Backward pass: gather gradOutput from the recorded argmax positions back
+-- into the (smaller) input shape.
+function SpatialMaxUnpooling:updateGradInput(input, gradOutput)
+   self:setParams()
+   input.THNN.SpatialMaxUnpooling_updateGradInput(
+   input:cdata(),
+   gradOutput:cdata(),
+   self.gradInput:cdata(),
+   self.indices:cdata(),
+   self.owidth, self.oheight
+   )
+   return self.gradInput
+end
+
+-- Deprecated alias kept for old callers; forwards to clearState().
+function SpatialMaxUnpooling:empty()
+   self:clearState()
+end
+
+-- Pretty-printer: names the pooling module this unpooler is tied to.
+function SpatialMaxUnpooling:__tostring__()
+   return 'nn.SpatialMaxUnpooling associated to '..tostring(self.pooling)
+end
diff --git a/SpatialReflectionPadding.lua b/SpatialReflectionPadding.lua
new file mode 100644
index 0000000..9ce4612
--- /dev/null
+++ b/SpatialReflectionPadding.lua
@@ -0,0 +1,51 @@
+local SpatialReflectionPadding, parent =
+   torch.class('nn.SpatialReflectionPadding', 'nn.Module')
+
+-- Pads the last two dimensions by reflecting the input at its borders.
+-- With a single argument the same pad is applied on all four sides.
+function SpatialReflectionPadding:__init(pad_l, pad_r, pad_t, pad_b)
+   parent.__init(self)
+   self.pad_l = pad_l
+   self.pad_r = pad_r or self.pad_l
+   self.pad_t = pad_t or self.pad_l
+   self.pad_b = pad_b or self.pad_l
+end
+
+-- Forward pass: delegates to THNN; only 3D (CHW) and 4D (NCHW) inputs are
+-- accepted.
+function SpatialReflectionPadding:updateOutput(input)
+   if input:dim() == 3 or input:dim() == 4 then
+      input.THNN.SpatialReflectionPadding_updateOutput(
+         input:cdata(), self.output:cdata(),
+         self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+   else
+      error('input must be 3 or 4-dimensional')
+   end
+   return self.output
+end
+
+-- Backward pass: first verifies gradOutput's spatial size equals input's
+-- plus the padding on each axis, then folds reflected gradients back.
+function SpatialReflectionPadding:updateGradInput(input, gradOutput)
+   if input:dim() == 3 and gradOutput:dim() == 3 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) + self.pad_t + self.pad_b == gradOutput:size(2)
+             and input:size(3) + self.pad_l + self.pad_r == gradOutput:size(3),
+             'input and gradOutput must be compatible in size')
+   elseif input:dim() == 4 and gradOutput:dim() == 4 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) == gradOutput:size(2)
+             and input:size(3) + self.pad_t + self.pad_b == gradOutput:size(3)
+             and input:size(4) + self.pad_l + self.pad_r == gradOutput:size(4),
+             'input and gradOutput must be compatible in size')
+   else
+      error(
+        [[input and gradOutput must be 3 or 4-dimensional
+        and have equal number of dimensions]]
+        )
+   end
+   input.THNN.SpatialReflectionPadding_updateGradInput(
+      input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+      self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+   return self.gradInput
+end
+
+-- Pretty-printer: reports the four per-side padding amounts.
+function SpatialReflectionPadding:__tostring__()
+  return torch.type(self) ..
+      string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+                    self.pad_t, self.pad_b)
+end
diff --git a/SpatialReplicationPadding.lua b/SpatialReplicationPadding.lua
new file mode 100644
index 0000000..429763f
--- /dev/null
+++ b/SpatialReplicationPadding.lua
@@ -0,0 +1,51 @@
+local SpatialReplicationPadding, parent =
+   torch.class('nn.SpatialReplicationPadding', 'nn.Module')
+
+-- Pads the last two dimensions by replicating the border values.
+-- With a single argument the same pad is applied on all four sides.
+function SpatialReplicationPadding:__init(pad_l, pad_r, pad_t, pad_b)
+   parent.__init(self)
+   self.pad_l = pad_l
+   self.pad_r = pad_r or self.pad_l
+   self.pad_t = pad_t or self.pad_l
+   self.pad_b = pad_b or self.pad_l
+end
+
+-- Forward pass: delegates to THNN; only 3D (CHW) and 4D (NCHW) inputs are
+-- accepted.
+function SpatialReplicationPadding:updateOutput(input)
+   if input:dim() == 3 or input:dim() == 4 then
+      input.THNN.SpatialReplicationPadding_updateOutput(
+         input:cdata(), self.output:cdata(),
+         self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+   else
+      error('input must be 3 or 4-dimensional')
+   end
+   return self.output
+end
+
+-- Backward pass: first verifies gradOutput's spatial size equals input's
+-- plus the padding on each axis, then accumulates replicated gradients back.
+function SpatialReplicationPadding:updateGradInput(input, gradOutput)
+   if input:dim() == 3 and gradOutput:dim() == 3 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) + self.pad_t + self.pad_b == gradOutput:size(2)
+             and input:size(3) + self.pad_l + self.pad_r == gradOutput:size(3),
+             'input and gradOutput must be compatible in size')
+   elseif input:dim() == 4 and gradOutput:dim() == 4 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) == gradOutput:size(2)
+             and input:size(3) + self.pad_t + self.pad_b == gradOutput:size(3)
+             and input:size(4) + self.pad_l + self.pad_r == gradOutput:size(4),
+             'input and gradOutput must be compatible in size')
+   else
+      error(
+         [[input and gradOutput must be 3 or 4-dimensional
+         and have equal number of dimensions]]
+         )
+   end
+   input.THNN.SpatialReplicationPadding_updateGradInput(
+      input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+      self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+   return self.gradInput
+end
+
+-- Pretty-printer: reports the four per-side padding amounts.
+function SpatialReplicationPadding:__tostring__()
+   return torch.type(self) ..
+   string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+   self.pad_t, self.pad_b)
+end
diff --git a/SpatialSoftMax.lua b/SpatialSoftMax.lua
new file mode 100644
index 0000000..56f0b40
--- /dev/null
+++ b/SpatialSoftMax.lua
@@ -0,0 +1,19 @@
+local SpatialSoftMax, _ = torch.class('nn.SpatialSoftMax', 'nn.Module')
+
+-- Forward pass: reuses the SoftMax THNN kernels, which normalize over the
+-- feature dimension for spatial inputs as well.
+function SpatialSoftMax:updateOutput(input)
+   input.THNN.SoftMax_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+-- Backward pass: the SoftMax gradient needs the forward output, so
+-- self.output from the last forward call is passed to the kernel.
+function SpatialSoftMax:updateGradInput(input, gradOutput)
+   input.THNN.SoftMax_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/SpatialSubSampling.lua b/SpatialSubSampling.lua
new file mode 100644
index 0000000..2aab799
--- /dev/null
+++ b/SpatialSubSampling.lua
@@ -0,0 +1,79 @@
+local SpatialSubSampling, parent = torch.class('nn.SpatialSubSampling', 'nn.Module')
+
+-- Learned sub-sampling: sums each kW x kH window, then applies one scalar
+-- weight and one bias per input plane (weight/bias are 1D of size
+-- nInputPlane). kW,kH: window size; dW,dH: stride (default 1).
+function SpatialSubSampling:__init(nInputPlane, kW, kH, dW, dH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.nInputPlane = nInputPlane
+   self.kW = kW
+   self.kH = kH
+   self.dW = dW
+   self.dH = dH
+
+   self.weight = torch.Tensor(nInputPlane)
+   self.bias = torch.Tensor(nInputPlane)
+   self.gradWeight = torch.Tensor(nInputPlane)
+   self.gradBias = torch.Tensor(nInputPlane)
+   
+   self:reset()
+end
+
+-- Reinitialize weight and bias uniformly in [-stdv, stdv]. An explicit stdv
+-- is scaled by sqrt(3); the default is 1/sqrt(kW*kH). The nn.oldSeed branch
+-- reproduces the legacy per-element RNG consumption order.
+function SpatialSubSampling:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW*self.kH)
+   end
+   if nn.oldSeed then
+      self.weight:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+      self.bias:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end) 
+   else
+      self.weight:uniform(-stdv, stdv)
+      self.bias:uniform(-stdv, stdv)
+   end
+end
+
+-- Forward pass: thin wrapper around the THNN sub-sampling kernel.
+function SpatialSubSampling:updateOutput(input)
+   input.THNN.SpatialSubSampling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH
+   )
+   return self.output
+end
+
+-- Backward pass w.r.t. the input; skipped entirely when gradInput has been
+-- disabled (set to nil/false) on this module.
+function SpatialSubSampling:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      input.THNN.SpatialSubSampling_updateGradInput(
+         input:cdata(),
+         gradOutput:cdata(),
+         self.gradInput:cdata(),
+         self.weight:cdata(),
+         self.kW, self.kH,
+         self.dW, self.dH
+      )
+      return self.gradInput
+   end
+end
+
+-- Accumulate gradWeight/gradBias, scaled by `scale` (default 1).
+function SpatialSubSampling:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   input.THNN.SpatialSubSampling_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      self.gradBias:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      scale
+   )
+end
diff --git a/SpatialSubtractiveNormalization.lua b/SpatialSubtractiveNormalization.lua
new file mode 100644
index 0000000..e2da2c6
--- /dev/null
+++ b/SpatialSubtractiveNormalization.lua
@@ -0,0 +1,115 @@
+local SpatialSubtractiveNormalization, parent = torch.class('nn.SpatialSubtractiveNormalization','nn.Module')
+
+-- Subtracts from every input location a weighted local mean computed with
+-- `kernel` (1D kernels are applied separably). The kernel must have odd
+-- dimensions so the mean is centered. Defaults: 1 plane, flat 9x9 kernel.
+function SpatialSubtractiveNormalization:__init(nInputPlane, kernel)
+   parent.__init(self)
+
+   -- get args
+   self.nInputPlane = nInputPlane or 1
+   self.kernel = kernel or torch.Tensor(9,9):fill(1)
+   local kdim = self.kernel:nDimension()
+
+   -- check args
+   if kdim ~= 2 and kdim ~= 1 then
+      error('<SpatialSubtractiveNormalization> averaging kernel must be 2D or 1D')
+   end
+   if (self.kernel:size(1) % 2) == 0 or (kdim == 2 and (self.kernel:size(2) % 2) == 0) then
+      error('<SpatialSubtractiveNormalization> averaging kernel must have ODD dimensions')
+   end
+
+   -- normalize kernel
+   self.kernel:div(self.kernel:sum() * self.nInputPlane)
+
+   -- padding values
+   local padH = math.floor(self.kernel:size(1)/2)
+   local padW = padH
+   if kdim == 2 then
+      padW = math.floor(self.kernel:size(2)/2)
+   end
+
+   -- create convolutional mean extractor
+   self.meanestimator = nn.Sequential()
+   self.meanestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+   if kdim == 2 then
+      self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+   else
+      -- 1D kernel: separable pass, horizontal then vertical
+      self.meanestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+      self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+   end
+   self.meanestimator:add(nn.Replicate(self.nInputPlane,1,3))
+
+   -- set kernel and bias
+   if kdim == 2 then
+      for i = 1,self.nInputPlane do 
+         self.meanestimator.modules[2].weight[1][i] = self.kernel
+      end
+      self.meanestimator.modules[2].bias:zero()
+   else
+      for i = 1,self.nInputPlane do 
+         self.meanestimator.modules[2].weight[i]:copy(self.kernel)
+         self.meanestimator.modules[3].weight[1][i]:copy(self.kernel)
+      end
+      self.meanestimator.modules[2].bias:zero()
+      self.meanestimator.modules[3].bias:zero()
+   end
+
+   -- other operation
+   self.subtractor = nn.CSubTable()
+   self.divider = nn.CDivTable()
+
+   -- coefficient array, to adjust side effects
+   self.coef = torch.Tensor(1,1,1)
+end
+
+-- Forward pass: output = input - localmean/coef, where coef (the mean
+-- estimator's response to an all-ones input) corrects the attenuation the
+-- zero padding causes near image borders. coef is recomputed only when the
+-- input's spatial size changes.
+function SpatialSubtractiveNormalization:updateOutput(input)   
+   -- compute side coefficients
+   local dim = input:dim()
+   if input:dim()+1 ~= self.coef:dim() or (input:size(dim) ~= self.coef:size(dim)) or (input:size(dim-1) ~= self.coef:size(dim-1)) then
+      self.ones = self.ones or input.new()
+      self._coef = self._coef or self.coef.new()
+      if dim == 4 then
+         -- batch mode
+         self.ones:resizeAs(input[1]):fill(1)
+         local coef = self.meanestimator:updateOutput(self.ones)
+         self._coef:resizeAs(coef):copy(coef) -- make contiguous for view
+         local size = coef:size():totable()
+         table.insert(size,1,input:size(1))
+         self.coef = self._coef:view(1,table.unpack(self._coef:size():totable())):expand(table.unpack(size))
+      else
+         self.ones:resizeAs(input):fill(1)
+         local coef = self.meanestimator:updateOutput(self.ones)
+         self._coef:resizeAs(coef):copy(coef) -- copy meanestimator.output as it will be used below
+         self.coef = self._coef
+      end
+      
+   end
+
+   -- compute mean
+   self.localsums = self.meanestimator:updateOutput(input)
+   self.adjustedsums = self.divider:updateOutput{self.localsums, self.coef}
+   self.output = self.subtractor:updateOutput{input, self.adjustedsums}
+
+   -- done
+   return self.output
+end
+
+-- Backward pass: accumulates the gradient flowing through the subtractor's
+-- direct branch and through the mean-estimator branch (via the divider).
+function SpatialSubtractiveNormalization:updateGradInput(input, gradOutput)
+   -- resize grad
+   self.gradInput:resizeAs(input):zero()
+
+   -- backprop through all modules
+   local gradsub = self.subtractor:updateGradInput({input, self.adjustedsums}, gradOutput)
+   local graddiv = self.divider:updateGradInput({self.localsums, self.coef}, gradsub[2])
+   -- the original ran meanestimator:updateGradInput twice and bound the
+   -- first result's size() to an unused local; a single call suffices
+   self.gradInput:add(self.meanestimator:updateGradInput(input, graddiv[1]))
+   self.gradInput:add(gradsub[1])
+
+   -- done
+   return self.gradInput
+end
+
+-- Release scratch buffers and the submodule state before the parent clears
+-- the remaining fields.
+function SpatialSubtractiveNormalization:clearState()
+   if self.ones then self.ones:set() end
+   if self._coef then self._coef:set() end
+   self.meanestimator:clearState()
+   return parent.clearState(self)
+end
diff --git a/SpatialUpSamplingNearest.lua b/SpatialUpSamplingNearest.lua
new file mode 100644
index 0000000..c3b2330
--- /dev/null
+++ b/SpatialUpSamplingNearest.lua
@@ -0,0 +1,67 @@
+local SpatialUpSamplingNearest, parent = torch.class('nn.SpatialUpSamplingNearest', 'nn.Module')
+
+--[[
+Applies a 2D up-sampling over an input image composed of several input planes.
+
+The upsampling is done using the simple nearest neighbor technique.
+
+The Y and X dimensions are assumed to be the last 2 tensor dimensions.  For
+instance, if the tensor is 4D, then dim 3 is the y dimension and dim 4 is the x.
+
+owidth  = width*scale_factor
+oheight  = height*scale_factor
+--]]
+
+-- scale must be an integer >= 1 (1 is a pass-through copy).
+function SpatialUpSamplingNearest:__init(scale)
+   parent.__init(self)
+
+   self.scale_factor = scale
+   if self.scale_factor < 1 then
+      -- message fixed: the guard rejects values below 1, not values <= 1
+      error('scale_factor must be greater than or equal to 1')
+   end
+   if math.floor(self.scale_factor) ~= self.scale_factor then
+     error('scale_factor must be integer')
+   end
+   self.inputSize = torch.LongStorage(4)
+   self.outputSize = torch.LongStorage(4)
+   self.usage = nil
+end
+
+-- Forward pass: records input/output sizes, resizes self.output to the
+-- scaled spatial dimensions (last two dims * scale_factor), then dispatches
+-- to the THNN kernel. Accepts 3D (CHW) or 4D (NCHW) input only.
+function SpatialUpSamplingNearest:updateOutput(input)
+   if input:dim() ~= 4 and input:dim() ~= 3 then
+     error('SpatialUpSamplingNearest only support 3D or 4D tensors')
+   end
+   -- Copy the input size
+   local xdim = input:dim()
+   local ydim = input:dim() - 1
+   for i = 1, input:dim() do
+     self.inputSize[i] = input:size(i)
+     self.outputSize[i] = input:size(i)
+   end
+   self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
+   self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
+   -- Resize the output if needed
+   if input:dim() == 3 then
+     self.output:resize(self.outputSize[1], self.outputSize[2],
+       self.outputSize[3])
+   else
+     self.output:resize(self.outputSize)
+   end
+   input.THNN.SpatialUpSamplingNearest_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.scale_factor
+   )
+   return self.output
+end
+
+-- Backward pass: each input location accumulates the gradients of the
+-- scale_factor^2 output locations it produced (handled by the THNN kernel).
+function SpatialUpSamplingNearest:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input)
+   input.THNN.SpatialUpSamplingNearest_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.scale_factor
+   )
+   return self.gradInput
+end
diff --git a/SpatialZeroPadding.lua b/SpatialZeroPadding.lua
new file mode 100644
index 0000000..f199258
--- /dev/null
+++ b/SpatialZeroPadding.lua
@@ -0,0 +1,104 @@
+local SpatialZeroPadding, parent = torch.class('nn.SpatialZeroPadding', 'nn.Module')
+
+-- Pads (positive pad values) or crops (negative pad values) the spatial
+-- borders of a 3D (c,h,w) or 4D (b,c,h,w) tensor with zeros.
+-- pad_l: left padding; pad_r, pad_t, pad_b default to pad_l when omitted.
+function SpatialZeroPadding:__init(pad_l, pad_r, pad_t, pad_b)
+   parent.__init(self)
+   self.pad_l = pad_l
+   self.pad_r = pad_r or self.pad_l
+   self.pad_t = pad_t or self.pad_l
+   self.pad_b = pad_b or self.pad_l
+end
+
+-- Forward: allocate a zeroed output of the padded size, then copy the
+-- (possibly cropped) input into the interior region. Negative pads crop
+-- the input; positive pads shrink the destination window inside output.
+function SpatialZeroPadding:updateOutput(input)
+   if input:dim() == 3 then
+      -- sizes
+      local h = input:size(2) + self.pad_t + self.pad_b
+      local w = input:size(3) + self.pad_l + self.pad_r
+      if w < 1 or h < 1 then error('input is too small') end
+      self.output:resize(input:size(1), h, w)
+      self.output:zero()
+      -- crop input if necessary (negative padding removes border rows/cols)
+      local c_input = input
+      if self.pad_t < 0 then c_input = c_input:narrow(2, 1 - self.pad_t, c_input:size(2) + self.pad_t) end
+      if self.pad_b < 0 then c_input = c_input:narrow(2, 1, c_input:size(2) + self.pad_b) end
+      if self.pad_l < 0 then c_input = c_input:narrow(3, 1 - self.pad_l, c_input:size(3) + self.pad_l) end
+      if self.pad_r < 0 then c_input = c_input:narrow(3, 1, c_input:size(3) + self.pad_r) end
+      -- crop output if necessary (positive padding offsets the copy window)
+      local c_output = self.output
+      if self.pad_t > 0 then c_output = c_output:narrow(2, 1 + self.pad_t, c_output:size(2) - self.pad_t) end
+      if self.pad_b > 0 then c_output = c_output:narrow(2, 1, c_output:size(2) - self.pad_b) end
+      if self.pad_l > 0 then c_output = c_output:narrow(3, 1 + self.pad_l, c_output:size(3) - self.pad_l) end
+      if self.pad_r > 0 then c_output = c_output:narrow(3, 1, c_output:size(3) - self.pad_r) end
+      -- copy input to output
+      c_output:copy(c_input)
+   elseif input:dim() == 4 then
+      -- sizes (batch mode: dims shift by one)
+      local h = input:size(3) + self.pad_t + self.pad_b
+      local w = input:size(4) + self.pad_l + self.pad_r
+      if w < 1 or h < 1 then error('input is too small') end
+      self.output:resize(input:size(1), input:size(2), h, w)
+      self.output:zero()
+      -- crop input if necessary
+      local c_input = input
+      if self.pad_t < 0 then c_input = c_input:narrow(3, 1 - self.pad_t, c_input:size(3) + self.pad_t) end
+      if self.pad_b < 0 then c_input = c_input:narrow(3, 1, c_input:size(3) + self.pad_b) end
+      if self.pad_l < 0 then c_input = c_input:narrow(4, 1 - self.pad_l, c_input:size(4) + self.pad_l) end
+      if self.pad_r < 0 then c_input = c_input:narrow(4, 1, c_input:size(4) + self.pad_r) end
+      -- crop output if necessary
+      local c_output = self.output
+      if self.pad_t > 0 then c_output = c_output:narrow(3, 1 + self.pad_t, c_output:size(3) - self.pad_t) end
+      if self.pad_b > 0 then c_output = c_output:narrow(3, 1, c_output:size(3) - self.pad_b) end
+      if self.pad_l > 0 then c_output = c_output:narrow(4, 1 + self.pad_l, c_output:size(4) - self.pad_l) end
+      if self.pad_r > 0 then c_output = c_output:narrow(4, 1, c_output:size(4) - self.pad_r) end
+      -- copy input to output
+      c_output:copy(c_input)
+   else
+      error('input must be 3 or 4-dimensional')
+   end
+   return self.output
+end
+
+-- Backward: mirror of updateOutput — gradients for padded (zero) regions
+-- are dropped; gradients for the interior are copied back into gradInput.
+function SpatialZeroPadding:updateGradInput(input, gradOutput)
+   if input:dim() == 3 then
+      self.gradInput:resizeAs(input):zero()
+      -- crop gradInput if necessary
+      local cg_input = self.gradInput
+      if self.pad_t < 0 then cg_input = cg_input:narrow(2, 1 - self.pad_t, cg_input:size(2) + self.pad_t) end
+      if self.pad_b < 0 then cg_input = cg_input:narrow(2, 1, cg_input:size(2) + self.pad_b) end
+      if self.pad_l < 0 then cg_input = cg_input:narrow(3, 1 - self.pad_l, cg_input:size(3) + self.pad_l) end
+      if self.pad_r < 0 then cg_input = cg_input:narrow(3, 1, cg_input:size(3) + self.pad_r) end
+      -- crop gradOutput if necessary
+      local cg_output = gradOutput
+      if self.pad_t > 0 then cg_output = cg_output:narrow(2, 1 + self.pad_t, cg_output:size(2) - self.pad_t) end
+      if self.pad_b > 0 then cg_output = cg_output:narrow(2, 1, cg_output:size(2) - self.pad_b) end
+      if self.pad_l > 0 then cg_output = cg_output:narrow(3, 1 + self.pad_l, cg_output:size(3) - self.pad_l) end
+      if self.pad_r > 0 then cg_output = cg_output:narrow(3, 1, cg_output:size(3) - self.pad_r) end
+      -- copy gradOutput to gradInput
+      cg_input:copy(cg_output)
+   elseif input:dim() == 4 then
+      self.gradInput:resizeAs(input):zero()
+      -- crop gradInput if necessary
+      local cg_input = self.gradInput
+      if self.pad_t < 0 then cg_input = cg_input:narrow(3, 1 - self.pad_t, cg_input:size(3) + self.pad_t) end
+      if self.pad_b < 0 then cg_input = cg_input:narrow(3, 1, cg_input:size(3) + self.pad_b) end
+      if self.pad_l < 0 then cg_input = cg_input:narrow(4, 1 - self.pad_l, cg_input:size(4) + self.pad_l) end
+      if self.pad_r < 0 then cg_input = cg_input:narrow(4, 1, cg_input:size(4) + self.pad_r) end
+      -- crop gradOutput if necessary
+      local cg_output = gradOutput
+      if self.pad_t > 0 then cg_output = cg_output:narrow(3, 1 + self.pad_t, cg_output:size(3) - self.pad_t) end
+      if self.pad_b > 0 then cg_output = cg_output:narrow(3, 1, cg_output:size(3) - self.pad_b) end
+      if self.pad_l > 0 then cg_output = cg_output:narrow(4, 1 + self.pad_l, cg_output:size(4) - self.pad_l) end
+      if self.pad_r > 0 then cg_output = cg_output:narrow(4, 1, cg_output:size(4) - self.pad_r) end
+      -- copy gradOutput to gradInput
+      cg_input:copy(cg_output)
+   else
+      error('input must be 3 or 4-dimensional')
+   end
+   return self.gradInput
+end
+
+
+-- Human-readable description including all four padding amounts.
+function SpatialZeroPadding:__tostring__()
+   return torch.type(self) ..
+   string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+   self.pad_t, self.pad_b)
+end
diff --git a/SplitTable.lua b/SplitTable.lua
new file mode 100644
index 0000000..43b66a4
--- /dev/null
+++ b/SplitTable.lua
@@ -0,0 +1,43 @@
+local SplitTable, parent = torch.class('nn.SplitTable', 'nn.Module')
+
+-- Splits a tensor along `dimension` into a table of slices.
+-- dimension: axis to split (may be negative, counted from the end);
+-- nInputDims: expected dim count of a single (non-batch) sample, used to
+-- auto-shift the split axis when a batch dimension is present.
+function SplitTable:__init(dimension, nInputDims)
+   parent.__init(self)
+   self.dimension = dimension
+   self.nInputDims = nInputDims
+end
+
+-- Resolve self.dimension to a positive axis index for this input,
+-- shifting by one when the input carries an extra batch dimension.
+function SplitTable:_getPositiveDimension(input)
+   local dimension = self.dimension
+   if dimension < 0 then
+      dimension = input:dim() + dimension + 1
+   elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+      dimension = dimension + 1
+   end
+   return dimension
+end
+
+-- Forward: output is a Lua table of views (select) into the input; no
+-- data is copied.
+function SplitTable:updateOutput(input)
+   local dimension = self:_getPositiveDimension(input)
+   local slices = input:size(dimension)
+
+   local currentOutput= {}
+   for i=1,slices do
+      currentOutput[#currentOutput+1] = input:select(dimension,i)
+   end
+   self.output = currentOutput
+   return self.output
+end 
+
+-- Backward: copy each per-slice gradient from the gradOutput table back
+-- into the corresponding slice of gradInput.
+function SplitTable:updateGradInput(input, gradOutput)
+   local dimension = self:_getPositiveDimension(input)
+   local slices = input:size(dimension)
+   if self.gradInput then
+      self.gradInput:resizeAs(input)
+
+      for i=1,slices do 
+         local currentGradInput = gradOutput[i];        
+         self.gradInput:select(dimension,i):copy(currentGradInput)
+      end
+   end
+   return self.gradInput
+end
diff --git a/Sqrt.lua b/Sqrt.lua
new file mode 100644
index 0000000..df354a1
--- /dev/null
+++ b/Sqrt.lua
@@ -0,0 +1,26 @@
+local Sqrt, parent = torch.class('nn.Sqrt','nn.Module')
+
+-- Element-wise square root. b: optional epsilon forwarded to the C
+-- kernel (defaults to 0).
+function Sqrt:__init(b)
+   parent.__init(self)
+   self.eps = b or 0
+end
+
+-- Forward: output = sqrt(input) via the THNN C kernel.
+function Sqrt:updateOutput(input)
+   -- backward-compat: instances serialized before `eps` existed
+   self.eps = self.eps or 0
+   input.THNN.Sqrt_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.eps
+   )
+   return self.output
+end
+
+-- Backward: the kernel reuses the cached forward output (d/dx sqrt(x) =
+-- 1 / (2*sqrt(x))), so updateOutput must have run first.
+function Sqrt:updateGradInput(input, gradOutput)
+   input.THNN.Sqrt_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/Square.lua b/Square.lua
new file mode 100644
index 0000000..a6292af
--- /dev/null
+++ b/Square.lua
@@ -0,0 +1,22 @@
+local Square, parent = torch.class('nn.Square', 'nn.Module')
+
+-- Element-wise square. `args` is accepted for backward compatibility but
+-- unused.
+function Square:__init(args)
+   parent.__init(self)
+end
+
+-- Forward: output = input^2 via the THNN C kernel.
+function Square:updateOutput(input)
+   input.THNN.Square_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+-- Backward: gradInput computed by the C kernel from input and gradOutput.
+function Square:updateGradInput(input, gradOutput)
+   input.THNN.Square_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata()
+   )
+   return self.gradInput
+end
diff --git a/Squeeze.lua b/Squeeze.lua
new file mode 100644
index 0000000..7d204a1
--- /dev/null
+++ b/Squeeze.lua
@@ -0,0 +1,40 @@
+local Squeeze, parent = torch.class('nn.Squeeze', 'nn.Module')
+
+-- Removes singleton dimensions from the input.
+-- dim: optional specific dimension to squeeze; numInputDims: dim count of
+-- a single sample, used to offset `dim` in batch mode.
+function Squeeze:__init(dim, numInputDims)
+    parent.__init(self)
+    self.dim = dim
+    self:setNumInputDims(numInputDims)
+end
+
+-- Declare how many dims one (non-batch) sample has; returns self for
+-- chaining.
+function Squeeze:setNumInputDims(numInputDims)
+   self.numInputDims = numInputDims
+   return self
+end
+
+-- Forward: squeeze either the given dim or all singleton dims. In batch
+-- mode with batch size 1 and no explicit dim, the squeezed-away batch
+-- dimension is re-added so the output keeps its leading batch axis.
+function Squeeze:updateOutput(input)
+    assert(input and torch.isTensor(input), 'Squeeze only works on tensors')
+    local dim    = self.dim
+    local addone = false
+    if self.numInputDims and input:dim()==(self.numInputDims+1) then
+        if dim then
+            dim = dim + 1
+        elseif input:size(1) == 1 then
+            addone = true -- in case of minibatch of size 1.
+        end
+    end
+    self.output:set(dim and input:squeeze(dim) or input:squeeze())
+    if addone then
+        -- note: totable{} passes a spurious table argument; it behaves
+        -- like totable() here, but totable() would be the conventional call
+        local s = self.output:size():totable{}
+        table.insert(s, 1, 1)
+        self.output:set(self.output:view(torch.LongStorage(s)))
+    end
+    return self.output
+end
+
+-- Backward: squeezing only drops size-1 axes, so the gradient is just
+-- gradOutput reshaped back to the input's size (element count preserved).
+function Squeeze:updateGradInput(input, gradOutput)
+    assert(input and torch.isTensor(input), 'Squeeze only works on tensors')
+    assert(gradOutput and torch.isTensor(gradOutput), 'Squeeze only works on tensors')
+    assert(input:nElement() == gradOutput:nElement())
+    self.gradInput:set(gradOutput:view(input:size()))
+    return self.gradInput
+end
diff --git a/StochasticGradient.lua b/StochasticGradient.lua
new file mode 100644
index 0000000..a060371
--- /dev/null
+++ b/StochasticGradient.lua
@@ -0,0 +1,62 @@
+local StochasticGradient = torch.class('nn.StochasticGradient')
+
+-- Simple SGD trainer: repeatedly forwards examples through `module`,
+-- evaluates `criterion`, and applies per-example parameter updates.
+function StochasticGradient:__init(module, criterion)
+   self.learningRate = 0.01
+   self.learningRateDecay = 0
+   self.maxIteration = 25        -- <= 0 means train forever
+   self.shuffleIndices = true    -- visit examples in random order
+   self.module = module
+   self.criterion = criterion
+   self.verbose = true
+end
+
+-- Train on `dataset`, which must provide dataset:size() and indexable
+-- examples of the form {input, target}. Optional hooks:
+--   self.hookExample(self, example)           -- after each example
+--   self.hookIteration(self, iter, avgError)  -- after each full pass
+function StochasticGradient:train(dataset)
+   local iteration = 1
+   local currentLearningRate = self.learningRate
+   local module = self.module
+   local criterion = self.criterion
+
+   -- random visiting order; replaced by the identity when shuffling is off
+   local shuffledIndices = torch.randperm(dataset:size(), 'torch.LongTensor')
+   if not self.shuffleIndices then
+      for t = 1,dataset:size() do
+         shuffledIndices[t] = t
+      end
+   end
+
+   print("# StochasticGradient: training")
+
+   while true do
+      local currentError = 0
+      for t = 1,dataset:size() do
+         local example = dataset[shuffledIndices[t]]
+         local input = example[1]
+         local target = example[2]
+
+         currentError = currentError + criterion:forward(module:forward(input), target)
+
+         -- backprop and immediately apply the update (no separate
+         -- gradient accumulation step)
+         module:updateGradInput(input, criterion:updateGradInput(module.output, target))
+         module:accUpdateGradParameters(input, criterion.gradInput, currentLearningRate)
+
+         if self.hookExample then
+            self.hookExample(self, example)
+         end
+      end
+
+      -- average loss over this pass
+      currentError = currentError / dataset:size()
+
+      if self.hookIteration then
+         self.hookIteration(self, iteration, currentError)
+      end
+
+      if self.verbose then
+         print("# current error = " .. currentError)
+      end
+      iteration = iteration + 1
+      -- decay the learning rate once per full pass over the dataset
+      currentLearningRate = self.learningRate/(1+iteration*self.learningRateDecay)
+      if self.maxIteration > 0 and iteration > self.maxIteration then
+         print("# StochasticGradient: you have reached the maximum number of iterations")
+         print("# training error = " .. currentError)
+         break
+      end
+   end
+end
diff --git a/Sum.lua b/Sum.lua
new file mode 100644
index 0000000..5d61c28
--- /dev/null
+++ b/Sum.lua
@@ -0,0 +1,61 @@
+local Sum, parent = torch.class('nn.Sum', 'nn.Module')
+
+-- Sums the input over one dimension (optionally averaging).
+-- dimension: axis to reduce (default 1, may be negative);
+-- nInputDims: dim count of one sample, for batch-mode axis shifting;
+-- sizeAverage: divide by the size of the reduced dimension.
+function Sum:__init(dimension, nInputDims, sizeAverage)
+   parent.__init(self)
+   self.dimension   = dimension or 1
+   -- do not assign default value to nInputDims or it will break backward compatibility
+   self.nInputDims  = nInputDims
+   self.sizeAverage = sizeAverage or false
+end
+
+-- Resolve self.dimension to a positive axis for this input, shifting by
+-- one when an extra batch dimension is present.
+function Sum:_getPositiveDimension(input)
+    local dimension = self.dimension
+    if dimension < 0 then
+        dimension = input:dim() + dimension + 1
+    elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+        dimension = dimension + 1
+    end
+    assert(input:dim() >= dimension, "dimension exceeds input dimensions")
+    return dimension
+end
+
+-- Forward: reduce, optionally average, then drop the reduced singleton
+-- dimension (unless the result is already 1-D).
+function Sum:updateOutput(input)
+    local dimension = self:_getPositiveDimension(input)
+    -- deserialized/legacy instances may hold a number here; replace with
+    -- a tensor of the input's type
+    if type(self.output) == 'number' then
+        self.output = input.new()
+    end
+    self.output:sum(input, dimension)
+    if self.sizeAverage then
+        self.output:div(input:size(dimension))
+    end
+    if self.output:nDimension() > 1 then
+        self.output:set(self.output:select(dimension, 1))
+    end
+    return self.output
+end
+
+-- Backward: broadcast gradOutput across the reduced dimension.
+function Sum:updateGradInput(input, gradOutput)
+    local dimension = self:_getPositiveDimension(input)
+    -- zero-strides dont work with MKL/BLAS, so
+    -- dont set self.gradInput to zero-stride tensor.
+    -- Instead, do a deepcopy
+    local size      = input:size()
+    size[dimension] = 1
+    -- view() below requires contiguous storage
+    if not gradOutput:isContiguous() then
+        self._gradOutput = self._gradOutput or gradOutput.new()
+        self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+        gradOutput = self._gradOutput
+    end
+    gradOutput      = gradOutput:view(size)
+    self.gradInput:resizeAs(input)
+    self.gradInput:copy(gradOutput:expandAs(input))
+    if self.sizeAverage then
+        self.gradInput:div(input:size(dimension))
+    end
+    return self.gradInput
+end
+
+-- Release the internal contiguous-copy buffer.
+function Sum:clearState()
+    nn.utils.clear(self, '_gradOutput')
+    return parent.clearState(self)
+end
diff --git a/THNN.lua b/THNN.lua
new file mode 100644
index 0000000..e18dbaa
--- /dev/null
+++ b/THNN.lua
@@ -0,0 +1,139 @@
+local ffi = require 'ffi'
+
+-- FFI bridge between Lua and the compiled THNN C library: parses the
+-- THNN header, declares the per-type (Float/Double) functions via
+-- ffi.cdef, and binds them into per-tensor-type kernel tables.
+local THNN = {}
+
+local generic_THNN_h = require 'nn.THNN_h'
+-- strip all lines starting with #
+-- to remove preprocessor directives originally present
+-- in THNN.h
+generic_THNN_h = generic_THNN_h:gsub("\n#[^\n]*", "")
+generic_THNN_h = generic_THNN_h:gsub("^#[^\n]*\n", "")
+
+-- THGenerator struct declaration copied from torch7/lib/TH/THRandom.h
+local base_declarations = [[
+typedef void THNNState;
+
+typedef struct {
+  unsigned long the_initial_seed;
+  int left;
+  int seeded;
+  unsigned long next;
+  unsigned long state[624]; /* the array for the state vector 624 = _MERSENNE_STATE_N  */
+  double normal_x;
+  double normal_y;
+  double normal_rho;
+  int normal_is_valid;
+} THGenerator;
+]]
+
+-- polyfill for LUA 5.1 (package.searchpath was added in Lua 5.2)
+if not package.searchpath then
+   local sep = package.config:sub(1,1)
+   function package.searchpath(mod, path)
+      mod = mod:gsub('%.', sep)
+      for m in path:gmatch('[^;]+') do
+         local nm = m:gsub('?', mod)
+         local f = io.open(nm, 'r')
+         if f then
+            f:close()
+            return nm
+         end
+     end
+   end
+end
+
+-- load libTHNN
+THNN.C = ffi.load(package.searchpath('libTHNN', package.cpath))
+
+ffi.cdef(base_declarations)
+
+-- expand macros, allow to use original lines from lib/THNN/generic/THNN.h
+local preprocessed = string.gsub(generic_THNN_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
+
+-- textual substitutions that instantiate the generic header once per
+-- concrete scalar type
+local replacements =
+{
+   {
+      ['TYPE'] = 'Double',
+      ['real'] = 'double',
+      ['THTensor'] = 'THDoubleTensor',
+      ['THIndexTensor'] = 'THLongTensor',
+      ['THIntegerTensor'] = 'THIntTensor',
+      ['THIndex_t'] = 'long',
+      ['THInteger_t'] = 'int'
+   },
+   {
+      ['TYPE'] = 'Float',
+      ['real'] = 'float',
+      ['THTensor'] = 'THFloatTensor',
+      ['THIndexTensor'] = 'THLongTensor',
+      ['THIntegerTensor'] = 'THIntTensor',
+      ['THIndex_t'] = 'long',
+      ['THInteger_t'] = 'int'
+    }
+}
+
+-- declare both instantiations to LuaJIT's FFI
+for i=1,#replacements do
+   local r = replacements[i]
+   local s = preprocessed
+   for k,v in pairs(r) do
+      s = string.gsub(s, k, v)
+   end
+   ffi.cdef(s)
+end
+
+THNN.NULL = ffi.NULL or nil
+
+-- THNN functions take a THNNState* first argument; currently always NULL
+function THNN.getState()
+   return ffi.NULL or nil
+end
+
+-- Return t's cdata when t is present, otherwise the NULL sentinel.
+function THNN.optionalTensor(t)
+   return t and t:cdata() or THNN.NULL
+end
+
+-- Collect the base names of all TH_API functions declared in the header.
+local function extract_function_names(s)
+   local t = {}
+   for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do
+      t[#t+1] = n
+   end
+   return t
+end
+
+-- Build a table mapping base names to closures that call the C function
+-- 'THNN_<type_name><name>' with the state prepended.
+function THNN.bind(lib, base_names, type_name, state_getter)
+   local ftable = {}
+   local prefix = 'THNN_' .. type_name
+   for i,n in ipairs(base_names) do
+      -- use pcall since some libs might not support all functions (e.g. cunn)
+      local ok,v = pcall(function() return lib[prefix .. n] end)
+      if ok then
+         ftable[n] = function(...) v(state_getter(), ...) end   -- implicitely add state
+      else
+         print('not found: ' .. prefix .. n .. v)
+      end
+   end
+   return ftable
+end
+
+-- build function table
+local function_names = extract_function_names(generic_THNN_h)
+
+THNN.kernels = {}
+THNN.kernels['torch.FloatTensor'] = THNN.bind(THNN.C, function_names, 'Float', THNN.getState)
+THNN.kernels['torch.DoubleTensor'] = THNN.bind(THNN.C, function_names, 'Double', THNN.getState)
+
+-- expose kernels as tensor.THNN so modules can write input.THNN.Foo(...)
+torch.getmetatable('torch.FloatTensor').THNN = THNN.kernels['torch.FloatTensor']
+torch.getmetatable('torch.DoubleTensor').THNN = THNN.kernels['torch.DoubleTensor']
+
+-- Dispatch the THNN kernel named `f` for tensor type `type`, forwarding
+-- the remaining arguments to the bound C function.
+function THNN.runKernel(f, type, ...)
+   local ftable = THNN.kernels[type]
+   if not ftable then
+      error('Unsupported tensor type: '..type)
+   end
+   -- use a distinct local: the original code shadowed the parameter `f`
+   -- with `local f = ftable[f]`, so on a lookup miss the error message
+   -- formatted nil and string.format itself raised, hiding the real error
+   local kernel = ftable[f]
+   if not kernel then
+      error(string.format("Function '%s' not found for tensor type '%s'.", f, type))
+   end
+   kernel(...)
+end
+
+return THNN
diff --git a/Tanh.lua b/Tanh.lua
new file mode 100644
index 0000000..fc42cbb
--- /dev/null
+++ b/Tanh.lua
@@ -0,0 +1,19 @@
+local Tanh = torch.class('nn.Tanh', 'nn.Module')
+
+-- Forward: element-wise hyperbolic tangent via the THNN C kernel.
+function Tanh:updateOutput(input)
+   input.THNN.Tanh_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   return self.output
+end
+
+-- Backward: the kernel reuses the cached forward output (d/dx tanh(x) =
+-- 1 - tanh(x)^2), so updateOutput must have run first.
+function Tanh:updateGradInput(input, gradOutput)
+   input.THNN.Tanh_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.output:cdata()
+   )
+   return self.gradInput
+end
diff --git a/TanhShrink.lua b/TanhShrink.lua
new file mode 100644
index 0000000..96df6c5
--- /dev/null
+++ b/TanhShrink.lua
@@ -0,0 +1,20 @@
+local TanhShrink, parent = torch.class('nn.TanhShrink','nn.Module')
+
+-- TanhShrink: output = x - tanh(x), composed from an internal nn.Tanh.
+function TanhShrink:__init()
+   parent.__init(self)
+   self.tanh = nn.Tanh()
+end
+
+-- Forward: output = input - tanh(input).
+function TanhShrink:updateOutput(input)
+   local th = self.tanh:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   self.output:add(-1,th)
+   return self.output
+end
+
+-- Backward: gradInput = gradOutput - d(tanh)/dx, using the inner Tanh's
+-- backward pass.
+function TanhShrink:updateGradInput(input, gradOutput)
+   local dth = self.tanh:updateGradInput(input,gradOutput)
+   self.gradInput:resizeAs(input):copy(gradOutput)
+   self.gradInput:add(-1,dth)
+   return self.gradInput
+end
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
new file mode 100644
index 0000000..cdf0217
--- /dev/null
+++ b/TemporalConvolution.lua
@@ -0,0 +1,71 @@
+local TemporalConvolution, parent = torch.class('nn.TemporalConvolution', 'nn.Module')
+
+-- 1-D convolution over a sequence of frames.
+-- inputFrameSize/outputFrameSize: features per input/output frame;
+-- kW: kernel width in frames; dW: frame stride (default 1).
+function TemporalConvolution:__init(inputFrameSize, outputFrameSize, kW, dW)
+   parent.__init(self)
+
+   dW = dW or 1
+
+   self.inputFrameSize = inputFrameSize
+   self.outputFrameSize = outputFrameSize
+   self.kW = kW
+   self.dW = dW
+
+   -- weight row i holds the flattened kW*inputFrameSize kernel for
+   -- output feature i
+   self.weight = torch.Tensor(outputFrameSize, inputFrameSize*kW)
+   self.bias = torch.Tensor(outputFrameSize)
+   self.gradWeight = torch.Tensor(outputFrameSize, inputFrameSize*kW)
+   self.gradBias = torch.Tensor(outputFrameSize)
+   
+   self:reset()
+end
+
+-- Re-initialize weights uniformly in [-stdv, stdv]; default stdv is
+-- 1/sqrt(fan-in).
+function TemporalConvolution:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW*self.inputFrameSize)
+   end
+   if nn.oldSeed then
+      -- legacy path: per-element apply reproduces pre-refactor RNG streams
+      self.weight:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+      self.bias:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)   
+   else
+      self.weight:uniform(-stdv, stdv)
+      self.bias:uniform(-stdv, stdv)
+   end
+end
+
+-- Forward pass via the THNN C kernel.
+function TemporalConvolution:updateOutput(input)
+    input.THNN.TemporalConvolution_updateOutput(
+	input:cdata(), self.output:cdata(),
+	self.weight:cdata(), self.bias:cdata(),
+	self.kW, self.dW,
+	self.inputFrameSize, self.outputFrameSize
+    )
+   return self.output
+end
+
+-- Backward pass; skipped entirely when gradInput has been set to nil
+-- (a standard nn convention to save work for the first layer).
+function TemporalConvolution:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      input.THNN.TemporalConvolution_updateGradInput(
+	  input:cdata(), gradOutput:cdata(),
+	  self.gradInput:cdata(), self.weight:cdata(),
+	  self.kW, self.dW
+       )
+      return self.gradInput
+   end
+end
+
+-- Accumulate weight/bias gradients, scaled by `scale` (default 1).
+function TemporalConvolution:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   input.THNN.TemporalConvolution_accGradParameters(
+       input:cdata(), gradOutput:cdata(),
+       self.gradWeight:cdata(), self.gradBias:cdata(),
+       self.kW, self.dW, scale
+   )
+end
+
+-- we do not need to accumulate parameters when sharing
+TemporalConvolution.sharedAccUpdateGradParameters = TemporalConvolution.accUpdateGradParameters
diff --git a/TemporalMaxPooling.lua b/TemporalMaxPooling.lua
new file mode 100644
index 0000000..91723e6
--- /dev/null
+++ b/TemporalMaxPooling.lua
@@ -0,0 +1,39 @@
+local TemporalMaxPooling, parent = torch.class('nn.TemporalMaxPooling', 'nn.Module')
+
+-- 1-D max pooling over frames. kW: pooling window in frames;
+-- dW: stride (defaults to kW, i.e. non-overlapping windows).
+function TemporalMaxPooling:__init(kW, dW)
+   parent.__init(self)
+
+   dW = dW or kW
+
+   self.kW = kW
+   self.dW = dW
+end
+
+-- Forward: the C kernel also records argmax positions in self.indices
+-- for use by the backward pass.
+function TemporalMaxPooling:updateOutput(input)
+   self.indices = self.indices or input.new()
+   input.THNN.TemporalMaxPooling_updateOutput(
+       input:cdata(), self.output:cdata(),
+       self.indices:cdata(), self.kW, self.dW
+   )
+   return self.output
+end
+
+-- Backward: routes gradients to the recorded argmax positions; skipped
+-- when gradInput has been set to nil.
+function TemporalMaxPooling:updateGradInput(input, gradOutput)
+    if self.gradInput then
+	input.THNN.TemporalMaxPooling_updateGradInput(
+	    input:cdata(), gradOutput:cdata(),
+	    self.gradInput:cdata(), self.indices:cdata(),
+	    self.kW, self.dW
+	)
+	return self.gradInput
+    end
+end
+
+-- Deprecated alias kept for backward compatibility.
+function TemporalMaxPooling:empty()
+   self:clearState()
+end
+
+-- Release the cached argmax buffer in addition to the base buffers.
+function TemporalMaxPooling:clearState()
+   if self.indices then self.indices:set() end
+   return parent.clearState(self)
+end
diff --git a/TemporalSubSampling.lua b/TemporalSubSampling.lua
new file mode 100644
index 0000000..f7d6b10
--- /dev/null
+++ b/TemporalSubSampling.lua
@@ -0,0 +1,64 @@
+local TemporalSubSampling, parent = torch.class('nn.TemporalSubSampling', 'nn.Module')
+
+-- 1-D subsampling: each output frame is a per-feature weighted sum of kW
+-- input frames plus a bias (one weight and one bias per feature).
+-- inputFrameSize: features per frame; kW: window; dW: stride (default 1).
+function TemporalSubSampling:__init(inputFrameSize, kW, dW)
+   parent.__init(self)
+
+   dW = dW or 1
+
+   self.inputFrameSize = inputFrameSize
+   self.kW = kW
+   self.dW = dW
+
+   -- one scalar weight and bias per input feature
+   self.weight = torch.Tensor(inputFrameSize)
+   self.bias = torch.Tensor(inputFrameSize)
+   self.gradWeight = torch.Tensor(inputFrameSize)
+   self.gradBias = torch.Tensor(inputFrameSize)
+   
+   self:reset()
+end
+
+-- Re-initialize weights uniformly in [-stdv, stdv]; default stdv is
+-- 1/sqrt(kW).
+function TemporalSubSampling:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW)
+   end
+   if nn.oldSeed then
+      -- legacy path: per-element apply reproduces pre-refactor RNG streams
+      self.weight:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+      self.bias:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+   else
+      self.weight:uniform(-stdv, stdv)
+      self.bias:uniform(-stdv, stdv)
+   end
+end
+
+-- Forward pass via the THNN C kernel.
+function TemporalSubSampling:updateOutput(input)
+    input.THNN.TemporalSubSampling_updateOutput(
+	input:cdata(), self.output:cdata(),
+	self.weight:cdata(), self.bias:cdata(),
+	self.kW, self.dW, self.inputFrameSize
+    )
+   return self.output
+end
+
+-- Backward pass; skipped when gradInput has been set to nil.
+function TemporalSubSampling:updateGradInput(input, gradOutput)
+    if self.gradInput then
+	input.THNN.TemporalSubSampling_updateGradInput(
+	    input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+	    self.weight:cdata(), self.kW, self.dW
+	)
+	return self.gradInput
+   end
+end
+
+-- Accumulate weight/bias gradients, scaled by `scale` (default 1).
+function TemporalSubSampling:accGradParameters(input, gradOutput, scale)
+    scale = scale or 1
+    input.THNN.TemporalSubSampling_accGradParameters(
+	input:cdata(), gradOutput:cdata(), self.gradWeight:cdata(),
+	self.gradBias:cdata(), self.kW, self.dW, scale
+    )
+end
diff --git a/Threshold.lua b/Threshold.lua
new file mode 100644
index 0000000..0c22bae
--- /dev/null
+++ b/Threshold.lua
@@ -0,0 +1,50 @@
+local Threshold, parent = torch.class('nn.Threshold','nn.Module')
+
+-- Thresholding: output = input where input > threshold, else `val`.
+-- th: threshold (default 1e-6); v: replacement value (default 0);
+-- ip: operate in-place on the input tensor (default false).
+function Threshold:__init(th,v,ip)
+   parent.__init(self)
+   self.threshold = th or 1e-6
+   self.val = v or 0
+   if (th and type(th) ~= 'number') or (v and type(v) ~= 'number') then
+      error('nn.Threshold(threshold, value)')
+   end
+   -- default for inplace is false
+   self.inplace = ip or false
+   if (ip and type(ip) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
+   self:validateParameters()
+end
+
+-- Forward pass via the THNN C kernel; re-validates first because fields
+-- may have been mutated after construction.
+function Threshold:updateOutput(input)
+   self:validateParameters()
+   input.THNN.Threshold_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.threshold,
+      self.val,
+      self.inplace
+   )
+   return self.output
+end
+
+-- Backward pass: passes gradients only where input exceeded the threshold.
+function Threshold:updateGradInput(input, gradOutput)
+   self:validateParameters()
+   input.THNN.Threshold_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.threshold,
+      self.inplace
+   )
+   return self.gradInput
+end
+
+-- In-place mode overwrites the input, so the backward pass cannot
+-- distinguish replaced values from kept ones unless val <= threshold.
+function Threshold:validateParameters()
+   self.inplace = self.inplace or false -- backwards compatibility pre inplace
+   if self.inplace then
+      if self.val > self.threshold then
+         error('in-place processing requires value (' .. self.val ..
+                  ') not exceed threshold (' .. self.threshold .. ')')
+      end
+   end
+end
diff --git a/Transpose.lua b/Transpose.lua
new file mode 100644
index 0000000..263db60
--- /dev/null
+++ b/Transpose.lua
@@ -0,0 +1,28 @@
+local Transpose, parent = torch.class('nn.Transpose', 'nn.Module')
+
+-- transpose dimensions:
+-- n = nn.Transpose({1,4},{1,3})
+-- will transpose dims 1 and 4, then 1 and 3...
+
+-- Each vararg is a {dimA, dimB} pair; pairs are applied in order.
+function Transpose:__init(...)
+   parent.__init(self)
+   self.permutations = {...}
+end
+
+-- Forward: apply each swap in order, then materialize the result with a
+-- copy (transpose alone only produces a non-contiguous view).
+function Transpose:updateOutput(input)
+   for _,perm in ipairs(self.permutations) do
+      input = input:transpose(perm[1],perm[2])
+   end
+   self.output:resizeAs(input):copy(input)
+   return self.output
+end
+
+-- Backward: undo the permutation by applying the swaps in reverse order
+-- (each swap is its own inverse).
+function Transpose:updateGradInput(input, gradOutput)
+   for i = #self.permutations,1,-1 do
+      local perm = self.permutations[i]
+      gradOutput = gradOutput:transpose(perm[1],perm[2])
+   end
+   self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   return self.gradInput
+end
+
+
diff --git a/Unsqueeze.lua b/Unsqueeze.lua
new file mode 100644
index 0000000..2e82a25
--- /dev/null
+++ b/Unsqueeze.lua
@@ -0,0 +1,52 @@
+local Unsqueeze, parent = torch.class('nn.Unsqueeze', 'nn.Module')
+
+-- Guard: this module only operates on tensors, not tables.
+local function _assertTensor(t)
+   assert(torch.isTensor(t), "This module only works on tensor")
+end
+
+-- Inserts a singleton dimension at position `pos` (required).
+-- numInputDims: dim count of one sample, used to offset `pos` in batch mode.
+function Unsqueeze:__init(pos, numInputDims)
+   parent.__init(self)
+   self.pos = pos or error('the position to insert singleton dim not specified')
+   self:setNumInputDims(numInputDims)
+end
+
+-- Declare how many dims one (non-batch) sample has; returns self for
+-- chaining.
+function Unsqueeze:setNumInputDims(numInputDims)
+   self.numInputDims = numInputDims
+   return self
+end
+
+-- Forward: view the input with an extra size-1 dimension (no copy).
+function Unsqueeze:updateOutput(input)
+   _assertTensor(input)
+   local actualPos = self:_getActualPosition(input)
+   nn.utils.addSingletonDimension(self.output, input, actualPos)
+   return self.output
+end
+
+-- Backward: a singleton insert preserves element count, so the gradient
+-- is simply gradOutput viewed back at the input's size.
+function Unsqueeze:updateGradInput(input, gradOutput)
+   _assertTensor(input)
+   _assertTensor(gradOutput)
+   assert(input:nElement() == gradOutput:nElement())
+
+   self.gradInput:view(gradOutput, input:size())
+   return self.gradInput
+end
+
+function Unsqueeze:__tostring__()
+   return torch.type(self)..'(dim ' .. self.pos .. ')'
+end
+
+-- Compute the insert position adjusted for any leading batch dimensions.
+function Unsqueeze:_getActualPosition(input)
+   -- get valid dimesion offset for batchMode (if any)
+   local inputDim = input:dim() -- data batch dim
+   self.numInputDims = self.numInputDims or inputDim -- feature map dim
+   local offsetDim = inputDim - self.numInputDims
+   assert(offsetDim >= 0, "input feature map dim (numInputDims) must be <= input:dim()")
+
+   -- the actual position; clearer error message for batchMode (if any)
+   local actualPos = self.pos + offsetDim
+   assert(actualPos >= 1 and actualPos <= (inputDim + 1),
+      ("Invalid position: %d. input:dim() is %d, input feature map dim (numInputDims) is %d.")
+      :format(self.pos, inputDim, self.numInputDims)
+   )
+   return actualPos
+end
diff --git a/View.lua b/View.lua
new file mode 100644
index 0000000..542e57e
--- /dev/null
+++ b/View.lua
@@ -0,0 +1,96 @@
+local View, parent = torch.class('nn.View', 'nn.Module')
+
+-- Set the target size, either from a single LongStorage or from a list
+-- of numbers. At most one dimension may be -1 (inferred). Also caches
+-- numElements, the product of the non-inferred sizes.
+function View:resetSize(...)
+   if select('#', ...) == 1 and torch.typename(select(1, ...)) == 'torch.LongStorage' then
+      self.size = select(1, ...)
+   else
+      self.size = torch.LongStorage({...})
+   end
+
+   self.numElements = 1
+   local inferdim = false
+   for i = 1,#self.size do
+      local szi = self.size[i]
+      if szi >= 0 then
+         self.numElements = self.numElements * self.size[i]
+      else
+         assert(szi == -1, 'size should be positive or -1')
+         assert(not inferdim, 'only one dimension can be at -1')
+         inferdim = true
+      end
+   end
+
+   return self
+end
+
+-- Reshape the input to the given size, preserving any leading batch
+-- dimension(s) when they can be detected.
+function View:__init(...)
+   parent.__init(self)
+   self:resetSize(...)
+   self.numInputDims = nil
+end
+
+-- Declare how many dims one (non-batch) sample has; returns self for
+-- chaining.
+function View:setNumInputDims(numInputDims)
+   self.numInputDims = numInputDims
+   return self
+end
+
+-- Infer the batch size for this input, or return nil when the input
+-- should be viewed without a batch dimension. `maxdim` trailing dims are
+-- treated as the sample; anything before them is batch.
+local function batchsize(input, size, numInputDims, numElements)
+   local ind = input:nDimension()
+   local isz = input:size()
+   local maxdim = numInputDims and numInputDims or ind
+   -- element count of the trailing `maxdim` dims (one sample)
+   local ine = 1
+   for i=ind,ind-maxdim+1,-1 do
+      ine = ine * isz[i]
+   end
+
+   if ine % numElements ~= 0 then
+      error(string.format(
+               'input view (%s) and desired view (%s) do not match',
+               table.concat(input:size():totable(), 'x'),
+               table.concat(size:totable(), 'x')))
+   end
+
+   -- the remainder is either the batch...
+   local bsz = ine / numElements
+
+   -- ... or the missing size dim
+   for i=1,size:size() do
+      if size[i] == -1 then
+         bsz = 1
+         break
+      end
+   end
+
+   -- for dim over maxdim, it is definitively the batch
+   for i=ind-maxdim,1,-1 do
+      bsz = bsz * isz[i]
+   end
+
+   -- special card
+   if bsz == 1 and (not numInputDims or input:nDimension() <= numInputDims) then
+      return
+   end
+
+   return bsz
+end
+
+-- Forward: view with an explicit batch dimension when one was inferred,
+-- otherwise view at exactly self.size. view() requires contiguous input.
+function View:updateOutput(input)
+   self.output = self.output or input.new()
+   local bsz = batchsize(input, self.size, self.numInputDims, self.numElements)
+   if bsz then
+      self.output:view(input, bsz, table.unpack(self.size:totable()))
+   else
+      self.output:view(input, self.size)
+   end
+   return self.output
+end
+
+-- Backward: a view preserves element count, so the gradient is just
+-- gradOutput viewed back at the input's size.
+function View:updateGradInput(input, gradOutput)
+   self.gradInput = self.gradInput or gradOutput.new()
+   self.gradInput:view(gradOutput, input:size())
+   return self.gradInput
+end
+
+function View:__tostring__()
+   return torch.type(self)..'('..table.concat(self.size:totable(), ', ')..')'
+end
diff --git a/VolumetricAveragePooling.lua b/VolumetricAveragePooling.lua
new file mode 100644
index 0000000..df6d2c4
--- /dev/null
+++ b/VolumetricAveragePooling.lua
@@ -0,0 +1,54 @@
local VolumetricAveragePooling, parent = torch.class(
   'nn.VolumetricAveragePooling', 'nn.Module')

--- 3D average pooling over (time, height, width).
-- kT,kW,kH: kernel size. dT,dW,dH: stride; each defaults to the
-- matching kernel size, giving non-overlapping windows.
function VolumetricAveragePooling:__init(kT, kW, kH, dT, dW, dH)
   parent.__init(self)

   self.kT, self.kW, self.kH = kT, kW, kH
   self.dT = dT or kT
   self.dW = dW or kW
   self.dH = dH or kH
end
+
--- Forward pass: delegates to the THNN backend.
-- The (kT,kW,kH then dT,dW,dH) argument order is fixed by the C API.
function VolumetricAveragePooling:updateOutput(input)
   input.THNN.VolumetricAveragePooling_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.kT, self.kW, self.kH,
      self.dT, self.dW, self.dH
   )
   return self.output
end
+
--- Backward pass: distributes gradOutput evenly over each pooling
-- window via the THNN backend.
function VolumetricAveragePooling:updateGradInput(input, gradOutput)
   input.THNN.VolumetricAveragePooling_updateGradInput(
      input:cdata(),
      gradOutput:cdata(),
      self.gradInput:cdata(),
      self.kT, self.kW, self.kH,
      self.dT, self.dW, self.dH
   )
   return self.gradInput
end
+
--- Deprecated alias kept for backward compatibility; use clearState().
function VolumetricAveragePooling:empty()
   return self:clearState()
end
+
--- e.g. "nn.VolumetricAveragePooling(2x2x2, 2,2,2)".
-- NOTE(review): __init never sets padT/padW/padH, so the padding
-- branch only fires for modules deserialized with those fields set.
function VolumetricAveragePooling:__tostring__()
   local s =  string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
                            self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
   if (self.padT or self.padW or self.padH) and
      (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
      s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
   end
   s = s .. ')'

   return s
end
diff --git a/VolumetricBatchNormalization.lua b/VolumetricBatchNormalization.lua
new file mode 100644
index 0000000..6168a92
--- /dev/null
+++ b/VolumetricBatchNormalization.lua
@@ -0,0 +1,4 @@
-- Batch normalization over 5D volumetric input; all logic lives in the
-- parent nn.BatchNormalization, parameterized by nDim.
local BN, parent = torch.class('nn.VolumetricBatchNormalization', 'nn.BatchNormalization')

-- expected dimension of input: (batch, feature, time, height, width)
BN.nDim = 5
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
new file mode 100644
index 0000000..e40c90a
--- /dev/null
+++ b/VolumetricConvolution.lua
@@ -0,0 +1,195 @@
local VolumetricConvolution, parent = torch.class('nn.VolumetricConvolution', 'nn.Module')

--- 3D convolution over a (time, height, width) volume.
-- nInputPlane/nOutputPlane: channel counts. kT,kW,kH: kernel size.
-- dT,dW,dH: stride, default 1. Padding defaults cascade:
-- padT defaults to 0, padW to padT, padH to padW.
function VolumetricConvolution:__init(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH)
   parent.__init(self)

   self.nInputPlane = nInputPlane
   self.nOutputPlane = nOutputPlane
   self.kT, self.kW, self.kH = kT, kW, kH
   self.dT = dT or 1
   self.dW = dW or 1
   self.dH = dH or 1
   self.padT = padT or 0
   self.padW = padW or self.padT
   self.padH = padH or self.padW

   -- weight layout expected by THNN: (outPlane, inPlane, kT, kH, kW)
   self.weight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW)
   self.bias = torch.Tensor(nOutputPlane)
   self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW)
   self.gradBias = torch.Tensor(nOutputPlane)
   self:reset()
end
+
--- Re-initialize weight and bias uniformly in [-stdv, stdv].
-- Without an explicit stdv, uses 1/sqrt(fan-in).
function VolumetricConvolution:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1 / math.sqrt(self.kT * self.kW * self.kH * self.nInputPlane)
   end
   if nn.oldSeed then
      -- legacy RNG path: draw one value at a time for reproducibility
      local draw = function() return torch.uniform(-stdv, stdv) end
      self.weight:apply(draw)
      self.bias:apply(draw)
   else
      self.weight:uniform(-stdv, stdv)
      self.bias:uniform(-stdv, stdv)
   end
end
+
-- Ensure input (and optionally gradOutput) are contiguous, copying into
-- cached scratch tensors only when needed. Returns the usable tensors.
local function makeContiguous(self, input, gradOutput)
   if not input:isContiguous() then
      self._input = self._input or input.new()
      self._input:resizeAs(input):copy(input)
      input = self._input
   end
   if gradOutput and not gradOutput:isContiguous() then
      self._gradOutput = self._gradOutput or gradOutput.new()
      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
      gradOutput = self._gradOutput
   end
   return input, gradOutput
end
+
+-- function to re-view the weight layout in a way that would make the MM ops happy
+local function viewWeight(self)
+   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW)
+   if self.gradWeight and self.gradWeight:dim() > 0 then
+      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW)
+   end
+end
+
-- Restore the canonical 5D weight layout after an MM-based kernel call.
local function unviewWeight(self)
   local o, i = self.nOutputPlane, self.nInputPlane
   self.weight = self.weight:view(o, i, self.kT, self.kH, self.kW)
   if self.gradWeight and self.gradWeight:dim() > 0 then
      self.gradWeight = self.gradWeight:view(o, i, self.kT, self.kH, self.kW)
   end
end
+
--- Forward pass. CUDA tensors use the direct THNN kernel; CPU tensors
-- use the matrix-multiply kernel, which needs the weight flattened to
-- 2D and a contiguous input.
function VolumetricConvolution:updateOutput(input)
   -- scratch buffers reused across calls by the THNN kernels
   self.finput = self.finput or input.new()
   self.fgradInput = self.fgradInput or input.new()
   if input:type() == 'torch.CudaTensor' then
      input.THNN.VolumetricConvolution_updateOutput(
        input:cdata(),
        self.output:cdata(),
        self.weight:cdata(),
        self.bias:cdata(),
        self.finput:cdata(),
        self.fgradInput:cdata(),
        self.dT, self.dW, self.dH,
        self.padT, self.padW, self.padH
      )
   else
      viewWeight(self)
      input = makeContiguous(self, input)
      input.THNN.VolumetricConvolutionMM_updateOutput(
         input:cdata(),
         self.output:cdata(),
         self.weight:cdata(),
         self.bias:cdata(),
         self.finput:cdata(),
         self.kT, self.kW, self.kH,
         self.dT, self.dW, self.dH,
         self.padT, self.padW, self.padH
      )
      -- restore the 5D weight layout for external consumers
      unviewWeight(self)
   end
   return self.output
end
+
--- Backward pass w.r.t. the input. Mirrors updateOutput's CUDA/CPU
-- split. Returns nil on the CPU path when self.gradInput is disabled.
function VolumetricConvolution:updateGradInput(input, gradOutput)
   if input:type() == 'torch.CudaTensor' then
      input.THNN.VolumetricConvolution_updateGradInput(
         input:cdata(),
         gradOutput:cdata(),
         self.gradInput:cdata(),
         self.weight:cdata(),
         self.finput:cdata(),
         self.dT, self.dW, self.dH,
         self.padT, self.padW, self.padH
      )
      return self.gradInput
   else
      if self.gradInput then
         -- MM kernel needs flattened weight and contiguous tensors
         viewWeight(self)
         input, gradOutput = makeContiguous(self, input, gradOutput)
         input.THNN.VolumetricConvolutionMM_updateGradInput(
            input:cdata(),
            gradOutput:cdata(),
            self.gradInput:cdata(),
            self.weight:cdata(),
            self.finput:cdata(),
            self.fgradInput:cdata(),
            self.kT, self.kW, self.kH,
            self.dT, self.dW, self.dH,
            self.padT, self.padW, self.padH
         )
         unviewWeight(self)
         return self.gradInput
      end
   end
end
+
--- Accumulate parameter gradients, scaled by `scale` (default 1).
-- Same CUDA/CPU dispatch as updateOutput; the CPU MM kernel takes
-- fewer arguments (no fgradInput, no geometry).
function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
   if input:type() == 'torch.CudaTensor' then
      input.THNN.VolumetricConvolution_accGradParameters(
         input:cdata(),
         gradOutput:cdata(),
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.finput:cdata(),
         self.fgradInput:cdata(),
         self.dT, self.dW, self.dH,
         self.padT, self.padW, self.padH,
         scale or 1
      )
   else
      input, gradOutput = makeContiguous(self, input, gradOutput)
      viewWeight(self)
      input.THNN.VolumetricConvolutionMM_accGradParameters(
         input:cdata(),
         gradOutput:cdata(),
         self.gradWeight:cdata(),
         self.gradBias:cdata(),
         self.finput:cdata(),
         scale or 1
      )
      unviewWeight(self)
   end
end
+
--- Convert the module to another tensor type. The conv scratch buffers
-- are reset first so they get re-created with the new type.
function VolumetricConvolution:type(type, tensorCache)
   local finput, fgradInput = self.finput, self.fgradInput
   if finput then finput:set() end
   if fgradInput then fgradInput:set() end
   return parent.type(self, type, tensorCache)
end
+
--- Release cached intermediate tensors (e.g. before serialization).
function VolumetricConvolution:clearState()
   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
   return parent.clearState(self)
end
+
--- e.g. "nn.VolumetricConvolution(3 -> 16, 3x3x3, 2,2,2, 1,1,1)":
-- planes, kernel, then stride and padding only when non-default.
-- The `self.padT or ...` guard protects deserialized legacy modules
-- that may lack the padding fields entirely.
function VolumetricConvolution:__tostring__()
   local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
         self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
   if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1 or
      self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
     s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
   end
   if (self.padT or self.padW or self.padH) and
      (self.padT ~=0 or self.padW ~= 0 or self.padH ~= 0) then
     s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
   end
   return s .. ')'
end
diff --git a/VolumetricDropout.lua b/VolumetricDropout.lua
new file mode 100644
index 0000000..5f495af
--- /dev/null
+++ b/VolumetricDropout.lua
@@ -0,0 +1,54 @@
local VolumetricDropout, Parent = torch.class('nn.VolumetricDropout', 'nn.Module')

--- Volumetric (feature-map-wise) dropout: zeroes entire 3D feature
-- maps rather than individual elements.
-- p: probability of dropping a feature map (default 0.5).
function VolumetricDropout:__init(p)
   Parent.__init(self)
   self.p = p or 0.5
   self.train = true
   -- per-feature-map Bernoulli mask, regenerated every forward pass
   self.noise = torch.Tensor()
end
+
--- Forward pass. In training, draws one Bernoulli(1-p) value per
-- feature map and multiplies it across the whole map; in evaluation,
-- scales by (1-p) instead (classic, non-inverted dropout).
function VolumetricDropout:updateOutput(input)
   self.output:resizeAs(input):copy(input)
   if self.train then
      -- mask shape: one value per (batch,) feature map, broadcast over t,h,w
      if input:dim() == 5 then
        self.noise:resize(input:size(1), input:size(2), 1, 1, 1)
      elseif input:dim() == 4 then
        self.noise:resize(input:size(1), 1, 1, 1)
      else
        error('Input must be 5D (nbatch, nfeat, t, h, w) or 4D (nfeat, t, h, w)')
      end
      self.noise:bernoulli(1-self.p)
      -- We expand the random dropouts to the entire feature map because the
      -- features are likely correlated accross the map and so the dropout
      -- should also be correlated.
      self.output:cmul(torch.expandAs(self.noise, input))
   else
      self.output:mul(1-self.p)
   end
   return self.output
end
+
--- Backward pass: re-applies the forward noise mask to the gradients.
-- Only valid in training mode (the eval path has no mask to apply).
function VolumetricDropout:updateGradInput(input, gradOutput)
   if self.train then
      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
      self.gradInput:cmul(torch.expandAs(self.noise, input)) -- simply mask the gradients with the noise vector
   else
      error('backprop only defined while training')
   end
   return self.gradInput
end
+
--- Change the dropout probability after construction.
function VolumetricDropout:setp(p)
   self.p = p
end
+
--- e.g. "nn.VolumetricDropout(0.500000)".
function VolumetricDropout:__tostring__()
  return torch.type(self) .. string.format('(%f)', self.p)
end
+
--- Release the cached noise mask (e.g. before serialization).
function VolumetricDropout:clearState()
  local noise = self.noise
  if noise then
    noise:set()
  end
  return Parent.clearState(self)
end
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
new file mode 100644
index 0000000..3c86a14
--- /dev/null
+++ b/VolumetricFullConvolution.lua
@@ -0,0 +1,236 @@
local VolumetricFullConvolution, parent = torch.class('nn.VolumetricFullConvolution','nn.Module')

--- Transposed ("full") 3D convolution.
-- adjT/adjW/adjH add extra rows/columns/frames to one side of the
-- output, disambiguating the output size when stride > 1.
function VolumetricFullConvolution:__init(nInputPlane, nOutputPlane,
                                          kT, kW, kH,         -- kernel size
                                          dT, dW, dH,         -- stride
                                          padT, padW, padH,   -- padding
                                          adjT, adjW, adjH)   -- extra output adjustment
   parent.__init(self)

   dW = dW or 1
   dH = dH or 1
   dT = dT or 1

   self.nInputPlane = nInputPlane
   self.nOutputPlane = nOutputPlane
   self.kW = kW
   self.kH = kH
   self.kT = kT
   self.dW = dW
   self.dH = dH
   self.dT = dT
   self.padW = padW or 0
   self.padH = padH or 0
   self.padT = padT or 0
   self.adjW = adjW or 0
   self.adjH = adjH or 0
   self.adjT = adjT or 0

   -- an adjustment >= stride would be equivalent to a different geometry
   if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 or self.adjT > self.dT - 1 then
      error('adjW, adjH and adjT must be smaller than self.dW - 1,' ..
            ' self.dH - 1 and self.dT - 1 respectively')
   end

   -- note: weight is (inPlane, outPlane, ...) — transposed vs. regular conv
   self.weight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW)
   self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW)
   self.bias = torch.Tensor(self.nOutputPlane)
   self.gradBias = torch.Tensor(self.nOutputPlane)

   self.ones = torch.Tensor()
   self.finput = torch.Tensor()
   self.fgradInput = torch.Tensor()

   self:reset()
end
+
--- Re-initialize weight and bias uniformly in [-stdv, stdv];
-- the default stdv is 1/sqrt(fan-in).
function VolumetricFullConvolution:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      local fanIn = self.kW * self.kH * self.kT * self.nInputPlane
      stdv = 1 / math.sqrt(fanIn)
   end
   self.weight:uniform(-stdv, stdv)
   self.bias:uniform(-stdv, stdv)
end
+
-- Ensure input (and optionally gradOutput) are contiguous, copying into
-- cached scratch tensors only when needed. Returns the usable tensors.
local function makeContiguous(self, input, gradOutput)
   if not input:isContiguous() then
      self._input = self._input or input.new()
      self._input:resizeAs(input):copy(input)
      input = self._input
   end
   if gradOutput and not gradOutput:isContiguous() then
      self._gradOutput = self._gradOutput or gradOutput.new()
      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
      gradOutput = self._gradOutput
   end
   return input, gradOutput
end
+
-- Output-size adjustment needed for a transposed convolution to produce
-- exactly `targetSize`: the remainder lost by the forward convolution's
-- floor division.
local function calculateAdj(targetSize, ker, pad, stride)
  return (targetSize + 2 * pad - ker) % stride
end
+
--- Migrate fields of modules serialized by an older nn version.
-- Called at the top of every forward/backward, so old checkpoints keep
-- working after a plain torch.load.
function VolumetricFullConvolution:backCompatibility()
   -- Transpose the weight when loading from an old version
   if not self.adjW then
      self.weight = self.weight:transpose(1, 2):contiguous()
   end

   -- Rename the padding when loading from an old version (pW/pH/pT -> padW/padH/padT)
   self.padW = self.padW or self.pW
   self.padH = self.padH or self.pH
   self.padT = self.padT or self.pT

   self.adjW = self.adjW or 0
   self.adjH = self.adjH or 0
   self.adjT = self.adjT or 0
end
+
--- Forward pass. `input` may be a tensor, or a table {input, target}
-- where the target tensor's trailing (t,h,w) sizes determine the
-- output size (adj factors are then derived automatically).
function VolumetricFullConvolution:updateOutput(input)
   self:backCompatibility()

  local inputTensor = input
  local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH

  -- The input can be a table where the second element indicates the target
  -- output size, in which case the adj factors are computed automatically
  if type(inputTensor) == 'table' then
    inputTensor = input[1]
    local targetTensor = input[2]
    local tDims = targetTensor:dim()
    -- last three dims of the target are (time, height, width)
    local tT = targetTensor:size(tDims-2)
    local tH = targetTensor:size(tDims-1)
    local tW = targetTensor:size(tDims)
    adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
    adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
    adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
  end

   inputTensor = makeContiguous(self, inputTensor)
   inputTensor.THNN.VolumetricFullConvolution_updateOutput(
      inputTensor:cdata(),
      self.output:cdata(),
      self.weight:cdata(),
      self.bias:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.dT, self.dW, self.dH,
      self.padT, self.padW, self.padH,
      adjT, adjW, adjH
   )

   return self.output
end
+
--- Backward pass w.r.t. input. For table input {input, target}, returns
-- a table gradInput whose second element is an (expanded) zero tensor,
-- since the target only sets geometry and receives no gradient.
function VolumetricFullConvolution:updateGradInput(input, gradOutput)
   self:backCompatibility()

    local inputTensor = input
    local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH

    -- The input can be a table where the second element indicates the target
    -- output size, in which case the adj factors are computed automatically
    if type(inputTensor) == 'table' then
      inputTensor = input[1]
      local targetTensor = input[2]
      local tDims = targetTensor:dim()
      local tT = targetTensor:size(tDims-2)
      local tH = targetTensor:size(tDims-1)
      local tW = targetTensor:size(tDims)
      adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
      adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
      adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
      -- Momentarily extract the gradInput tensor (re-wrapped below)
      if type(self.gradInput) == 'table' then
        self.gradInput = self.gradInput[1]
      end
    end

   inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
   inputTensor.THNN.VolumetricFullConvolution_updateGradInput(
      inputTensor:cdata(),
      gradOutput:cdata(),
      self.gradInput:cdata(),
      self.weight:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.dT, self.dW, self.dH,
      self.padT, self.padW, self.padH,
      adjT, adjW, adjH
   )

    if type(input) == 'table' then
     -- Create a zero tensor to be expanded and used as gradInput[2].
      self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
      self.ones:resize(input[2]:dim()):fill(1)
      local zeroTensor =  self.zeroScalar
          :view(table.unpack(self.ones:totable()))
          :expandAs(input[2])
      self.gradInput = {self.gradInput, zeroTensor}
    end

   return self.gradInput
end
+
--- Accumulate parameter gradients, scaled by `scale` (default 1).
-- Handles the same {input, target} table form as updateOutput.
function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
   self:backCompatibility()

  local inputTensor = input
  local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH

  -- The input can be a table where the second element indicates the target
  -- output size, in which case the adj factors are computed automatically
  if type(inputTensor) == 'table' then
    inputTensor = input[1]
    local targetTensor = input[2]
    local tDims = targetTensor:dim()
    local tT = targetTensor:size(tDims-2)
    local tH = targetTensor:size(tDims-1)
    local tW = targetTensor:size(tDims)
    adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
    adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
    adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
  end

   inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
   inputTensor.THNN.VolumetricFullConvolution_accGradParameters(
      inputTensor:cdata(),
      gradOutput:cdata(),
      self.gradWeight:cdata(),
      self.gradBias:cdata(),
      self.finput:cdata(),
      self.fgradInput:cdata(),
      self.dT, self.dW, self.dH,
      self.padT, self.padW, self.padH,
      adjT, adjW, adjH,
      scale or 1
   )
end
+
--- Type conversion; replace the scratch buffers with fresh tensors so
-- they are allocated in the new type on the next forward pass.
function VolumetricFullConvolution:type(type, tensorCache)
   self.finput = torch.Tensor()
   self.fgradInput = torch.Tensor()
   return parent.type(self, type, tensorCache)
end
+
--- e.g. "nn.VolumetricFullConvolution(16 -> 3, 3x3x3, 2,2,2, 1,1,1)":
-- planes, kernel, then stride/padding/adjustment only when non-default.
function VolumetricFullConvolution:__tostring__()
   local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
   self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
   if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1 or self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
      s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
   end
   -- the `or` guard protects legacy deserialized modules lacking the fields
   if (self.padT or self.padW or self.padH) and (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
      s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
   end
   if (self.adjT or self.adjW or self.adjH) and (self.adjT ~= 0 or self.adjW ~= 0 or self.adjH ~= 0) then
      s = s .. ', ' .. self.adjT .. ',' .. self.adjW .. ',' .. self.adjH
   end
   return s .. ')'
end
diff --git a/VolumetricMaxPooling.lua b/VolumetricMaxPooling.lua
new file mode 100644
index 0000000..fd65231
--- /dev/null
+++ b/VolumetricMaxPooling.lua
@@ -0,0 +1,95 @@
local VolumetricMaxPooling, parent = torch.class('nn.VolumetricMaxPooling', 'nn.Module')

-- serialization version; see :read() for the v1 -> v2 migration
VolumetricMaxPooling.__version = 2

--- 3D max pooling over (time, height, width).
-- kT,kW,kH: kernel size; dT,dW,dH: stride (defaults to the kernel
-- size, i.e. non-overlapping); padT,padW,padH: zero padding (default 0).
function VolumetricMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH)
   parent.__init(self)

   dT = dT or kT
   dW = dW or kW
   dH = dH or kH

   self.kT = kT
   self.kH = kH
   self.kW = kW
   self.dT = dT
   self.dW = dW
   self.dH = dH

   self.padT = padT or 0
   self.padW = padW or 0
   self.padH = padH or 0


   -- floor output-size rounding by default; see :ceil()/:floor()
   self.ceil_mode = false
   -- argmax positions recorded by the forward pass, used in backward
   self.indices = torch.Tensor()
end
+
--- Round output sizes up (ceil) instead of down. Returns self for chaining.
function VolumetricMaxPooling:ceil()
   self.ceil_mode = true
   return self
end
+
--- Round output sizes down (floor, the default). Returns self for chaining.
function VolumetricMaxPooling:floor()
   self.ceil_mode = false
   return self
end
+
--- Forward pass. Records the input's trailing (t,h,w) sizes for any
-- paired VolumetricMaxUnpooling, and the argmax indices for backward.
function VolumetricMaxPooling:updateOutput(input)
   local dims = input:dim()
   -- last three dims are (time, height, width), batched or not
   self.itime = input:size(dims-2)
   self.iheight = input:size(dims-1)
   self.iwidth = input:size(dims)

   self.indices = self.indices or input.new()
   input.THNN.VolumetricMaxPooling_updateOutput(
      input:cdata(),
      self.output:cdata(),
      self.indices:cdata(),
      self.kT, self.kW, self.kH,
      self.dT, self.dW, self.dH,
      self.padT, self.padW, self.padH,
      self.ceil_mode
   )
   return self.output
end
+
--- Backward pass: routes each gradOutput element back to the argmax
-- position recorded by the forward pass.
function VolumetricMaxPooling:updateGradInput(input, gradOutput)
   input.THNN.VolumetricMaxPooling_updateGradInput(
      input:cdata(),
      gradOutput:cdata(),
      self.gradInput:cdata(),
      self.indices:cdata(),
      self.dT, self.dW, self.dH,
      self.padT, self.padW, self.padH
   )
   return self.gradInput
end
+
--- Deprecated alias for clearState(), kept for backward compatibility.
function VolumetricMaxPooling:empty()
   self:clearState()
end
+
--- Release the cached argmax indices, then clear the base-class state.
function VolumetricMaxPooling:clearState()
   if self.indices then self.indices:set() end
   return parent.clearState(self)
end
+
--- Deserialization hook: modules saved before __version 2 predate
-- ceil_mode, so default it to the old (floor) behavior.
function VolumetricMaxPooling:read(file, version)
   parent.read(self, file)
   if version < 2 then
      self.ceil_mode = false
   end
end
+
--- e.g. "nn.VolumetricMaxPooling(2x2x2, 2,2,2, 1,1,1)": kernel, stride,
-- and padding only when any pad is non-zero.
function VolumetricMaxPooling:__tostring__()
   local s =  string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
                            self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
   if (self.padT or self.padW or self.padH) and
      (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
      s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
   end
   s = s .. ')'

   return s
end
diff --git a/VolumetricMaxUnpooling.lua b/VolumetricMaxUnpooling.lua
new file mode 100644
index 0000000..6291f5b
--- /dev/null
+++ b/VolumetricMaxUnpooling.lua
@@ -0,0 +1,56 @@
local VolumetricMaxUnpooling, parent = torch.class('nn.VolumetricMaxUnpooling', 'nn.Module')

--- Inverse of nn.VolumetricMaxPooling: scatters values back to the
-- argmax positions recorded by the paired pooling module.
-- The pooling must use kernel == stride (non-overlapping windows) so
-- the mapping is unambiguous.
function VolumetricMaxUnpooling:__init(poolingModule)
  parent.__init(self)
  assert(torch.type(poolingModule)=='nn.VolumetricMaxPooling', 'Argument must be a nn.VolumetricMaxPooling module')
  assert(poolingModule.kT==poolingModule.dT and poolingModule.kH==poolingModule.dH and poolingModule.kW==poolingModule.dW, "The size of pooling module's kernel must be equal to its stride")
  self.pooling = poolingModule
end
+
--- Mirror geometry and argmax indices from the paired pooling module.
-- Called before every forward/backward, since the pooling's recorded
-- input sizes change with each input it sees.
function VolumetricMaxUnpooling:setParams()
  local pool = self.pooling
  self.indices = pool.indices
  -- our output geometry is the pooling module's input geometry
  self.otime   = pool.itime
  self.oheight = pool.iheight
  self.owidth  = pool.iwidth
  for _, k in ipairs{'dT', 'dH', 'dW', 'padT', 'padH', 'padW'} do
    self[k] = pool[k]
  end
end
+
--- Forward pass: place each input value at its recorded argmax
-- position; everything else in the output is zero.
function VolumetricMaxUnpooling:updateOutput(input)
  self:setParams()
  input.THNN.VolumetricMaxUnpooling_updateOutput(
     input:cdata(),
     self.output:cdata(),
     self.indices:cdata(),
     self.otime, self.owidth, self.oheight,
     self.dT, self.dW, self.dH,
     self.padT, self.padW, self.padH
  )
  return self.output
end
+
--- Backward pass: gathers gradients from the scattered positions.
function VolumetricMaxUnpooling:updateGradInput(input, gradOutput)
  self:setParams()
  input.THNN.VolumetricMaxUnpooling_updateGradInput(
     input:cdata(),
     gradOutput:cdata(),
     self.gradInput:cdata(),
     self.indices:cdata(),
     self.otime, self.owidth, self.oheight,
     self.dT, self.dW, self.dH,
     self.padT, self.padW, self.padH
  )
  return self.gradInput
end
+
--- Deprecated alias for clearState(), kept for backward compatibility.
function VolumetricMaxUnpooling:empty()
   self:clearState()
end
+
--- Describes this module together with the pooling module it mirrors.
function VolumetricMaxUnpooling:__tostring__()
   return string.format('nn.VolumetricMaxUnpooling associated to %s',
                        tostring(self.pooling))
end
diff --git a/WeightedEuclidean.lua b/WeightedEuclidean.lua
new file mode 100644
index 0000000..606510c
--- /dev/null
+++ b/WeightedEuclidean.lua
@@ -0,0 +1,244 @@
local WeightedEuclidean, parent = torch.class('nn.WeightedEuclidean', 'nn.Module')

--- Computes y_j = || c_j * (w_j - x) ||: the Euclidean distance of the
-- input to each of `outputSize` templates, weighted per-dimension by a
-- learned diagonal covariance.
function WeightedEuclidean:__init(inputSize,outputSize)
   parent.__init(self)

   self.weight = torch.Tensor(inputSize,outputSize)
   self.gradWeight = torch.Tensor(inputSize,outputSize)

   -- each template (output dim) has its own diagonal covariance matrix
   self.diagCov = torch.Tensor(inputSize,outputSize)
   self.gradDiagCov = torch.Tensor(inputSize,outputSize)

   self:reset()
end
+
--- Re-initialize templates uniformly in [-stdv, stdv] (default stdv is
-- 1/sqrt(inputSize)) and reset all diagonal covariances to 1.
function WeightedEuclidean:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1 / math.sqrt(self.weight:size(1))
   end
   self.weight:uniform(-stdv, stdv)
   self.diagCov:fill(1)
end
+
-- Present `src` through `res` with the given sizes: a zero-copy view
-- when src is contiguous, otherwise an explicit (copying) reshape.
local function view(res, src, ...)
   if src:isContiguous() then
      res:view(src, ...)
   else
      res:reshape(src, ...)
   end
end
+
--- Forward pass: y_j = || c_j * (w_j - x) || for each template j.
-- Supports 1D (single sample) and 2D (batch) input.
function WeightedEuclidean:updateOutput(input)
   -- lazy-initialize scratch buffers (reused across calls)
   self._diagCov = self._diagCov or self.output.new()

   self._input = self._input or input.new()
   self._weight = self._weight or self.weight.new()
   self._expand = self._expand or self.output.new()
   -- FIX: was `self._expand or ...`, which aliased _expand2 to _expand;
   -- each buffer must be lazily initialized from its own field.
   self._expand2 = self._expand2 or self.output.new()
   self._expand3 = self._expand3 or self.output.new()
   self._repeat = self._repeat or self.output.new()
   self._repeat2 = self._repeat2 or self.output.new()
   self._repeat3 = self._repeat3 or self.output.new()

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   -- y_j = || c_j * (w_j - x) ||
   if input:dim() == 1 then
      view(self._input, input, inputSize, 1)
      self._expand:expandAs(self._input, self.weight)
      self._repeat:resizeAs(self._expand):copy(self._expand)
      self._repeat:add(-1, self.weight)
      self._repeat:cmul(self.diagCov)
      self.output:norm(self._repeat, 2, 1)
      self.output:resize(outputSize)
   elseif input:dim() == 2 then
      local batchSize = input:size(1)

      view(self._input, input, batchSize, inputSize, 1)
      self._expand:expand(self._input, batchSize, inputSize, outputSize)
      -- make the expanded tensor contiguous (requires lots of memory)
      self._repeat:resizeAs(self._expand):copy(self._expand)

      self._weight:view(self.weight, 1, inputSize, outputSize)
      self._expand2:expandAs(self._weight, self._repeat)

      self._diagCov:view(self.diagCov, 1, inputSize, outputSize)
      self._expand3:expandAs(self._diagCov, self._repeat)
      if torch.type(input) == 'torch.CudaTensor' then
         -- requires lots of memory, but minimizes cudaMallocs and loops
         self._repeat2:resizeAs(self._expand2):copy(self._expand2)
         self._repeat:add(-1, self._repeat2)
         self._repeat3:resizeAs(self._expand3):copy(self._expand3)
         self._repeat:cmul(self._repeat3)
      else
         self._repeat:add(-1, self._expand2)
         self._repeat:cmul(self._expand3)
      end

      self.output:norm(self._repeat, 2, 2)
      self.output:resize(batchSize, outputSize)
   else
      error"1D or 2D input expected"
   end
   return self.output
end
+
--- Backward pass w.r.t. the input.
-- Relies on buffers (_repeat, _repeat3, _expand3) left behind by
-- updateOutput, so statement order matters throughout.
function WeightedEuclidean:updateGradInput(input, gradOutput)
   if not self.gradInput then
      return
   end

   self._div = self._div or input.new()
   self._output = self._output or self.output.new()
   self._expand4 = self._expand4 or input.new()
   self._gradOutput = self._gradOutput or input.new()

   if not self.fastBackward then
      self:updateOutput(input)
   end

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   --[[
   dy_j   -2 * c_j * c_j * (w_j - x)   c_j * c_j * (x - w_j)
   ---- = -------------------------- = ---------------------
    dx     2 || c_j * (w_j - x) ||              y_j
   --]]

   -- to prevent div by zero (NaN) bugs
   self._output:resizeAs(self.output):copy(self.output):add(0.0000001)
   view(self._gradOutput, gradOutput, gradOutput:size())
   self._div:cdiv(gradOutput, self._output)
   if input:dim() == 1 then
      self._div:resize(1, outputSize)
      self._expand4:expandAs(self._div, self.weight)

      if torch.type(input) == 'torch.CudaTensor' then
         -- CUDA path avoids cmul with non-contiguous expand views
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
      end

      self._repeat2:cmul(self.diagCov)
      self.gradInput:sum(self._repeat2, 2)
      self.gradInput:resizeAs(input)
   elseif input:dim() == 2 then
      local batchSize = input:size(1)

      self._div:resize(batchSize, 1, outputSize)
      self._expand4:expand(self._div, batchSize, inputSize, outputSize)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
         self._repeat2:cmul(self._repeat3)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
         self._repeat2:cmul(self._expand3)
      end

      -- sum the per-template contributions back to input shape
      self.gradInput:sum(self._repeat2, 3)
      self.gradInput:resizeAs(input)
   else
      error"1D or 2D input expected"
   end

   return self.gradInput
end
+
--- Accumulate gradients for the templates (weight) and the diagonal
-- covariances, scaled by `scale` (default 1).
-- Reuses (and destroys) buffers left behind by updateGradInput — it
-- assumes updateGradInput ran immediately before.
function WeightedEuclidean:accGradParameters(input, gradOutput, scale)
   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
   scale = scale or 1

   --[[
   dy_j   2 * c_j * c_j * (w_j - x)    c_j * c_j * (w_j - x)
   ---- = ------------------------- = ---------------------
   dw_j    2 || c_j * (w_j - x) ||             y_j

   dy_j    2 * c_j * (w_j - x)^2    c_j * (w_j - x)^2
   ---- = ----------------------- = -----------------
   dc_j   2 || c_j * (w_j - x) ||         y_j
   --]]
   -- assumes a preceding call to updateGradInput
   if input:dim() == 1 then
      self.gradWeight:add(-scale, self._repeat2)

      -- turn c*(w-x) into c*(w-x)^2: divide c out, square, re-multiply
      self._repeat:cdiv(self.diagCov)
      self._repeat:cmul(self._repeat)
      self._repeat:cmul(self.diagCov)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
      end

      self.gradDiagCov:add(self._repeat2)
   elseif input:dim() == 2 then
      self._sum = self._sum or input.new()
      self._sum:sum(self._repeat2, 1)
      self._sum:resize(inputSize, outputSize)
      self.gradWeight:add(-scale, self._sum)

      if torch.type(input) == 'torch.CudaTensor' then
         -- requires lots of memory, but minimizes cudaMallocs and loops
         self._repeat:cdiv(self._repeat3)
         self._repeat:cmul(self._repeat)
         self._repeat:cmul(self._repeat3)
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat:cmul(self._repeat2)
      else
         self._repeat:cdiv(self._expand3)
         self._repeat:cmul(self._repeat)
         self._repeat:cmul(self._expand3)
         self._repeat:cmul(self._expand4)
      end

      self._sum:sum(self._repeat, 1)
      self._sum:resize(inputSize, outputSize)
      self.gradDiagCov:add(scale, self._sum)
   else
      error"1D or 2D input expected"
   end
end
+
--- Type conversion. Scratch buffers are dropped first so parent.type
-- does not waste time converting them; they are rebuilt lazily in the
-- new type on the next forward/backward.
function WeightedEuclidean:type(type, tensorCache)
   if type then
      -- prevent premature memory allocations
      for _, name in ipairs{
         '_input', '_output', '_gradOutput', '_weight', '_div', '_sum',
         '_expand', '_expand2', '_expand3', '_expand4',
         '_repeat', '_repeat2', '_repeat3'} do
         self[name] = nil
      end
   end
   return parent.type(self, type, tensorCache)
end
+
--- Learnable parameters: templates and diagonal covariances, paired
-- with their gradient accumulators.
function WeightedEuclidean:parameters()
   return {self.weight, self.diagCov}, {self.gradWeight, self.gradDiagCov}
end
+
--- Fused gradient-accumulate-and-update: temporarily points the grad
-- accumulators at the parameters themselves so accGradParameters with
-- a negative learning rate applies the update in place.
function WeightedEuclidean:accUpdateGradParameters(input, gradOutput, lr)
   local gradWeight = self.gradWeight
   local gradDiagCov = self.gradDiagCov
   self.gradWeight = self.weight
   self.gradDiagCov = self.diagCov
   self:accGradParameters(input, gradOutput, -lr)
   -- restore the real accumulators
   self.gradWeight = gradWeight
   self.gradDiagCov = gradDiagCov
end
diff --git a/WeightedMSECriterion.lua b/WeightedMSECriterion.lua
new file mode 100644
index 0000000..9334729
--- /dev/null
+++ b/WeightedMSECriterion.lua
@@ -0,0 +1,45 @@
+-- MSE criterion where the target is scaled element-wise by a fixed
+-- weight tensor before the squared error is computed.
+local WeightedMSECriterion, parent = torch.class('nn.WeightedMSECriterion','nn.MSECriterion')
+
+function WeightedMSECriterion:__init(w)
+   parent.__init(self)
+   -- clone so the criterion owns its copy; later mutation of the
+   -- caller's tensor will not affect this module
+   self.weight = w:clone()
+end
+
+-- Forward pass: scales a copy of the target by self.weight and then
+-- delegates to the stock THNN MSECriterion forward.
+function WeightedMSECriterion:updateOutput(input,target)
+   self.buffer = self.buffer or input.new()
+   local weighted = self.buffer
+   weighted:resizeAs(input):copy(target)
+   if self.weight:dim() == input:dim() - 1 then
+      -- batched input: apply the same weight tensor to every sample
+      for i = 1, input:size(1) do
+         weighted[i]:cmul(self.weight)
+      end
+   else
+      weighted:cmul(self.weight)
+   end
+   self.output_tensor = self.output_tensor or input.new(1)
+   input.THNN.MSECriterion_updateOutput(
+      input:cdata(),
+      weighted:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage
+   )
+   self.output = self.output_tensor[1]
+   return self.output
+end
+
+-- Backward pass: computes the gradient of the weighted MSE w.r.t. input.
+-- Fix: allocate self.buffer lazily (as updateOutput does) so that calling
+-- backward before forward does not fail with a nil-index error.
+function WeightedMSECriterion:updateGradInput(input, target)
+   self.buffer = self.buffer or input.new()
+   self.buffer:resizeAs(input):copy(target)
+   if input:dim() - 1 == self.weight:dim() then
+      -- batched input: apply the same weight tensor to every sample
+      for i=1,input:size(1) do
+         self.buffer[i]:cmul(self.weight)
+      end
+   else
+      self.buffer:cmul(self.weight)
+   end
+   input.THNN.MSECriterion_updateGradInput(
+      input:cdata(),
+      self.buffer:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage
+   )
+   return self.gradInput
+end
diff --git a/doc/containers.md b/doc/containers.md
new file mode 100644
index 0000000..9a83607
--- /dev/null
+++ b/doc/containers.md
@@ -0,0 +1,283 @@
+<a name="nn.Containers"></a>
+# Containers #
+Complex neural networks are easily built using container classes:
+
+  * [Container](#nn.Container) : abstract class inherited by containers ;
+    * [Sequential](#nn.Sequential) : plugs layers in a feed-forward fully connected manner ;
+    * [Parallel](#nn.Parallel) : applies its `ith` child module to the  `ith` slice of the input Tensor ;
+    * [Concat](#nn.Concat) : concatenates in one layer several modules along dimension `dim` ;
+      * [DepthConcat](#nn.DepthConcat) : like Concat, but adds zero-padding when non-`dim` sizes don't match;
+ 
+See also the [Table Containers](#nn.TableContainers) for manipulating tables of [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md).
+
+<a name="nn.Container"></a>
+## Container ##
+
+This is an abstract [Module](module.md#nn.Module) class which declares methods defined in all containers.
+It reimplements many of the Module methods such that calls are propagated to the 
+contained modules. For example, a call to [zeroGradParameters](module.md#nn.Module.zeroGradParameters)
+will be propagated to all contained modules.
+
+<a name="nn.Container.add"></a>
+### add(module) ###
+Adds the given `module` to the container. The order in which modules are added is important.
+
+<a name="nn.Container.get"></a>
+### get(index) ###
+Returns the contained module at index `index`.
+
+<a name="nn.Container.size"></a>
+### size() ###
+Returns the number of contained modules.
+
+<a name="nn.Sequential"></a>
+## Sequential ##
+
+Sequential provides a means to plug layers together
+in a feed-forward fully connected manner.
+
+E.g. 
+creating a one hidden-layer multi-layer perceptron is thus just as easy as:
+```lua
+mlp = nn.Sequential()
+mlp:add( nn.Linear(10, 25) ) -- 10 input, 25 hidden units
+mlp:add( nn.Tanh() ) -- some hyperbolic tangent transfer function
+mlp:add( nn.Linear(25, 1) ) -- 1 output
+
+print(mlp:forward(torch.randn(10)))
+```
+which gives the output:
+```lua
+-0.1815
+[torch.Tensor of dimension 1]
+```
+
+<a name="nn.Sequential.remove"></a>
+### remove([index]) ###
+
+Remove the module at the given `index`. If `index` is not specified, remove the last layer.
+
+```lua
+model = nn.Sequential()
+model:add(nn.Linear(10, 20))
+model:add(nn.Linear(20, 20))
+model:add(nn.Linear(20, 30))
+model:remove(2)
+> model
+nn.Sequential {
+  [input -> (1) -> (2) -> output]
+  (1): nn.Linear(10 -> 20)
+  (2): nn.Linear(20 -> 30)
+}
+```
+
+
+<a name="nn.Sequential.insert"></a>
+### insert(module, [index]) ###
+
+Inserts the given `module` at the given `index`. If `index` is not specified, the incremented length of the sequence is used and so this is equivalent to use `add(module)`.
+
+```lua
+model = nn.Sequential()
+model:add(nn.Linear(10, 20))
+model:add(nn.Linear(20, 30))
+model:insert(nn.Linear(20, 20), 2)
+> model
+nn.Sequential {
+  [input -> (1) -> (2) -> (3) -> output]
+  (1): nn.Linear(10 -> 20)
+  (2): nn.Linear(20 -> 20)      -- The inserted layer
+  (3): nn.Linear(20 -> 30)
+}
+```
+
+
+
+<a name="nn.Parallel"></a>
+## Parallel ##
+
+`module` = `Parallel(inputDimension,outputDimension)`
+
+Creates a container module that applies its `ith` child module to the  `ith` slice of the input Tensor by using [select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-selectdim-index) 
+on dimension `inputDimension`. It concatenates the results of its contained modules together along dimension `outputDimension`.
+
+Example:
+```lua
+ mlp=nn.Parallel(2,1);     -- iterate over dimension 2 of input
+ mlp:add(nn.Linear(10,3)); -- apply to first slice
+ mlp:add(nn.Linear(10,2))  -- apply to second slice
+ print(mlp:forward(torch.randn(10,2)))
+```
+gives the output:
+```lua
+-0.5300
+-1.1015
+ 0.7764
+ 0.2819
+-0.6026
+[torch.Tensor of dimension 5]
+```
+
+A more complicated example:
+```lua
+
+mlp=nn.Sequential();
+c=nn.Parallel(1,2)
+for i=1,10 do
+ local t=nn.Sequential()
+ t:add(nn.Linear(3,2))
+ t:add(nn.Reshape(2,1))
+ c:add(t)
+end
+mlp:add(c)
+
+pred=mlp:forward(torch.randn(10,3))
+print(pred)
+
+for i=1,10000 do     -- Train for a few iterations
+ x=torch.randn(10,3);
+ y=torch.ones(2,10);
+ pred=mlp:forward(x)
+
+ criterion= nn.MSECriterion()
+ local err=criterion:forward(pred,y)
+ local gradCriterion = criterion:backward(pred,y);
+ mlp:zeroGradParameters();
+ mlp:backward(x, gradCriterion); 
+ mlp:updateParameters(0.01);
+ print(err)
+end
+```
+
+
+<a name="nn.Concat"></a>
+## Concat ##
+
+```lua
+module = nn.Concat(dim)
+```
+Concat concatenates the output of one layer of "parallel" modules along the
+provided dimension `dim`: they take the same inputs, and their output is
+concatenated.
+```lua
+mlp=nn.Concat(1);
+mlp:add(nn.Linear(5,3))
+mlp:add(nn.Linear(5,7))
+print(mlp:forward(torch.randn(5)))
+```
+which gives the output:
+```lua
+ 0.7486
+ 0.1349
+ 0.7924
+-0.0371
+-0.4794
+ 0.3044
+-0.0835
+-0.7928
+ 0.7856
+-0.1815
+[torch.Tensor of dimension 10]
+```
+
+<a name="nn.DepthConcat"></a>
+## DepthConcat ##
+
+```lua
+module = nn.DepthConcat(dim)
+```
+DepthConcat concatenates the output of one layer of "parallel" modules along the
+provided dimension `dim`: they take the same inputs, and their output is
+concatenated. For dimensions other than `dim` having different sizes,
+the smaller tensors are copied in the center of the output tensor, 
+effectively padding the borders with zeros.
+
+The module is particularly useful for concatenating the output of [Convolutions](convolution.md) 
+along the depth dimension (i.e. `nOutputFrame`). 
+This is used to implement the *DepthConcat* layer 
+of the [Going deeper with convolutions](http://arxiv.org/pdf/1409.4842v1.pdf) article.
+The normal [Concat](#nn.Concat) Module can't be used since the spatial 
+dimensions (height and width) of the output Tensors requiring concatenation 
+may have different values. To deal with this, the output uses the largest 
+spatial dimensions and adds zero-padding around the smaller Tensors.
+```lua
+inputSize = 3
+outputSize = 2
+input = torch.randn(inputSize,7,7)
+mlp=nn.DepthConcat(1);
+mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 1, 1))
+mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 3, 3))
+mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
+print(mlp:forward(input))
+```
+which gives the output:
+```lua
+(1,.,.) = 
+ -0.2874  0.6255  1.1122  0.4768  0.9863 -0.2201 -0.1516
+  0.2779  0.9295  1.1944  0.4457  1.1470  0.9693  0.1654
+ -0.5769 -0.4730  0.3283  0.6729  1.3574 -0.6610  0.0265
+  0.3767  1.0300  1.6927  0.4422  0.5837  1.5277  1.1686
+  0.8843 -0.7698  0.0539 -0.3547  0.6904 -0.6842  0.2653
+  0.4147  0.5062  0.6251  0.4374  0.3252  0.3478  0.0046
+  0.7845 -0.0902  0.3499  0.0342  1.0706 -0.0605  0.5525
+
+(2,.,.) = 
+ -0.7351 -0.9327 -0.3092 -1.3395 -0.4596 -0.6377 -0.5097
+ -0.2406 -0.2617 -0.3400 -0.4339 -0.3648  0.1539 -0.2961
+ -0.7124 -1.2228 -0.2632  0.1690  0.4836 -0.9469 -0.7003
+ -0.0221  0.1067  0.6975 -0.4221 -0.3121  0.4822  0.6617
+  0.2043 -0.9928 -0.9500 -1.6107  0.1409 -1.3548 -0.5212
+ -0.3086 -0.0298 -0.2031  0.1026 -0.5785 -0.3275 -0.1630
+  0.0596 -0.6097  0.1443 -0.8603 -0.2774 -0.4506 -0.5367
+
+(3,.,.) = 
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000 -0.7326  0.3544  0.1821  0.4796  1.0164  0.0000
+  0.0000 -0.9195 -0.0567 -0.1947  0.0169  0.1924  0.0000
+  0.0000  0.2596  0.6766  0.0939  0.5677  0.6359  0.0000
+  0.0000 -0.2981 -1.2165 -0.0224 -1.1001  0.0008  0.0000
+  0.0000 -0.1911  0.2912  0.5092  0.2955  0.7171  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+
+(4,.,.) = 
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000 -0.8263  0.3646  0.6750  0.2062  0.2785  0.0000
+  0.0000 -0.7572  0.0432 -0.0821  0.4871  1.9506  0.0000
+  0.0000 -0.4609  0.4362  0.5091  0.8901 -0.6954  0.0000
+  0.0000  0.6049 -0.1501 -0.4602 -0.6514  0.5439  0.0000
+  0.0000  0.2570  0.4694 -0.1262  0.5602  0.0821  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+
+(5,.,.) = 
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000  0.3158  0.4389 -0.0485 -0.2179  0.0000  0.0000
+  0.0000  0.1966  0.6185 -0.9563 -0.3365  0.0000  0.0000
+  0.0000 -0.2892 -0.9266 -0.0172 -0.3122  0.0000  0.0000
+  0.0000 -0.6269  0.5349 -0.2520 -0.2187  0.0000  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+
+(6,.,.) = 
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000  1.1148  0.2324 -0.1093  0.5024  0.0000  0.0000
+  0.0000 -0.2624 -0.5863  0.3444  0.3506  0.0000  0.0000
+  0.0000  0.1486  0.8413  0.6229 -0.0130  0.0000  0.0000
+  0.0000  0.8446  0.3801 -0.2611  0.8140  0.0000  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
+[torch.DoubleTensor of dimension 6x7x7]
+```
+Note how the last 2 of 6 filter maps have 1 column of zero-padding
+on the left and top, as well as 2 on the right and bottom.
+This is inevitable when the non-`dim` sizes of the component
+module output tensors aren't all odd or all even.
+To keep the mappings aligned, one need only ensure that these
+sizes are all odd (or all even).
+
+<a name="nn.TableContainers"></a>
+## Table Containers ##
+While the above containers are used for manipulating input [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md), table containers are used for manipulating tables :
+ * [ConcatTable](table.md#nn.ConcatTable)
+ * [ParallelTable](table.md#nn.ParallelTable)
+
+These, along with all other modules for manipulating tables can be found [here](table.md).
diff --git a/doc/convolution.md b/doc/convolution.md
new file mode 100644
index 0000000..4e2bb6f
--- /dev/null
+++ b/doc/convolution.md
@@ -0,0 +1,964 @@
+<a name="nn.convlayers.dok"></a>
+# Convolutional layers #
+
+A convolution is an integral that expresses the amount of overlap of one function `g` as it is shifted over another function `f`. It therefore "blends" one function with another. The neural network package supports convolution, pooling, subsampling and other relevant facilities. These are divided based on the dimensionality of the input and output [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor):
+
+  * [Temporal Modules](#nn.TemporalModules) apply to sequences with a one-dimensional relationship
+(e.g. sequences of words, phonemes and letters. Strings of some kind).
+    * [TemporalConvolution](#nn.TemporalConvolution) : a 1D convolution over an input sequence ;
+    * [TemporalSubSampling](#nn.TemporalSubSampling) : a 1D sub-sampling over an input sequence ;
+    * [TemporalMaxPooling](#nn.TemporalMaxPooling) : a 1D max-pooling operation over an input sequence ;
+    * [LookupTable](#nn.LookupTable) : a convolution of width `1`, commonly used for word embeddings ;
+  * [Spatial Modules](#nn.SpatialModules) apply to inputs with two-dimensional relationships (e.g. images):
+    * [SpatialConvolution](#nn.SpatialConvolution) : a 2D convolution over an input image ;
+    * [SpatialFullConvolution](#nn.SpatialFullConvolution) : a 2D full convolution over an input image ;
+    * [SpatialDilatedConvolution](#nn.SpatialDilatedConvolution) : a 2D dilated convolution over an input image ;
+    * [SpatialConvolutionLocal](#nn.SpatialConvolutionLocal) : a 2D locally-connected layer over an input image ;
+    * [SpatialSubSampling](#nn.SpatialSubSampling) : a 2D sub-sampling over an input image ;
+    * [SpatialMaxPooling](#nn.SpatialMaxPooling) : a 2D max-pooling operation over an input image ;
+    * [SpatialFractionalMaxPooling](#nn.SpatialFractionalMaxPooling) : a 2D fractional max-pooling operation over an input image ;
+    * [SpatialAveragePooling](#nn.SpatialAveragePooling) : a 2D average-pooling operation over an input image ;
+    * [SpatialAdaptiveMaxPooling](#nn.SpatialAdaptiveMaxPooling) : a 2D max-pooling operation which adapts its parameters dynamically such that the output is of fixed size ;
+    * [SpatialMaxUnpooling](#nn.SpatialMaxUnpooling) : a 2D max-unpooling operation ;
+    * [SpatialLPPooling](#nn.SpatialLPPooling) : computes the `p` norm in a convolutional manner on a set of input images ;
+    * [SpatialConvolutionMap](#nn.SpatialConvolutionMap) : a 2D convolution that uses a generic connection table ;
+    * [SpatialZeroPadding](#nn.SpatialZeroPadding) : pads a feature map with specified number of zeros ;
+    * [SpatialReflectionPadding](#nn.SpatialReflectionPadding) : pads a feature map with the reflection of the input ;
+    * [SpatialReplicationPadding](#nn.SpatialReplicationPadding) : pads a feature map with the value at the edge of the input borders ;
+    * [SpatialSubtractiveNormalization](#nn.SpatialSubtractiveNormalization) : a spatial subtraction operation on a series of 2D inputs using a kernel for computing the weighted average in a neighborhood ;
+    * [SpatialCrossMapLRN](#nn.SpatialCrossMapLRN) : a spatial local response normalization between feature maps ;
+    * [SpatialBatchNormalization](#nn.SpatialBatchNormalization): mean/std normalization over the mini-batch inputs and pixels, with an optional affine transform that follows ;
+    * [SpatialUpsamplingNearest](#nn.SpatialUpSamplingNearest): A simple upsampler applied to every channel of the feature map.
+  * [Volumetric Modules](#nn.VolumetricModules) apply to inputs with three-dimensional relationships (e.g. videos) :
+    * [VolumetricConvolution](#nn.VolumetricConvolution) : a 3D convolution over an input video (a sequence of images) ;
+    * [VolumetricFullConvolution](#nn.VolumetricFullConvolution) : a 3D full convolution over an input video (a sequence of images) ;
+    * [VolumetricMaxPooling](#nn.VolumetricMaxPooling) : a 3D max-pooling operation over an input video.
+    * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
+    * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation ;
+
+
+<a name="nn.TemporalModules"></a>
+## Temporal Modules ##
+Excluding an optional first batch dimension, temporal layers expect a 2D Tensor as input. The
+first dimension is the number of frames in the sequence (e.g. `nInputFrame`), the last dimension
+is the number of features per frame (e.g. `inputFrameSize`). The output will normally have the same number
+of dimensions, although the size of each dimension may change. These are commonly used for processing acoustic signals or sequences of words, i.e. in Natural Language Processing.
+
+Note: The [LookupTable](#nn.LookupTable) is special in that while it does output a temporal Tensor of size `nOutputFrame x outputFrameSize`,
+its input is a 1D Tensor of indices of size `nIndices`. Again, this is excluding the optional first batch dimension.
+
+<a name="nn.TemporalConvolution"></a>
+## TemporalConvolution ##
+
+```lua
+module = nn.TemporalConvolution(inputFrameSize, outputFrameSize, kW, [dW])
+```
+
+Applies a 1D convolution over an input sequence composed of `nInputFrame` frames. The `input` tensor in
+`forward(input)` is expected to be a 2D tensor (`nInputFrame x inputFrameSize`) or a 3D tensor (`nBatchFrame x nInputFrame x inputFrameSize`).
+
+The parameters are the following:
+  * `inputFrameSize`: The input frame size expected in sequences given into `forward()`.
+  * `outputFrameSize`: The output frame size the convolution layer will produce.
+  * `kW`: The kernel width of the convolution
+  * `dW`: The step of the convolution. Default is `1`.
+
+Note that depending of the size of your kernel, several (of the last)
+frames of the sequence might be lost. It is up to the user to add proper padding frames in the input
+sequences.
+
+If the input sequence is a 2D tensor of dimension `nInputFrame x inputFrameSize`, the output sequence will be
+`nOutputFrame x outputFrameSize` where
+```lua
+nOutputFrame = (nInputFrame - kW) / dW + 1
+```
+
+If the input sequence is a 3D tensor of dimension `nBatchFrame x nInputFrame x inputFrameSize`, the output sequence will be
+`nBatchFrame x nOutputFrame x outputFrameSize`.
+
+The parameters of the convolution can be found in `self.weight` (Tensor of
+size `outputFrameSize x (inputFrameSize x kW) `) and `self.bias` (Tensor of
+size `outputFrameSize`). The corresponding gradients can be found in
+`self.gradWeight` and `self.gradBias`.
+
+For a 2D input, the output value of the layer can be precisely described as:
+```lua
+output[t][i] = bias[i]
+  + sum_j sum_{k=1}^kW weight[i][j][k]
+                                * input[dW*(t-1)+k][j]
+```
+
+Here is a simple example:
+
+```lua
+inp=5;  -- dimensionality of one sequence element
+outp=1; -- number of derived features for one sequence element
+kw=1;   -- kernel only operates on one sequence element per step
+dw=1;   -- we step once and go on to the next sequence element
+
+mlp=nn.TemporalConvolution(inp,outp,kw,dw)
+
+x=torch.rand(7,inp) -- a sequence of 7 elements
+print(mlp:forward(x))
+```
+which gives:
+```lua
+-0.9109
+-0.9872
+-0.6808
+-0.9403
+-0.9680
+-0.6901
+-0.6387
+[torch.Tensor of dimension 7x1]
+```
+
+This is equivalent to:
+```lua
+weights=torch.reshape(mlp.weight,inp) -- weights applied to all
+bias= mlp.bias[1];
+for i=1,x:size(1) do -- for each sequence element
+   element= x[i]; -- features of ith sequence element
+   print(element:dot(weights) + bias)
+end
+```
+which gives:
+```lua
+-0.91094998687717
+-0.98721705771773
+-0.68075004276185
+-0.94030132495887
+-0.96798754116609
+-0.69008470895581
+-0.63871422284166
+```
+
+<a name="nn.TemporalMaxPooling"></a>
+## TemporalMaxPooling ##
+
+```lua
+module = nn.TemporalMaxPooling(kW, [dW])
+```
+
+Applies 1D max-pooling operation in `kW` regions by step size
+`dW` steps. Input sequence composed of `nInputFrame` frames. The `input` tensor in
+`forward(input)` is expected to be a 2D tensor (`nInputFrame x inputFrameSize`)
+or a 3D tensor (`nBatchFrame x nInputFrame x inputFrameSize`).
+
+If the input sequence is a 2D tensor of dimension `nInputFrame x inputFrameSize`, the output sequence will be
+`nOutputFrame x inputFrameSize` where
+```lua
+nOutputFrame = (nInputFrame - kW) / dW + 1
+```
+
+<a name="nn.TemporalSubSampling"></a>
+## TemporalSubSampling ##
+
+```lua
+module = nn.TemporalSubSampling(inputFrameSize, kW, [dW])
+```
+
+Applies a 1D sub-sampling over an input sequence composed of `nInputFrame` frames. The `input` tensor in
+`forward(input)` is expected to be a 2D tensor (`nInputFrame x inputFrameSize`). The output frame size
+will be the same as the input one (`inputFrameSize`).
+
+The parameters are the following:
+  * `inputFrameSize`: The input frame size expected in sequences given into `forward()`.
+  * `kW`: The kernel width of the sub-sampling
+  * `dW`: The step of the sub-sampling. Default is `1`.
+
+Note that depending of the size of your kernel, several (of the last)
+frames of the sequence might be lost. It is up to the user to add proper padding frames in the input
+sequences.
+
+If the input sequence is a 2D tensor `nInputFrame x inputFrameSize`, the output sequence will be
+`inputFrameSize x nOutputFrame` where
+```lua
+nOutputFrame = (nInputFrame - kW) / dW + 1
+```
+
+The parameters of the sub-sampling can be found in `self.weight` (Tensor of
+size `inputFrameSize`) and `self.bias` (Tensor of
+size `inputFrameSize`). The corresponding gradients can be found in
+`self.gradWeight` and `self.gradBias`.
+
+The output value of the layer can be precisely described as:
+```lua
+output[i][t] = bias[i] + weight[i] * sum_{k=1}^kW input[i][dW*(t-1)+k]
+```
+
+<a name="nn.LookupTable"></a>
+## LookupTable ##
+
+```lua
+module = nn.LookupTable(nIndex, size, [paddingValue], [maxNorm], [normType])
+```
+
+This layer is a particular case of a convolution, where the width of the convolution would be `1`.
+When calling `forward(input)`, it assumes `input` is a 1D or 2D tensor filled with indices.
+If the input is a matrix, then each row is assumed to be an input sample of given batch. Indices start
+at `1` and can go up to `nIndex`. For each index, it outputs a corresponding `Tensor` of size
+specified by `size`.
+
+LookupTable can be very slow if a certain input occurs frequently compared to other inputs;
+this is often the case for input padding. During the backward step, there is a separate thread
+for each input symbol which results in a bottleneck for frequent inputs.
+generating a `n x size1 x size2 x ... x sizeN` tensor, where `n`
+is the size of a 1D `input` tensor.
+
+Again with a 1D input, when only `size1` is provided, the `forward(input)` is equivalent to
+performing the following matrix-matrix multiplication in an efficient manner:
+```lua
+M P
+```
+where `M` is a 2D matrix `size x nIndex` containing the parameters of the lookup-table and
+`P` is a 2D matrix, where each column vector `i` is a zero vector except at index `input[i]` where it is `1`.
+
+1D example:
+```lua
+ -- a lookup table containing 10 tensors of size 3
+ module = nn.LookupTable(10, 3)
+
+ input = torch.Tensor{1,2,1,10}
+ print(module:forward(input))
+```
+
+Outputs something like:
+```lua
+-1.4415 -0.1001 -0.1708
+-0.6945 -0.4350  0.7977
+-1.4415 -0.1001 -0.1708
+-0.0745  1.9275  1.0915
+[torch.DoubleTensor of dimension 4x3]
+```
+Note that the first row vector is the same as the 3rd one!
+
+Given a 2D input tensor of size `m x n`, the output is a `m x n x size`
+tensor, where `m` is the number of samples in
+the batch and `n` is the number of indices per sample.
+
+2D example:
+```lua
+ -- a lookup table containing 10 tensors of size 3
+ module = nn.LookupTable(10, 3)
+
+ -- a batch of 2 samples of 4 indices each
+ input = torch.Tensor({{1,2,4,5},{4,3,2,10}})
+ print(module:forward(input))
+```
+
+Outputs something like:
+```lua
+(1,.,.) =
+ -0.0570 -1.5354  1.8555
+ -0.9067  1.3392  0.6275
+  1.9662  0.4645 -0.8111
+  0.1103  1.7811  1.5969
+
+(2,.,.) =
+  1.9662  0.4645 -0.8111
+  0.0026 -1.4547 -0.5154
+ -0.9067  1.3392  0.6275
+ -0.0193 -0.8641  0.7396
+[torch.DoubleTensor of dimension 2x4x3]
+```
+
+LookupTable supports max-norm regularization. One can activate the max-norm constraints
+by setting non-nil maxNorm in constructor or using setMaxNorm function. In the implementation,
+the max-norm constraint is enforced in the forward pass. That is the output of the LookupTable
+always obeys the max-norm constraint, even though the module weights may temporarily exceed
+the max-norm constraint.
+
+max-norm regularization example:
+```lua
+ -- a lookup table with max-norm constraint: 2-norm <= 1
+ module = nn.LookupTable(10, 3, 0, 1, 2)
+ input = torch.Tensor{1,2,1,10}
+ print(module.weight)
+ -- output of the module always obey max-norm constraint
+ print(module:forward(input))
+ -- the rows accessed should be re-normalized
+ print(module.weight)
+```
+
+Outputs something like:
+```lua
+ 0.2194  1.4759 -1.1829
+ 0.7069  0.2436  0.9876
+-0.2955  0.3267  1.1844
+-0.0575 -0.2957  1.5079
+-0.2541  0.5331 -0.0083
+ 0.8005 -1.5994 -0.4732
+-0.0065  2.3441 -0.6354
+ 0.2910  0.4230  0.0975
+ 1.2662  1.1846  1.0114
+-0.4095 -1.0676 -0.9056
+[torch.DoubleTensor of size 10x3]
+
+ 0.1152  0.7751 -0.6212
+ 0.5707  0.1967  0.7973
+ 0.1152  0.7751 -0.6212
+-0.2808 -0.7319 -0.6209
+[torch.DoubleTensor of size 4x3]
+
+ 0.1152  0.7751 -0.6212
+ 0.5707  0.1967  0.7973
+-0.2955  0.3267  1.1844
+-0.0575 -0.2957  1.5079
+-0.2541  0.5331 -0.0083
+ 0.8005 -1.5994 -0.4732
+-0.0065  2.3441 -0.6354
+ 0.2910  0.4230  0.0975
+ 1.2662  1.1846  1.0114
+-0.2808 -0.7319 -0.6209
+[torch.DoubleTensor of size 10x3]
+```
+Note that the 1st, 2nd and 10th rows of the module.weight are updated to
+obey the max-norm constraint, since their indices appear in the "input".
+
+<a name="nn.SpatialModules"></a>
+## Spatial Modules ##
+Excluding an optional batch dimension, spatial layers expect a 3D Tensor as input. The
+first dimension is the number of features (e.g. `frameSize`), the last two dimensions
+are spatial (e.g. `height x width`). These are commonly used for processing images.
+
+<a name="nn.SpatialConvolution"></a>
+### SpatialConvolution ###
+
+```lua
+module = nn.SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH])
+```
+
+Applies a 2D convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D tensor (`nInputPlane x height x width`).
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `padW`, a good number is `(kH-1)/2`.
+
+Note that depending of the size of your kernel, several (of the last)
+columns or rows of the input image might be lost. It is up to the user to
+add proper padding in images.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
+will be `nOutputPlane x oheight x owidth` where
+```lua
+owidth  = floor((width  + 2*padW - kW) / dW + 1)
+oheight = floor((height + 2*padH - kH) / dH + 1)
+```
+
+The parameters of the convolution can be found in `self.weight` (Tensor of
+size `nOutputPlane x nInputPlane x kH x kW`) and `self.bias` (Tensor of
+size `nOutputPlane`). The corresponding gradients can be found in
+`self.gradWeight` and `self.gradBias`.
+
+The output value of the layer can be precisely described as:
+```lua
+output[i][j][k] = bias[k]
+  + sum_l sum_{s=1}^kW sum_{t=1}^kH weight[s][t][l][k]
+                                    * input[dW*(i-1)+s][dH*(j-1)+t][l]
+```
+
+
+<a name="nn.SpatialConvolutionMap"></a>
+### SpatialConvolutionMap ###
+
+```lua
+module = nn.SpatialConvolutionMap(connectionMatrix, kW, kH, [dW], [dH])
+```
+
+This class is a generalization of
+[nn.SpatialConvolution](#nn.SpatialConvolution). It uses a generic
+connection table between input and output features. The
+[nn.SpatialConvolution](#nn.SpatialConvolution) is equivalent to
+using a [full connection table](#nn.tables.full). One can specify
+different types of connection tables.
+
+<a name="nn.tables.full"></a>
+#### Full Connection Table ####
+
+```lua
+table = nn.tables.full(nin,nout)
+```
+
+This is a precomputed table that specifies connections between every
+input and output node.
+
+<a name="nn.tables.onetoone"></a>
+#### One to One Connection Table ####
+
+```lua
+table = nn.tables.oneToOne(n)
+```
+
+This is a precomputed table that specifies a single connection to each
+output node from corresponding input node.
+
+<a name="nn.tables.random"></a>
+#### Random Connection Table ####
+
+```lua
+table = nn.tables.random(nin,nout, nto)
+```
+
+This table is randomly populated such that each output unit has
+`nto` incoming connections. The algorithm tries to assign uniform
+number of outgoing connections to each input node if possible.
+
+<a name="nn.SpatialFullConvolution"></a>
+### SpatialFullConvolution ###
+
+```lua
+module = nn.SpatialFullConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH], [adjW], [adjH])
+```
+
+Applies a 2D full convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D or 4D tensor. Note that instead of setting `adjW` and `adjH`, SpatialFullConvolution also accepts a table input with two tensors: `{convInput, sizeTensor}` where `convInput` is the standard input on which the full convolution
+is applied, and the size of `sizeTensor` is used to set the size of the output. Using the two-input version of forward
+will ignore the `adjW` and `adjH` values used to construct the module.
+
+Other frameworks call this operation "In-network Upsampling", "Fractionally-strided convolution", "Backwards Convolution," "Deconvolution", or "Upconvolution."
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `adjW`: Extra width to add to the output image. Default is `0`. Cannot be greater than dW-1.
+  * `adjH`: Extra height to add to the output image. Default is `0`. Cannot be greater than dH-1.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
+will be `nOutputPlane x oheight x owidth` where
+```lua
+owidth  = (width  - 1) * dW - 2*padW + kW + adjW
+oheight = (height - 1) * dH - 2*padH + kH + adjH
+```
+
+Further information about the full convolution can be found in the following paper: [Fully Convolutional Networks for Semantic Segmentation](http://www.cs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf).
+
+<a name="nn.SpatialDilatedConvolution"></a>
+### SpatialDilatedConvolution ###
+
+```lua
+module = nn.SpatialDilatedConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH], [dilationW], [dilationH])
+```
+
+Applies a 2D dilated convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D or 4D tensor.
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `dilationW`: The number of pixels to skip. Default is `1`. `1` makes it a SpatialConvolution
+  * `dilationH`: The number of pixels to skip. Default is `1`. `1` makes it a SpatialConvolution
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
+will be `nOutputPlane x oheight x owidth` where
+```lua
+owidth  = floor((width  + 2 * padW - dilationW * (kW-1) - 1) / dW) + 1
+oheight = floor((height + 2 * padH - dilationH * (kH-1) - 1) / dH) + 1
+```
+
+Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
+
+<a name="nn.SpatialConvolutionLocal"></a>
+### SpatialConvolutionLocal ###
+
+```lua
+module = nn.SpatialConvolutionLocal(nInputPlane, nOutputPlane, iW, iH, kW, kH, [dW], [dH], [padW], [padH])
+```
+
+Applies a 2D locally-connected layer over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D or 4D tensor.
+
+A locally-connected layer is similar to a convolution layer but without weight-sharing.
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the locally-connected layer will produce.
+  * `iW`: The input width.
+  * `iH`: The input height.
+  * `kW`: The kernel width.
+  * `kH`: The kernel height.
+  * `dW`: The step in the width dimension. Default is `1`.
+  * `dH`: The step in the height dimension. Default is `1`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+
+If the input image is a 3D tensor `nInputPlane x iH x iW`, the output image size
+will be `nOutputPlane x oH x oW` where
+```lua
+oW  = floor((iW  + 2*padW - kW) / dW + 1)
+oH = floor((iH + 2*padH - kH) / dH + 1)
+```
+
+<a name="nn.SpatialLPPooling"></a>
+### SpatialLPPooling ###
+
+```lua
+module = nn.SpatialLPPooling(nInputPlane, pnorm, kW, kH, [dW], [dH])
+```
+
+Computes the `p` norm in a convolutional manner on a set of 2D input planes.
+
+<a name="nn.SpatialMaxPooling"></a>
+### SpatialMaxPooling ###
+
+```lua
+module = nn.SpatialMaxPooling(kW, kH [, dW, dH, padW, padH])
+```
+
+Applies 2D max-pooling operation in `kWxkH` regions by step size
+`dWxdH` steps. The number of output features is equal to the number of
+input planes.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output
+image size will be `nOutputPlane x oheight x owidth` where
+
+```lua
+owidth  = op((width  + 2*padW - kW) / dW + 1)
+oheight = op((height + 2*padH - kH) / dH + 1)
+```
+
+`op` is a rounding operator. By default, it is `floor`. It can be changed
+by calling `:ceil()` or `:floor()` methods.
+
+<a name="nn.SpatialFractionalMaxPooling"></a>
+### SpatialFractionalMaxPooling ###
+
+```lua
+module = nn.SpatialFractionalMaxPooling(kW, kH, outW, outH)
+--   the output should be the exact size (outH x outW)
+OR
+module = nn.SpatialFractionalMaxPooling(kW, kH, ratioW, ratioH)
+--   the output should be the size (floor(inH x ratioH) x floor(inW x ratioW))
+--   ratios are numbers between (0, 1) exclusive
+```
+
+Applies 2D Fractional max-pooling operation as described in the
+paper ["Fractional Max Pooling" by Ben Graham](http://arxiv.org/abs/1412.6071) in the "pseudorandom" mode.
+
+The max-pooling operation is applied in `kWxkH` regions by a stochastic step size determined by the target output size.
+The number of output features is equal to the number of input planes.
+
+There are two constructors available.
+
+Constructor 1:
+```lua
+module = nn.SpatialFractionalMaxPooling(kW, kH, outW, outH)
+```
+
+Constructor 2:
+```lua
+module = nn.SpatialFractionalMaxPooling(kW, kH, ratioW, ratioH)
+```
+If the input image is a 3D tensor `nInputPlane x height x width`, the output
+image size will be `nOutputPlane x oheight x owidth`
+
+ where
+
+```lua
+owidth  = floor(width * ratioW)
+oheight = floor(height * ratioH)
+```
+ratios are numbers between (0, 1) exclusive
+
+
+<a name="nn.SpatialAveragePooling"></a>
+### SpatialAveragePooling ###
+
+```lua
+module = nn.SpatialAveragePooling(kW, kH [, dW, dH, padW, padH])
+```
+
+Applies 2D average-pooling operation in `kWxkH` regions by step size
+`dWxdH` steps. The number of output features is equal to the number of
+input planes.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output
+image size will be `nOutputPlane x oheight x owidth` where
+
+```lua
+owidth  = op((width  + 2*padW - kW) / dW + 1)
+oheight = op((height + 2*padH - kH) / dH + 1)
+```
+
+`op` is a rounding operator. By default, it is `floor`. It can be changed
+by calling `:ceil()` or `:floor()` methods.
+
+By default, the output of each pooling region is divided by the number of
+elements inside the padded image (which is usually `kW*kH`, except in some
+corner cases in which it can be smaller). You can also divide by the number
+of elements inside the original non-padded image. To switch between different
+division factors, call `:setCountIncludePad()` or `:setCountExcludePad()`. If
+`padW=padH=0`, both options give the same results.
+
+<a name="nn.SpatialAdaptiveMaxPooling"></a>
+### SpatialAdaptiveMaxPooling ###
+
+```lua
+module = nn.SpatialAdaptiveMaxPooling(W, H)
+```
+
+Applies 2D max-pooling operation in an image such that the output is of
+size `WxH`, for any input size. The number of output features is equal
+to the number of input planes.
+
+For an output of dimensions `(owidth,oheight)`, the indexes of the pooling
+region `(j,i)` in the input image of dimensions `(iwidth,iheight)` are
+given by:
+
+```lua
+x_j_start = floor((j   /owidth)  * iwidth)
+x_j_end   = ceil(((j+1)/owidth)  * iwidth)
+
+y_i_start = floor((i   /oheight) * iheight)
+y_i_end   = ceil(((i+1)/oheight) * iheight)
+```
+
+<a name="nn.SpatialMaxUnpooling"></a>
+### SpatialMaxUnpooling ###
+
+```lua
+module = nn.SpatialMaxUnpooling(poolingModule)
+```
+
+Applies 2D "max-unpooling" operation using the indices previously computed
+by the SpatialMaxPooling module `poolingModule`.
+
+When `B = poolingModule:forward(A)` is called, the indices of the maximal
+values (corresponding to their position within each map) are stored:
+`B[{n,k,i,j}] = A[{n,k,indices[{n,k,i}],indices[{n,k,j}]}]`.
+If `C` is a tensor of same size as `B`, `module:updateOutput(C)` outputs a
+tensor `D` of same size as `A` such that:
+`D[{n,k,indices[{n,k,i}],indices[{n,k,j}]}] = C[{n,k,i,j}]`.
+
+Module inspired by:
+   "Visualizing and understanding convolutional networks" (2014)
+                   by Matthew Zeiler, Rob Fergus
+
+<a name="nn.SpatialSubSampling"></a>
+### SpatialSubSampling ###
+
+```lua
+module = nn.SpatialSubSampling(nInputPlane, kW, kH, [dW], [dH])
+```
+
+Applies a 2D sub-sampling over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D tensor (`nInputPlane x height x width`). The number of output
+planes will be the same as `nInputPlane`.
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `kW`: The kernel width of the sub-sampling
+  * `kH`: The kernel height of the sub-sampling
+  * `dW`: The step of the sub-sampling in the width dimension. Default is `1`.
+  * `dH`: The step of the sub-sampling in the height dimension. Default is `1`.
+
+Note that depending of the size of your kernel, several (of the last)
+columns or rows of the input image might be lost. It is up to the user to
+add proper padding in images.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
+will be `nInputPlane x oheight x owidth` where
+
+```lua
+owidth  = (width  - kW) / dW + 1
+oheight = (height - kH) / dH + 1 .
+```
+
+The parameters of the sub-sampling can be found in `self.weight` (Tensor of
+size `nInputPlane`) and `self.bias` (Tensor of size `nInputPlane`). The
+corresponding gradients can be found in `self.gradWeight` and
+`self.gradBias`.
+
+The output value of the layer can be precisely described as:
+```lua
+output[i][j][k] = bias[k]
+  + weight[k] sum_{s=1}^kW sum_{t=1}^kH input[dW*(i-1)+s][dH*(j-1)+t][k]
+```
+
+<a name="nn.SpatialUpSamplingNearest"></a>
+### SpatialUpSamplingNearest ###
+
+```lua
+module = nn.SpatialUpSamplingNearest(scale)
+```
+
+Applies a 2D up-sampling over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D or 4D tensor (i.e. for 4D: `nBatchPlane x nInputPlane x height x width`). The number of output planes will be the same.  The v dimension is assumed to be the second last dimension (i.e. for 4D it will be the 3rd dim), and the u dimension is assumed to be the last dimension.
+
+The parameters are the following:
+  * `scale`: The upscale ratio.  Must be a positive integer
+
+The up-scaling method is simple nearest neighbor, ie:
+
+```lua
+output(u,v) = input(floor((u-1)/scale)+1, floor((v-1)/scale)+1)
+```
+
+Where `u` and `v` are indexed from 1 (as per lua convention).  There are no learnable parameters.
+
+<a name="nn.SpatialZeroPadding"></a>
+### SpatialZeroPadding ###
+
+```lua
+module = nn.SpatialZeroPadding(padLeft, padRight, padTop, padBottom)
+```
+
+Each feature map of a given input is padded with specified number of
+zeros. If padding values are negative, then input is cropped.
+
+<a name="nn.SpatialReflectionPadding"></a>
+### SpatialReflectionPadding ###
+
+```lua
+module = nn.SpatialReflectionPadding(padLeft, padRight, padTop, padBottom)
+```
+
+Each feature map of a given input is padded with the reflection of the input boundary
+
+<a name="nn.SpatialReplicationPadding"></a>
+### SpatialReplicationPadding ###
+
+```lua
+module = nn.SpatialReplicationPadding(padLeft, padRight, padTop, padBottom)
+```
+
+Each feature map of a given input is padded with the replication of the input boundary
+
+<a name="nn.SpatialSubtractiveNormalization"></a>
+### SpatialSubtractiveNormalization ###
+
+```lua
+module = nn.SpatialSubtractiveNormalization(ninputplane, kernel)
+```
+
+Applies a spatial subtraction operation on a series of 2D inputs using
+`kernel` for computing the weighted average in a neighborhood. The
+neighborhood is defined for a local spatial region that is the same size as
+the kernel and across all features. For an input image, since there is
+only one feature, the region is only spatial. For an RGB image, the
+weighted average is taken over RGB channels and a spatial region.
+
+If the `kernel` is 1D, then it will be used for constructing a separable
+2D kernel. The operations will be much more efficient in this case.
+
+The kernel is generally chosen as a gaussian when it is believed that
+the correlation of two pixel locations decrease with increasing
+distance. On the feature dimension, a uniform average is used since
+the weighting across features is not known.
+
+For this example we use an external package
+[image](http://www.github.com/clementfarabet/lua---image/)
+
+```lua
+require 'image'
+require 'nn'
+lena = image.rgb2y(image.lena())
+ker = torch.ones(11)
+m=nn.SpatialSubtractiveNormalization(1,ker)
+processed = m:forward(lena)
+w1=image.display(lena)
+w2=image.display(processed)
+```
+![](image/lena.jpg)![](image/lenap.jpg)
+
+<a name="nn.SpatialCrossMapLRN"></a>
+### SpatialCrossMapLRN ###
+
+```lua
+module = nn.SpatialCrossMapLRN(size [,alpha] [,beta] [,k])
+```
+
+Applies Spatial Local Response Normalization between different feature maps.
+By default, `alpha = 0.0001`, `beta = 0.75` and `k = 1`
+
+The operation implemented is:
+```
+                          x_f
+y_f =  -------------------------------------------------
+        (k+(alpha/size)* sum_{l=l1 to l2} (x_l^2))^beta
+```
+where `x_f` is the input at spatial locations `h,w` (not shown for simplicity) and feature map `f`,
+`l1` corresponds to `max(0,f-ceil(size/2))` and `l2` to `min(F, f-ceil(size/2) + size)`. Here, `F`
+is the number of feature maps.
+More information can be found [here](https://code.google.com/p/cuda-convnet2/wiki/LayerParams#Local_response_normalization_layer_%28across_maps%29).
+
+<a name="nn.SpatialBatchNormalization"></a>
+## SpatialBatchNormalization ##
+
+`module` = `nn.SpatialBatchNormalization(N [,eps] [, momentum] [,affine])`
+ where N = number of input feature maps
+eps is a small value added to the standard-deviation to avoid divide-by-zero. Defaults to 1e-5
+`affine` is a boolean. When set to false, the learnable affine transform is disabled.  Defaults to true
+
+Implements Batch Normalization as described in the paper:
+   "Batch Normalization: Accelerating Deep Network Training
+                         by Reducing Internal Covariate Shift"
+                   by Sergey Ioffe, Christian Szegedy
+
+The operation implemented is:
+```
+   y =     ( x - mean(x) )
+        -------------------- * gamma + beta
+       standard-deviation(x)
+```
+where the mean and standard-deviation are calculated per feature-map over the mini-batches and pixels
+and where gamma and beta are learnable parameter vectors of size N (where N = number of feature maps).
+The learning of gamma and beta is optional.
+
+   In training time, this layer keeps a running estimate of its computed mean and std.
+   The running sum is kept with a default momentum of 0.1 (unless over-ridden)
+   In test time, this running mean/std is used to normalize.
+
+
+
+The module only accepts 4D inputs.
+
+```lua
+-- with learnable parameters
+model = nn.SpatialBatchNormalization(m)
+A = torch.randn(b, m, h, w)
+C = model:forward(A)  -- C will be of size `b x m x h x w`
+
+-- without learnable parameters
+model = nn.SpatialBatchNormalization(m, nil, nil, false)
+A = torch.randn(b, m, h, w)
+C = model:forward(A)  -- C will be of size `b x m x h x w`
+```
+
+<a name="nn.VolumetricModules"></a>
+## Volumetric Modules ##
+Excluding an optional batch dimension, volumetric layers expect a 4D Tensor as input. The
+first dimension is the number of features (e.g. `frameSize`), the second is sequential (e.g. `time`) and the
+last two dimensions are spatial (e.g. `height x width`). These are commonly used for processing videos (sequences of images).
+
+<a name="nn.VolumetricConvolution"></a>
+### VolumetricConvolution ###
+
+```lua
+module = nn.VolumetricConvolution(nInputPlane, nOutputPlane, kT, kW, kH [, dT, dW, dH, padT, padW, padH])
+```
+
+Applies a 3D convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 4D tensor (`nInputPlane x time x height x width`).
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kT`: The kernel size of the convolution in time
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dT`: The step of the convolution in the time dimension. Default is `1`.
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padT`: The additional zeros added per time to the input planes. Default is `0`, a good number is `(kT-1)/2`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+
+
+Note that depending of the size of your kernel, several (of the last)
+columns or rows of the input image might be lost. It is up to the user to
+add proper padding in images.
+
+If the input image is a 4D tensor `nInputPlane x time x height x width`, the output image size
+will be `nOutputPlane x otime x owidth x oheight` where
+```lua
+otime  = floor((time  + 2*padT - kT) / dT + 1)
+owidth  = floor((width  + 2*padW - kW) / dW + 1)
+oheight  = floor((height  + 2*padH - kH) / dH + 1)
+```
+
+The parameters of the convolution can be found in `self.weight` (Tensor of
+size `nOutputPlane x nInputPlane x kT x kH x kW`) and `self.bias` (Tensor of
+size `nOutputPlane`). The corresponding gradients can be found in
+`self.gradWeight` and `self.gradBias`.
+
+<a name="nn.VolumetricFullConvolution"></a>
+### VolumetricFullConvolution ###
+
+```lua
+module = nn.VolumetricFullConvolution(nInputPlane, nOutputPlane, kT, kW, kH, [dT], [dW], [dH], [padT], [padW], [padH])
+```
+
+Applies a 3D full convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 4D or 5D tensor. Note that instead of setting `adjT`, `adjW` and `adjH`, VolumetricFullConvolution also accepts a table input with two tensors: `{convInput, sizeTensor}` where `convInput` is the standard input on which the full convolution is applied, and the size of `sizeTensor` is used to set the size of the output. Using the two-input version of forward
+will ignore the `adjT`, `adjW` and `adjH` values used to construct the module.
+
+The parameters are the following:
+* `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+* `nOutputPlane`: The number of output planes the convolution layer will produce.
+* `kT`: The kernel depth of the convolution
+* `kW`: The kernel width of the convolution
+* `kH`: The kernel height of the convolution
+* `dT`: The step of the convolution in the depth dimension. Default is `1`.
+* `dW`: The step of the convolution in the width dimension. Default is `1`.
+* `dH`: The step of the convolution in the height dimension. Default is `1`.
+* `padT`: The additional zeros added per depth to the input planes. Default is `0`, a good number is `(kT-1)/2`.
+* `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+* `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+
+If the input image is a 3D tensor `nInputPlane x depth x height x width`, the output image size
+will be `nOutputPlane x odepth x oheight x owidth` where
+```lua
+odepth  = (depth  - 1) * dT - 2*padT + kT
+owidth  = (width  - 1) * dW - 2*padW + kW
+oheight = (height - 1) * dH - 2*padH + kH
+```
+
+<a name="nn.VolumetricMaxPooling"></a>
+### VolumetricMaxPooling ###
+
+```lua
+module = nn.VolumetricMaxPooling(kT, kW, kH [, dT, dW, dH, padT, padW, padH])
+```
+
+Applies 3D max-pooling operation in `kTxkWxkH` regions by step size
+`dTxdWxdH` steps. The number of output features is equal to the number of
+input planes / dT. The input can optionally be padded with zeros. Padding should be smaller than half of kernel size.  That is, `padT < kT/2`, `padW < kW/2` and `padH < kH/2`.
+
+<a name="nn.VolumetricAveragePooling"></a>
+### VolumetricAveragePooling ###
+
+```lua
+module = nn.VolumetricAveragePooling(kT, kW, kH [, dT, dW, dH])
+```
+
+Applies 3D average-pooling operation in `kTxkWxkH` regions by step size
+`dTxdWxdH` steps. The number of output features is equal to the number of
+input planes / dT.
+
+<a name="nn.VolumetricMaxUnpooling"></a>
+### VolumetricMaxUnpooling ###
+
+```lua
+module = nn.VolumetricMaxUnpooling(poolingModule)
+```
+
+Applies 3D "max-unpooling" operation using the indices previously computed
+by the VolumetricMaxPooling module `poolingModule`.
+
+When `B = poolingModule:forward(A)` is called, the indices of the maximal
+values (corresponding to their position within each map) are stored:
+`B[{n,k,t,i,j}] = A[{n,k,indices[{n,k,t}],indices[{n,k,i}],indices[{n,k,j}]}]`.
+If `C` is a tensor of same size as `B`, `module:updateOutput(C)` outputs a
+tensor `D` of same size as `A` such that:
+`D[{n,k,indices[{n,k,t}],indices[{n,k,i}],indices[{n,k,j}]}] = C[{n,k,t,i,j}]`.
diff --git a/doc/criterion.md b/doc/criterion.md
new file mode 100644
index 0000000..6e25f72
--- /dev/null
+++ b/doc/criterion.md
@@ -0,0 +1,789 @@
+<a name="nn.Criterions"></a>
+# Criterions #
+
+[`Criterions`](#nn.Criterion) are helpful to train a neural network. Given an input and a
+target, they compute a gradient according to a given loss function.
+
+  * Classification criterions:
+    * [`BCECriterion`](#nn.BCECriterion): binary cross-entropy for [`Sigmoid`](transfer.md#nn.Sigmoid) (two-class version of [`ClassNLLCriterion`](#nn.ClassNLLCriterion));
+    * [`ClassNLLCriterion`](#nn.ClassNLLCriterion): negative log-likelihood for [`LogSoftMax`](transfer.md#nn.LogSoftMax) (multi-class);
+    * [`CrossEntropyCriterion`](#nn.CrossEntropyCriterion): combines [`LogSoftMax`](transfer.md#nn.LogSoftMax) and [`ClassNLLCriterion`](#nn.ClassNLLCriterion);
+    * [`ClassSimplexCriterion`](#nn.ClassSimplexCriterion): A simplex embedding criterion for classification.
+    * [`MarginCriterion`](#nn.MarginCriterion): two class margin-based loss;
+    * [`SoftMarginCriterion`](#nn.SoftMarginCriterion): two class softmargin-based loss;
+    * [`MultiMarginCriterion`](#nn.MultiMarginCriterion): multi-class margin-based loss;
+    * [`MultiLabelMarginCriterion`](#nn.MultiLabelMarginCriterion): multi-class multi-classification margin-based loss;
+    * [`MultiLabelSoftMarginCriterion`](#nn.MultiLabelSoftMarginCriterion): multi-class multi-classification loss based on binary cross-entropy;
+  * Regression criterions:
+    * [`AbsCriterion`](#nn.AbsCriterion): measures the mean absolute value of the element-wise difference between input;
+    * [`SmoothL1Criterion`](#nn.SmoothL1Criterion): a smooth version of the AbsCriterion;
+    * [`MSECriterion`](#nn.MSECriterion): mean square error (a classic);
+    * [`DistKLDivCriterion`](#nn.DistKLDivCriterion): Kullback–Leibler divergence (for fitting continuous probability distributions);
+  * Embedding criterions (measuring whether two inputs are similar or dissimilar):
+    * [`HingeEmbeddingCriterion`](#nn.HingeEmbeddingCriterion): takes a distance as input;
+    * [`L1HingeEmbeddingCriterion`](#nn.L1HingeEmbeddingCriterion): L1 distance between two inputs;
+    * [`CosineEmbeddingCriterion`](#nn.CosineEmbeddingCriterion): cosine distance between two inputs;
+  * Miscellaneous criterions:
+    * [`MultiCriterion`](#nn.MultiCriterion) : a weighted sum of other criterions each applied to the same input and target;
+    * [`ParallelCriterion`](#nn.ParallelCriterion) : a weighted sum of other criterions each applied to a different input and target;
+    * [`MarginRankingCriterion`](#nn.MarginRankingCriterion): ranks two inputs;
+
+<a name="nn.Criterion"></a>
+## Criterion ##
+
+This is an abstract class which declares methods defined in all criterions.
+This class is [serializable](https://github.com/torch/torch7/blob/master/doc/file.md#serialization-methods).
+
+<a name="nn.Criterion.forward"></a>
+### [output] forward(input, target) ###
+
+Given an `input` and a `target`, compute the loss function associated to the criterion and return the result.
+In general `input` and `target` are [`Tensor`s](https://github.com/torch/torch7/blob/master/doc/tensor.md), but some specific criterions might require some other type of object.
+
+The `output` returned should be a scalar in general.
+
+The state variable [`self.output`](#nn.Criterion.output) should be updated after a call to `forward()`.
+
+
+<a name="nn.Criterion.backward"></a>
+### [gradInput] backward(input, target) ###
+
+Given an `input` and a `target`, compute the gradients of the loss function associated to the criterion and return the result.
+In general `input`, `target` and `gradInput` are [`Tensor`s](https://github.com/torch/torch7/blob/master/doc/tensor.md), but some specific criterions might require some other type of object.
+
+The state variable [`self.gradInput`](#nn.Criterion.gradInput) should be updated after a call to `backward()`.
+
+
+<a name="nn.Criterion.output"></a>
+### State variable: output ###
+
+State variable which contains the result of the last [`forward(input, target)`](#nn.Criterion.forward) call.
+
+
+<a name="nn.Criterion.gradInput"></a>
+### State variable: gradInput ###
+
+State variable which contains the result of the last [`backward(input, target)`](#nn.Criterion.backward) call.
+
+
+<a name="nn.AbsCriterion"></a>
+## AbsCriterion ##
+
+```lua
+criterion = nn.AbsCriterion()
+```
+
+Creates a criterion that measures the mean absolute value of the element-wise difference between input `x` and target `y`:
+
+```lua
+loss(x, y)  = 1/n \sum |x_i - y_i|
+```
+
+If `x` and `y` are `d`-dimensional `Tensor`s with a total of `n` elements, the sum operation still operates over all the elements, and divides by `n`.
+
+The division by `n` can be avoided if one sets the internal variable `sizeAverage` to `false`:
+
+```lua
+criterion = nn.AbsCriterion()
+criterion.sizeAverage = false
+```
+
+
+<a name="nn.ClassNLLCriterion"></a>
+## ClassNLLCriterion ##
+
+```lua
+criterion = nn.ClassNLLCriterion([weights])
+```
+
+The negative log likelihood criterion. It is useful to train a classification problem with `n` classes.
+If provided, the optional argument `weights` should be a 1D `Tensor` assigning weight to each of the classes.
+This is particularly useful when you have an unbalanced training set.
+
+The `input` given through a `forward()` is expected to contain _log-probabilities_ of each class: `input` has to be a 1D `Tensor` of size `n`.
+Obtaining log-probabilities in a neural network is easily achieved by adding a [`LogSoftMax`](transfer.md#nn.LogSoftMax) layer in the last layer of your neural network.
+You may use [`CrossEntropyCriterion`](#nn.CrossEntropyCriterion) instead, if you prefer not to add an extra layer to your network.
+This criterion expects a class index (1 to the number of classes) as `target` when calling [`forward(input, target)`](#nn.CriterionForward) and [`backward(input, target)`](#nn.CriterionBackward).
+
+The loss can be described as:
+
+```lua
+loss(x, class) = -x[class]
+```
+
+or in the case of the `weights` argument it is specified as follows:
+
+```lua
+loss(x, class) = -weights[class] * x[class]
+```
+
+The following is a code fragment showing how to make a gradient step given an input `x`, a desired output `y` (an integer `1` to `n`, in this case `n = 2` classes), a network `mlp` and a learning rate `learningRate`:
+
+```lua
+function gradUpdate(mlp, x, y, learningRate)
+   local criterion = nn.ClassNLLCriterion()
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   mlp:zeroGradParameters()
+   local t = criterion:backward(pred, y)
+   mlp:backward(x, t)
+   mlp:updateParameters(learningRate)
+end
+```
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed for each minibatch.
+
+
+<a name="nn.CrossEntropyCriterion"></a>
+## CrossEntropyCriterion ##
+
+```lua
+criterion = nn.CrossEntropyCriterion([weights])
+```
+
+This criterion combines [`LogSoftMax`](#nn.LogSoftMax) and [`ClassNLLCriterion`](#nn.ClassNLLCriterion) in one single class.
+
+It is useful to train a classification problem with `n` classes.
+If provided, the optional argument `weights` should be a 1D `Tensor` assigning weight to each of the classes. This is particularly useful when you have an unbalanced training set.
+
+The `input` given through a `forward()` is expected to contain scores for each class: `input` has to be a 1D `Tensor` of size `n`.
+This criterion expects a class index (1 to the number of classes) as `target` when calling [`forward(input, target)`](#nn.CriterionForward) and [`backward(input, target)`](#nn.CriterionBackward).
+
+The loss can be described as:
+
+```lua
+loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j])))
+               = -x[class] + log(\sum_j exp(x[j]))
+```
+
+or in the case of the `weights` argument being specified:
+
+```lua
+loss(x, class) = weights[class] * (-x[class] + log(\sum_j exp(x[j])))
+```
+
+The losses are averaged across observations for each minibatch.
+
+<a name="nn.ClassSimplexCriterion"/>
+## ClassSimplexCriterion ##
+
+```lua
+criterion = nn.ClassSimplexCriterion(nClasses)
+```
+
+ClassSimplexCriterion implements a criterion for classification.
+It learns an embedding per class, where each class' embedding is a point on an (N-1)-dimensional simplex,
+where N is the number of classes.
+
+The `input` given through a `forward()` is expected to be the output of a Normalized Linear layer with no bias:
+- `input` has to be a 1D `Tensor` of size `n` for a single sample
+- a 2D `Tensor` of size `batchSize x n` for a mini-batch of samples
+
+This Criterion is best used in combination with a neural network where the last layers are:
+- a weight-normalized bias-less Linear layer. [Example source code](https://gist.github.com/soumith/4d0273f592956199739b)
+- followed by an output normalization layer (nn.Normalize).
+
+The loss is described in detail in the paper [Scale-invariant learning and convolutional networks](http://arxiv.org/abs/1506.08230).
+
+
+The following is a code fragment showing how to make a gradient step given an input `x`, a desired output `y` (an integer `1` to `n`, in this case `n = 30` classes), a network `mlp` and a learning rate `learningRate`:
+
+```lua
+nInput = 10
+nClasses = 30
+nHidden = 100
+mlp = nn.Sequential()
+mlp:add(nn.Linear(nInput, nHidden)):add(nn.ReLU())
+mlp:add(nn.NormalizedLinearNoBias(nHidden, nClasses))
+mlp:add(nn.Normalize(2))
+
+criterion = nn.ClassSimplexCriterion(nClasses)
+
+function gradUpdate(mlp, x, y, learningRate)
+   pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   mlp:zeroGradParameters()
+   local t = criterion:backward(pred, y)
+   mlp:backward(x, t)
+   mlp:updateParameters(learningRate)
+end
+```
+
+This criterion also provides two helper functions `getPredictions(input)` and `getTopPrediction(input)` that return the raw predictions and the top prediction index respectively, given an input sample.
+
+<a name="nn.DistKLDivCriterion"></a>
+## DistKLDivCriterion ##
+
+```lua
+criterion = nn.DistKLDivCriterion()
+```
+
+The [Kullback–Leibler divergence](http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) criterion.
+KL divergence is a useful distance measure for continuous distributions and is often useful when performing direct regression over the space of (discretely sampled) continuous output distributions.
+As with ClassNLLCriterion, the `input` given through a `forward()` is expected to contain _log-probabilities_, however unlike ClassNLLCriterion, `input` is not restricted to a 1D or 2D vector (as the criterion is applied element-wise).
+
+This criterion expects a `target` `Tensor` of the same size as the `input` `Tensor` when calling [`forward(input, target)`](#nn.CriterionForward) and [`backward(input, target)`](#nn.CriterionBackward).
+
+The loss can be described as:
+
+```lua
+loss(x, target) = 1/n \sum(target_i * (log(target_i) - x_i))
+```
+
+By default, the losses are averaged for each minibatch over observations *as well as* over dimensions. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.BCECriterion"></a>
+## BCECriterion
+
+```lua
+criterion = nn.BCECriterion([weights])
+```
+
+Creates a criterion that measures the Binary Cross Entropy between the target and the output:
+
+```lua
+loss(o, t) = - 1/n sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
+```
+
+or in the case of the weights argument being specified:
+
+```lua
+loss(o, t) = - 1/n sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
+```
+
+This is used for measuring the error of a reconstruction in for example an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1, for instance, the output of an [`nn.Sigmoid`](transfer.md#nn.Sigmoid) layer.
+
+By default, the losses are averaged for each minibatch over observations *as well as* over dimensions. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.MarginCriterion"></a>
+## MarginCriterion ##
+
+```lua
+criterion = nn.MarginCriterion([margin])
+```
+
+Creates a criterion that optimizes a two-class classification hinge loss (margin-based loss) between input `x` (a `Tensor` of dimension `1`) and output `y` (which is a tensor containing either `1`s or `-1`s).
+`margin`, if unspecified, is by default `1`.
+
+```lua
+loss(x, y) = sum_i (max(0, margin - y[i]*x[i])) / x:nElement()
+```
+
+The normalization by the number of elements in the input can be disabled by
+setting `self.sizeAverage` to `false`.
+
+### Example
+
+```lua
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+end
+
+mlp = nn.Sequential()
+mlp:add(nn.Linear(5, 1))
+
+x1 = torch.rand(5)
+x1_target = torch.Tensor{1}
+x2 = torch.rand(5)
+x2_target = torch.Tensor{-1}
+criterion=nn.MarginCriterion(1)
+
+for i = 1, 1000 do
+   gradUpdate(mlp, x1, x1_target, criterion, 0.01)
+   gradUpdate(mlp, x2, x2_target, criterion, 0.01)
+end
+
+print(mlp:forward(x1))
+print(mlp:forward(x2))
+
+print(criterion:forward(mlp:forward(x1), x1_target))
+print(criterion:forward(mlp:forward(x2), x2_target))
+```
+
+gives the output:
+
+```lua
+ 1.0043
+[torch.Tensor of dimension 1]
+
+
+-1.0061
+[torch.Tensor of dimension 1]
+
+0
+0
+```
+
+i.e. the mlp successfully separates the two data points such that they both have a `margin` of `1`, and hence a loss of `0`.
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+<a name="nn.SoftMarginCriterion"></a>
+## SoftMarginCriterion ##
+
+```lua
+criterion = nn.SoftMarginCriterion()
+```
+
+Creates a criterion that optimizes a two-class classification logistic loss between input `x` (a `Tensor` of dimension `1`) and output `y` (which is a tensor containing either `1`s or `-1`s).
+
+```lua
+loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x:nElement()
+```
+
+The normalization by the number of elements in the input can be disabled by
+setting `self.sizeAverage` to `false`.
+
+### Example
+
+```lua
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+end
+
+mlp = nn.Sequential()
+mlp:add(nn.Linear(5, 1))
+
+x1 = torch.rand(5)
+x1_target = torch.Tensor{1}
+x2 = torch.rand(5)
+x2_target = torch.Tensor{-1}
+criterion=nn.SoftMarginCriterion()
+
+for i = 1, 1000 do
+   gradUpdate(mlp, x1, x1_target, criterion, 0.01)
+   gradUpdate(mlp, x2, x2_target, criterion, 0.01)
+end
+
+print(mlp:forward(x1))
+print(mlp:forward(x2))
+
+print(criterion:forward(mlp:forward(x1), x1_target))
+print(criterion:forward(mlp:forward(x2), x2_target))
+```
+
+gives the output:
+
+```lua
+
+ 0.7471
+[torch.DoubleTensor of size 1]
+
+-0.9607
+[torch.DoubleTensor of size 1]
+
+0.38781049558836
+0.32399356957564
+
+```
+
+i.e. the mlp successfully separates the two data points.
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+<a name="nn.MultiMarginCriterion"></a>
+## MultiMarginCriterion ##
+
+```lua
+criterion = nn.MultiMarginCriterion(p, [weights], [margin])
+```
+
+Creates a criterion that optimizes a multi-class classification hinge loss (margin-based loss) between input `x`  (a `Tensor` of dimension 1) and output `y` (which is a target class index, `1` <= `y` <= `x:size(1)`):
+
+```lua
+loss(x, y) = sum_i(max(0, (margin - x[y] + x[i]))^p) / x:size(1)
+```
+
+where `i == 1` to `x:size(1)` and `i ~= y`.
+Note that this criterion also works with 2D inputs and 1D targets.
+
+Optionally, you can give non-equal weighting on the classes by passing a 1D `weights` tensor into the constructor.
+The loss function then becomes:
+
+```lua
+loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] + x[i]))^p) / x:size(1)
+```
+
+This criterion is especially useful for classification when used in conjunction with a module ending in the following output layer:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.Euclidean(n, m)) -- outputs a vector of distances
+mlp:add(nn.MulConstant(-1)) -- distance to similarity
+```
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.MultiLabelMarginCriterion"></a>
+## MultiLabelMarginCriterion ##
+
+```lua
+criterion = nn.MultiLabelMarginCriterion()
+```
+
+Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss) between input `x`  (a 1D `Tensor`) and output `y` (which is a 1D `Tensor` of target class indices):
+
+```lua
+loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x:size(1)
+```
+
+where `i == 1` to `x:size(1)`, `j == 1` to `y:size(1)`, `y[j] ~= 0`, and `i ~= y[j]` for all `i` and `j`.
+Note that this criterion also works with 2D inputs and targets.
+
+`y` and `x` must have the same size.
+The criterion only considers the first non zero `y[j]` targets.
+This allows for different samples to have variable amounts of target classes:
+
+```lua
+criterion = nn.MultiLabelMarginCriterion()
+input = torch.randn(2, 4)
+target = torch.Tensor{{1, 3, 0, 0}, {4, 0, 0, 0}} -- zero-values are ignored
+criterion:forward(input, target)
+```
+
+<a name="nn.MultiLabelSoftMarginCriterion"></a>
+## MultiLabelSoftMarginCriterion ##
+
+```lua
+criterion = nn.MultiLabelSoftMarginCriterion()
+```
+
+Creates a criterion that optimizes a multi-label one-versus-all loss based on max-entropy, between input `x`  (a 1D `Tensor`) and target `y` (a binary 1D `Tensor`):
+
+```lua
+loss(x, y) = - sum_i (y[i] log( exp(x[i]) / (1 + exp(x[i]))) + (1-y[i]) log(1/(1+exp(x[i])))) / x:nElement()
+```
+
+where `i == 1` to `x:nElement()`, `y[i]  in {0,1}`.
+Note that this criterion also works with 2D inputs and targets.
+
+`y` and `x` must have the same size.
+
+<a name="nn.MSECriterion"></a>
+## MSECriterion ##
+
+```lua
+criterion = nn.MSECriterion()
+```
+
+Creates a criterion that measures the mean squared error between `n` elements in the input `x` and output `y`:
+
+```lua
+loss(x, y) = 1/n \sum |x_i - y_i|^2 .
+```
+
+If `x` and `y` are `d`-dimensional `Tensor`s with a total of `n` elements, the sum operation still operates over all the elements, and divides by `n`.
+The two `Tensor`s must have the same number of elements (but their sizes might be different).
+
+The division by `n` can be avoided if one sets the internal variable `sizeAverage` to `false`:
+
+```lua
+criterion = nn.MSECriterion()
+criterion.sizeAverage = false
+```
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.MultiCriterion"></a>
+## MultiCriterion ##
+
+```lua
+criterion = nn.MultiCriterion()
+```
+
+This returns a Criterion which is a weighted sum of other Criterions.
+Criterions are added using the method:
+
+```lua
+criterion:add(singleCriterion [, weight])
+```
+
+where `weight` is a scalar (default 1). Each criterion is applied to the same `input` and `target`.
+
+Example :
+
+```lua
+input = torch.rand(2,10)
+target = torch.IntTensor{1,8}
+nll = nn.ClassNLLCriterion()
+nll2 = nn.CrossEntropyCriterion()
+mc = nn.MultiCriterion():add(nll, 0.5):add(nll2)
+output = mc:forward(input, target)
+```
+
+<a name="nn.ParallelCriterion"></a>
+## ParallelCriterion ##
+
+```lua
+criterion = nn.ParallelCriterion([repeatTarget])
+```
+
+This returns a Criterion which is a weighted sum of other Criterions.
+Criterions are added using the method:
+
+```lua
+criterion:add(singleCriterion [, weight])
+```
+
+where `weight` is a scalar (default 1). The criterion expects an `input` and `target` table.
+Each criterion is applied to the commensurate `input` and `target` element in the tables.
+However, if `repeatTarget=true`, the `target` is repeatedly presented to each criterion (with a different `input`).
+
+Example :
+
+```lua
+input = {torch.rand(2,10), torch.randn(2,10)}
+target = {torch.IntTensor{1,8}, torch.randn(2,10)}
+nll = nn.ClassNLLCriterion()
+mse = nn.MSECriterion()
+pc = nn.ParallelCriterion():add(nll, 0.5):add(mse)
+output = pc:forward(input, target)
+```
+
+
+<a name="nn.SmoothL1Criterion"></a>
+## SmoothL1Criterion ##
+
+```lua
+criterion = nn.SmoothL1Criterion()
+```
+
+Creates a criterion that can be thought of as a smooth version of the [`AbsCriterion`](#nn.AbsCriterion). It uses a squared term if the absolute element-wise error falls below 1. It is less sensitive to outliers than the [`MSECriterion`](#nn.MSECriterion) and in some cases prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
+
+```lua
+                      ⎧ 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1
+loss(x, y) = 1/n \sum ⎨
+                      ⎩ |x_i - y_i| - 0.5,   otherwise
+```
+
+If `x` and `y` are `d`-dimensional `Tensor`s with a total of `n` elements, the sum operation still operates over all the elements, and divides by `n`.
+
+The division by `n` can be avoided if one sets the internal variable `sizeAverage` to `false`:
+
+```lua
+criterion = nn.SmoothL1Criterion()
+criterion.sizeAverage = false
+```
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.HingeEmbeddingCriterion"></a>
+## HingeEmbeddingCriterion ##
+
+```lua
+criterion = nn.HingeEmbeddingCriterion([margin])
+```
+
+Creates a criterion that measures the loss given an input `x` which is a 1-dimensional vector and a label `y` (`1` or `-1`).
+This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance, and is typically used for learning nonlinear embeddings or semi-supervised learning.
+
+```lua
+                 ⎧ x_i,                  if y_i ==  1
+loss(x, y) = 1/n ⎨
+                 ⎩ max(0, margin - x_i), if y_i == -1
+```
+
+If `x` and `y` are `n`-dimensional `Tensor`s, the sum operation still operates over all the elements, and divides by `n` (this can be avoided if one sets the internal variable `sizeAverage` to `false`). The `margin` has a default value of `1`, or can be set in the constructor.
+
+### Example
+
+```lua
+-- imagine we have one network we are interested in, it is called "p1_mlp"
+p1_mlp = nn.Sequential(); p1_mlp:add(nn.Linear(5, 2))
+
+-- But we want to push examples towards or away from each other so we make another copy
+-- of it called p2_mlp; this *shares* the same weights via the set command, but has its
+-- own set of temporary gradient storage that's why we create it again (so that the gradients
+-- of the pair don't wipe each other)
+p2_mlp = nn.Sequential(); p2_mlp:add(nn.Linear(5, 2))
+p2_mlp:get(1).weight:set(p1_mlp:get(1).weight)
+p2_mlp:get(1).bias:set(p1_mlp:get(1).bias)
+
+-- we make a parallel table that takes a pair of examples as input.
+-- They both go through the same (cloned) mlp
+prl = nn.ParallelTable()
+prl:add(p1_mlp)
+prl:add(p2_mlp)
+
+-- now we define our top level network that takes this parallel table
+-- and computes the pairwise distance between the pair of outputs
+mlp = nn.Sequential()
+mlp:add(prl)
+mlp:add(nn.PairwiseDistance(1))
+
+-- and a criterion for pushing together or pulling apart pairs
+crit = nn.HingeEmbeddingCriterion(1)
+
+-- lets make two example vectors
+x = torch.rand(5)
+y = torch.rand(5)
+
+
+-- Use a typical generic gradient update function
+function gradUpdate(mlp, x, y, criterion, learningRate)
+local pred = mlp:forward(x)
+local err = criterion:forward(pred, y)
+local gradCriterion = criterion:backward(pred, y)
+mlp:zeroGradParameters()
+mlp:backward(x, gradCriterion)
+mlp:updateParameters(learningRate)
+end
+
+-- push the pair x and y together, notice how then the distance between them given
+-- by print(mlp:forward({x, y})[1]) gets smaller
+for i = 1, 10 do
+   gradUpdate(mlp, {x, y}, 1, crit, 0.01)
+   print(mlp:forward({x, y})[1])
+end
+
+-- pull apart the pair x and y, notice how then the distance between them given
+-- by print(mlp:forward({x, y})[1]) gets larger
+
+for i = 1, 10 do
+   gradUpdate(mlp, {x, y}, -1, crit, 0.01)
+   print(mlp:forward({x, y})[1])
+end
+```
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.L1HingeEmbeddingCriterion"></a>
+## L1HingeEmbeddingCriterion ##
+
+```lua
+criterion = nn.L1HingeEmbeddingCriterion([margin])
+```
+
+Creates a criterion that measures the loss given  an input `x` = `{x1, x2}`, a table of two `Tensor`s, and a label `y` (`1` or `-1`): this is used for measuring whether two inputs are similar or dissimilar, using the L1 distance, and is typically used for learning nonlinear embeddings or semi-supervised learning.
+
+```lua
+             ⎧ ||x1 - x2||_1,                  if y ==  1
+loss(x, y) = ⎨
+             ⎩ max(0, margin - ||x1 - x2||_1), if y == -1
+```
+
+The `margin` has a default value of `1`, or can be set in the constructor.
+
+<a name="nn.CosineEmbeddingCriterion"></a>
+## CosineEmbeddingCriterion ##
+
+```lua
+criterion = nn.CosineEmbeddingCriterion([margin])
+```
+
+Creates a criterion that measures the loss given  an input `x` = `{x1, x2}`, a table of two `Tensor`s, and a `Tensor` label `y`  with values 1 or -1.
+This is used for measuring whether two inputs are similar or dissimilar, using the cosine distance, and is typically used for learning nonlinear embeddings or semi-supervised learning.
+
+`margin` should be a number from `-1` to `1`, `0` to `0.5` is suggested.
+`Forward` and `Backward` have to be used alternately. If `margin` is missing, the default value is `0`.
+
+The loss function for each sample is:
+
+```lua
+             ⎧ 1 - cos(x1, x2),              if y ==  1
+loss(x, y) = ⎨
+             ⎩ max(0, cos(x1, x2) - margin), if y == -1
+```
+
+For batched inputs, if the internal variable `sizeAverage` is equal to `true`, the loss function averages the loss over the batch samples; if `sizeAverage` is `false`, then the loss function sums over the batch samples. By default, `sizeAverage` equals to `true`.
+
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+
+<a name="nn.MarginRankingCriterion"></a>
+## MarginRankingCriterion ##
+
+```lua
+criterion = nn.MarginRankingCriterion(margin)
+```
+
+Creates a criterion that measures the loss given  an input `x` = `{x1, x2}`, a table of two `Tensor`s of size 1 (they contain only scalars), and a label `y` (`1` or `-1`).
+In batch mode, `x` is a table of two `Tensor`s of size `batchsize`, and `y` is a `Tensor` of size `batchsize` containing `1` or `-1` for each corresponding pair of elements in the input `Tensor`.
+
+If `y == 1` then it assumed the first input should be ranked higher (have a larger value) than the second input, and vice-versa for `y == -1`.
+
+The loss function is:
+
+```lua
+loss(x, y) = max(0, -y * (x[1] - x[2]) + margin)
+```
+
+For batched inputs, if the internal variable `sizeAverage` is equal to `true`, the loss function averages the loss over the batch samples; if `sizeAverage` is `false`, then the loss function sums over the batch samples. By default, `sizeAverage` equals to `true`.
+By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
+
+### Example
+
+```lua
+p1_mlp = nn.Linear(5, 2)
+p2_mlp = p1_mlp:clone('weight', 'bias')
+
+prl = nn.ParallelTable()
+prl:add(p1_mlp)
+prl:add(p2_mlp)
+
+mlp1 = nn.Sequential()
+mlp1:add(prl)
+mlp1:add(nn.DotProduct())
+
+mlp2 = mlp1:clone('weight', 'bias')
+
+mlpa = nn.Sequential()
+prla = nn.ParallelTable()
+prla:add(mlp1)
+prla:add(mlp2)
+mlpa:add(prla)
+
+crit = nn.MarginRankingCriterion(0.1)
+
+x=torch.randn(5)
+y=torch.randn(5)
+z=torch.randn(5)
+
+-- Use a typical generic gradient update function
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+end
+
+for i = 1, 100 do
+   gradUpdate(mlpa, {{x, y}, {x, z}}, 1, crit, 0.01)
+   if true then
+      o1 = mlp1:forward{x, y}[1]
+      o2 = mlp2:forward{x, z}[1]
+      o = crit:forward(mlpa:forward{{x, y}, {x, z}}, 1)
+      print(o1, o2, o)
+   end
+end
+
+print "--"
+
+for i = 1, 100 do
+   gradUpdate(mlpa, {{x, y}, {x, z}}, -1, crit, 0.01)
+   if true then
+      o1 = mlp1:forward{x, y}[1]
+      o2 = mlp2:forward{x, z}[1]
+      o = crit:forward(mlpa:forward{{x, y}, {x, z}}, -1)
+      print(o1, o2, o)
+   end
+end
+```
diff --git a/doc/image/abs.png b/doc/image/abs.png
new file mode 100644
index 0000000..fa7f470
Binary files /dev/null and b/doc/image/abs.png differ
diff --git a/doc/image/elu.png b/doc/image/elu.png
new file mode 100644
index 0000000..a12873f
Binary files /dev/null and b/doc/image/elu.png differ
diff --git a/doc/image/exp.png b/doc/image/exp.png
new file mode 100644
index 0000000..07d28d4
Binary files /dev/null and b/doc/image/exp.png differ
diff --git a/doc/image/hshrink.png b/doc/image/hshrink.png
new file mode 100644
index 0000000..7f96292
Binary files /dev/null and b/doc/image/hshrink.png differ
diff --git a/doc/image/htanh.png b/doc/image/htanh.png
new file mode 100644
index 0000000..c8e6084
Binary files /dev/null and b/doc/image/htanh.png differ
diff --git a/doc/image/lena.jpg b/doc/image/lena.jpg
new file mode 100644
index 0000000..d4a8c36
Binary files /dev/null and b/doc/image/lena.jpg differ
diff --git a/doc/image/lenap.jpg b/doc/image/lenap.jpg
new file mode 100644
index 0000000..0e6916d
Binary files /dev/null and b/doc/image/lenap.jpg differ
diff --git a/doc/image/logsigmoid.png b/doc/image/logsigmoid.png
new file mode 100644
index 0000000..f632ed8
Binary files /dev/null and b/doc/image/logsigmoid.png differ
diff --git a/doc/image/logsoftmax.png b/doc/image/logsoftmax.png
new file mode 100644
index 0000000..dec5be5
Binary files /dev/null and b/doc/image/logsoftmax.png differ
diff --git a/doc/image/parameterflattening.png b/doc/image/parameterflattening.png
new file mode 100644
index 0000000..efab4de
Binary files /dev/null and b/doc/image/parameterflattening.png differ
diff --git a/doc/image/parameterflattening.svg b/doc/image/parameterflattening.svg
new file mode 100644
index 0000000..d58d62f
--- /dev/null
+++ b/doc/image/parameterflattening.svg
@@ -0,0 +1,338 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="275.54715mm"
+   height="214.99242mm"
+   viewBox="0 0 976.34814 761.78413"
+   id="svg2"
+   version="1.1"
+   inkscape:version="0.91 r13725"
+   sodipodi:docname="parameterflattening.svg"
+   inkscape:export-filename="/home/ubuntu/git/nn/doc/image/parameterflattening.svg.png"
+   inkscape:export-xdpi="90"
+   inkscape:export-ydpi="90">
+  <defs
+     id="defs4" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="0.7"
+     inkscape:cx="165.78568"
+     inkscape:cy="360.0347"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1920"
+     inkscape:window-height="1024"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <metadata
+     id="metadata7">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-145.10191,-140.95261)">
+    <rect
+       id="rect3336"
+       width="264.20071"
+       height="127.05788"
+       x="498.61389"
+       y="212.40469"
+       style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
+    <rect
+       id="rect3336-7"
+       width="264.20071"
+       height="127.05788"
+       x="499.32819"
+       y="384.54752"
+       style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
+    <rect
+       id="rect3336-7-1"
+       width="264.20071"
+       height="127.05788"
+       x="502.18533"
+       y="554.54755"
+       style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
+    <rect
+       id="rect3336-7-1-4"
+       width="264.20071"
+       height="127.05788"
+       x="499.32816"
+       y="705.97614"
+       style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
+    <rect
+       style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-opacity:1"
+       id="rect4183"
+       width="18.571428"
+       height="631.42859"
+       x="170.00005"
+       y="206.64792" />
+    <rect
+       style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-opacity:1"
+       id="rect4185"
+       width="18.571428"
+       height="631.42859"
+       x="207.14287"
+       y="207.50507" />
+    <rect
+       style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187"
+       width="84.285713"
+       height="41.42857"
+       x="518.57141"
+       y="229.50507" />
+    <rect
+       style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-3"
+       width="84.285713"
+       height="41.42857"
+       x="518.42853"
+       y="283.07651" />
+    <rect
+       style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-8"
+       width="84.285713"
+       height="41.42857"
+       x="519.35712"
+       y="400.57651" />
+    <rect
+       style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-3-3"
+       width="84.285713"
+       height="41.42857"
+       x="519.21423"
+       y="454.14792" />
+    <rect
+       style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-8-7"
+       width="84.285713"
+       height="41.42857"
+       x="526.5"
+       y="572.00507" />
+    <rect
+       style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-3-3-8"
+       width="84.285713"
+       height="41.42857"
+       x="526.35712"
+       y="625.57648" />
+    <rect
+       style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-8-7-8"
+       width="84.285713"
+       height="41.42857"
+       x="529.35718"
+       y="722.00513" />
+    <rect
+       style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
+       id="rect4187-3-3-8-3"
+       width="84.285713"
+       height="41.42857"
+       x="529.21429"
+       y="775.57648" />
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:none;stroke:#000000;stroke-opacity:1"
+       x="1515.7142"
+       y="190.93362"
+       id="text4278"><tspan
+         sodipodi:role="line"
+         id="tspan4280"
+         x="1515.7142"
+         y="190.93362"></tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="635.71429"
+       y="768.07654"
+       id="text4290"><tspan
+         sodipodi:role="line"
+         id="tspan4292"
+         x="635.71429"
+         y="768.07654">conv1</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="627.14288"
+       y="613.79077"
+       id="text4294"><tspan
+         sodipodi:role="line"
+         id="tspan4296"
+         x="627.14288"
+         y="613.79077">conv2</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="632.85718"
+       y="443.79074"
+       id="text4298"><tspan
+         sodipodi:role="line"
+         id="tspan4300"
+         x="632.85718"
+         y="443.79074">conv3</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="631.42865"
+       y="259.50507"
+       id="text4302"><tspan
+         sodipodi:role="line"
+         id="tspan4304"
+         x="631.42865"
+         y="259.50507">conv4</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="528.57141"
+       y="156.64792"
+       id="text4306"><tspan
+         sodipodi:role="line"
+         id="tspan4308"
+         x="528.57141"
+         y="156.64792">Network layers:</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
+       x="145.14287"
+       y="159.79077"
+       id="text4310"><tspan
+         sodipodi:role="line"
+         x="145.14287"
+         y="159.79077"
+         id="tspan4314">flattened tensors:</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;"
+       x="175.71434"
+       y="898.0766"
+       id="text4337"><tspan
+         sodipodi:role="line"
+         id="tspan4339"
+         x="175.71434"
+         y="898.0766">params tensor</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;"
+       x="288.57147"
+       y="815.21936"
+       id="text4341"><tspan
+         sodipodi:role="line"
+         id="tspan4343"
+         x="288.57147"
+         y="815.21936">gradParams</tspan><tspan
+         sodipodi:role="line"
+         x="288.57147"
+         y="840.21936"
+         id="tspan4345">tensor</tspan></text>
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 284.28571,810.93366 228.57143,793.79078"
+       id="path4347"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 191.42857,872.36216 180,843.79076"
+       id="path4349"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 522.85714,230.93364 185.71429,205.21935"
+       id="path4351"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 517.14285,269.50506 187.14286,342.36221"
+       id="path4353"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 521.42857,396.64792 187.14286,340.93364"
+       id="path4355"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 521.42857,440.93364 185.71429,483.79078"
+       id="path4357"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 527.14285,625.21935 225.71428,506.64792"
+       id="path4359"
+       inkscape:connector-curvature="0" />
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="M 522.85714,666.64792 225.71428,659.50506"
+       id="path4361"
+       inkscape:connector-curvature="0" />
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;"
+       x="801.42853"
+       y="649.50513"
+       id="text4363"><tspan
+         sodipodi:role="line"
+         id="tspan4365"
+         x="801.42853"
+         y="649.50513">conv2 grad weight:</tspan><tspan
+         sodipodi:role="line"
+         x="801.42853"
+         y="674.50513"
+         id="tspan4367">view onto flattened gradParams</tspan></text>
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 612.85708,640.9336 180,14.2857"
+       id="path4375"
+       inkscape:connector-curvature="0" />
+    <text
+       xml:space="preserve"
+       style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;"
+       x="791.42853"
+       y="400.93353"
+       id="text4377"><tspan
+         sodipodi:role="line"
+         id="tspan4379"
+         x="791.42853"
+         y="400.93353">conv3 weight:</tspan><tspan
+         sodipodi:role="line"
+         x="791.42853"
+         y="425.93353"
+         id="tspan4381">view onto flattened params</tspan><tspan
+         sodipodi:role="line"
+         x="791.42853"
+         y="450.93353"
+         id="tspan4383">tensor</tspan></text>
+    <path
+       style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+       d="m 782.85708,403.7907 -180,11.4286"
+       id="path4387"
+       inkscape:connector-curvature="0" />
+  </g>
+</svg>
diff --git a/doc/image/power.png b/doc/image/power.png
new file mode 100644
index 0000000..958eeb4
Binary files /dev/null and b/doc/image/power.png differ
diff --git a/doc/image/prelu.png b/doc/image/prelu.png
new file mode 100644
index 0000000..ac751cd
Binary files /dev/null and b/doc/image/prelu.png differ
diff --git a/doc/image/relu.png b/doc/image/relu.png
new file mode 100644
index 0000000..d60d2ab
Binary files /dev/null and b/doc/image/relu.png differ
diff --git a/doc/image/rrelu.png b/doc/image/rrelu.png
new file mode 100644
index 0000000..50e3483
Binary files /dev/null and b/doc/image/rrelu.png differ
diff --git a/doc/image/sigmmoid.png b/doc/image/sigmmoid.png
new file mode 100644
index 0000000..48aad7e
Binary files /dev/null and b/doc/image/sigmmoid.png differ
diff --git a/doc/image/sigmoid.png b/doc/image/sigmoid.png
new file mode 100644
index 0000000..48aad7e
Binary files /dev/null and b/doc/image/sigmoid.png differ
diff --git a/doc/image/softmax.png b/doc/image/softmax.png
new file mode 100644
index 0000000..29c5534
Binary files /dev/null and b/doc/image/softmax.png differ
diff --git a/doc/image/softmin.png b/doc/image/softmin.png
new file mode 100644
index 0000000..d1807a4
Binary files /dev/null and b/doc/image/softmin.png differ
diff --git a/doc/image/softplus.png b/doc/image/softplus.png
new file mode 100644
index 0000000..9132093
Binary files /dev/null and b/doc/image/softplus.png differ
diff --git a/doc/image/softsign.png b/doc/image/softsign.png
new file mode 100644
index 0000000..0805433
Binary files /dev/null and b/doc/image/softsign.png differ
diff --git a/doc/image/sqrt.png b/doc/image/sqrt.png
new file mode 100644
index 0000000..29b1d42
Binary files /dev/null and b/doc/image/sqrt.png differ
diff --git a/doc/image/square.png b/doc/image/square.png
new file mode 100644
index 0000000..c191eaf
Binary files /dev/null and b/doc/image/square.png differ
diff --git a/doc/image/sshrink.png b/doc/image/sshrink.png
new file mode 100644
index 0000000..99c5d11
Binary files /dev/null and b/doc/image/sshrink.png differ
diff --git a/doc/image/tanh.png b/doc/image/tanh.png
new file mode 100644
index 0000000..d2f77aa
Binary files /dev/null and b/doc/image/tanh.png differ
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 0000000..5c36166
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,23 @@
+[![Build Status](https://travis-ci.org/torch/nn.svg?branch=master)](https://travis-ci.org/torch/nn)
+<a name="nn.dok"></a>
+# Neural Network Package #
+
+This package provides an easy and modular way to build and train simple or complex neural networks using [Torch](https://github.com/torch/torch7/blob/master/README.md):
+  
+  * Modules are the bricks used to build neural networks. Each are themselves neural networks, but can be combined with other networks using containers to create complex neural networks:
+    * [Module](module.md#nn.Module) : abstract class inherited by all modules;
+    * [Containers](containers.md#nn.Containers) : container classes like [Sequential](containers.md#nn.Sequential), [Parallel](containers.md#nn.Parallel) and [Concat](containers.md#nn.Concat);
+    * [Transfer functions](transfer.md#nn.transfer.dok) : non-linear functions like [Tanh](transfer.md#nn.Tanh) and [Sigmoid](transfer.md#nn.Sigmoid);
+    * [Simple layers](simple.md#nn.simplelayers.dok) : like [Linear](simple.md#nn.Linear), [Mean](simple.md#nn.Mean), [Max](simple.md#nn.Max) and [Reshape](simple.md#nn.Reshape); 
+    * [Table layers](table.md#nn.TableLayers) : layers for manipulating tables like [SplitTable](table.md#nn.SplitTable), [ConcatTable](table.md#nn.ConcatTable) and [JoinTable](table.md#nn.JoinTable);
+    * [Convolution layers](convolution.md#nn.convlayers.dok) : [Temporal](convolution.md#nn.TemporalModules),  [Spatial](convolution.md#nn.SpatialModules) and [Volumetric](convolution.md#nn.VolumetricModules) convolutions ; 
+  * Criterions compute a gradient according to a given loss function given an input and a target:
+    * [Criterions](criterion.md#nn.Criterions) : a list of all criterions, including [Criterion](criterion.md#nn.Criterion), the abstract class;
+    * [MSECriterion](criterion.md#nn.MSECriterion) : the Mean Squared Error criterion used for regression; 
+    * [ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion) : the Negative Log Likelihood criterion used for classification;
+  * Additional documentation :
+    * [Overview](overview.md#nn.overview.dok) of the package essentials including modules, containers and training;
+    * [Training](training.md#nn.traningneuralnet.dok) : how to train a neural network using [StochasticGradient](training.md#nn.StochasticGradient);
+    * [Testing](testing.md) : how to test your modules.
+    * [Experimental Modules](https://github.com/clementfarabet/lua---nnx/blob/master/README.md) : a package containing experimental modules and criteria.
+
diff --git a/doc/module.md b/doc/module.md
new file mode 100644
index 0000000..ce8c7b4
--- /dev/null
+++ b/doc/module.md
@@ -0,0 +1,437 @@
+<a name="nn.Module"></a>
+## Module ##
+
+`Module` is an abstract class which defines fundamental methods necessary
+for training a neural network. Modules are [serializable](https://github.com/torch/torch7/blob/master/doc/serialization.md#serialization).
+
+Modules contain two state variables: [output](#output) and
+[gradInput](#gradinput).
+
+<a name="nn.Module.forward"></a>
+### [output] forward(input) ###
+
+Takes an `input` object, and computes the corresponding `output` of the
+module. In general `input` and `output` are
+[Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md). However, some special sub-classes
+like [table layers](table.md#nn.TableLayers) might expect something else. Please,
+refer to each module specification for further information.
+
+After a `forward()`, the [output](#output) state variable should
+have been updated to the new value.
+
+It is not advised to override this function. Instead, one should
+implement [updateOutput(input)](#nn.Module.updateOutput)
+function. The forward module in the abstract parent class
+[Module](#nn.Module) will call `updateOutput(input)`.
+
+<a name="nn.Module.backward"></a>
+### [gradInput] backward(input, gradOutput) ###
+
+Performs a _backpropagation step_ through the module, with respect to the
+given `input`.  In general this method makes the assumption
+[forward(input)](#nn.Module.forward) has been called before, _with the same input_.
+This is necessary for optimization reasons. If you do not respect
+this rule, `backward()` will compute incorrect gradients.
+
+In general `input` and `gradOutput`  and `gradInput` are
+[Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md). However, some special sub-classes
+like [table layers](table.md#nn.TableLayers) might expect something else. Please,
+refer to each module specification for further information.
+
+A _backpropagation step_ consists in computing two kinds of gradients
+at `input` given `gradOutput` (gradients with respect to the
+output of the module).  This function simply performs this task using
+two function calls:
+
+  - A function call to [updateGradInput(input, gradOutput)](#nn.Module.updateGradInput).
+  - A function call to [accGradParameters(input,gradOutput,scale)](#nn.Module.accGradParameters).
+
+It is not advised to override this function call in custom classes. It
+is better to override
+[updateGradInput(input, gradOutput)](#nn.Module.updateGradInput) and
+[accGradParameters(input, gradOutput,scale)](#nn.Module.accGradParameters)
+functions.
+
+<a name="nn.Module.updateOutput"></a>
+### updateOutput(input) ###
+
+Computes the output using the current parameter set of the class and
+input. This function returns the result which is stored in the
+[output](#output) field.
+
+<a name="nn.Module.updateGradInput"></a>
+### updateGradInput(input, gradOutput) ###
+
+Computes the gradient of the module with respect to its own
+input. This is returned in `gradInput`. Also, the
+[gradInput](#gradinput) state variable is updated
+accordingly.
+
+<a name="nn.Module.accGradParameters"></a>
+### accGradParameters(input, gradOutput, scale) ###
+
+Computes the gradient of the module with respect to its
+own parameters. Many modules do not perform this step as they do not
+have any parameters. The state variable name for the parameters is
+module dependent. The module is expected to _accumulate_ the
+gradients with respect to the parameters in some variable.
+
+`scale` is a scale factor that is multiplied with the gradParameters before being accumulated.
+
+Zeroing this accumulation is achieved with
+[zeroGradParameters()](#nn.Module.zeroGradParameters) and updating
+the parameters according to this accumulation is done with
+[updateParameters()](#nn.Module.updateParameters).
+
+<a name="nn.Module.zeroGradParameters"></a>
+### zeroGradParameters() ###
+
+If the module has parameters, this will zero the accumulation of the
+gradients with respect to these parameters, accumulated through
+[accGradParameters(input, gradOutput,scale)](#nn.Module.accGradParameters)
+calls. Otherwise, it does nothing.
+
+<a name="nn.Module.updateParameters"></a>
+### updateParameters(learningRate) ###
+
+If the module has parameters, this will update these parameters, according
+to the accumulation of the gradients with respect to these parameters,
+accumulated through [backward()](#nn.Module.backward) calls.
+
+The update is basically:
+```lua
+parameters = parameters - learningRate * gradients_wrt_parameters
+```
+If the module does not have parameters, it does nothing.
+
+<a name="nn.Module.accUpdateGradParameters"></a>
+### accUpdateGradParameters(input, gradOutput, learningRate) ###
+
+This is a convenience method that performs two functions at
+once. Calculates and accumulates the gradients with respect to the
+weights after multiplying with negative of the learning rate
+`learningRate`. Performing these two operations at once is more
+performance efficient and it might be advantageous in certain
+situations.
+
+Keep in mind that, this function uses a simple trick to achieve its
+goal and it might not be valid for a custom module.
+
+Also note that compared to accGradParameters(), the gradients are not retained
+for future use.
+
+```lua
+function Module:accUpdateGradParameters(input, gradOutput, lr)
+   local gradWeight = self.gradWeight
+   local gradBias = self.gradBias
+   self.gradWeight = self.weight
+   self.gradBias = self.bias
+   self:accGradParameters(input, gradOutput, -lr)
+   self.gradWeight = gradWeight
+   self.gradBias = gradBias
+end
+```
+
+As it can be seen, the gradients are accumulated directly into
+weights. This assumption may not be true for a module that computes a
+nonlinear operation.
+
+<a name="nn.Module.share"></a>
+### share(mlp,s1,s2,...,sn) ###
+
+This function modifies the parameters of the module named
+`s1`,..`sn` (if they exist) so that they are shared with (pointers
+to) the parameters with the same names in the given module `mlp`.
+
+The parameters have to be Tensors. This function is typically used if
+you want to have modules that share the same weights or biases.
+
+Note that this function if called on a [Container](containers.md#nn.Containers)
+module will share the same parameters for all the contained modules as
+well.
+
+Example:
+```lua
+
+-- make an mlp
+mlp1=nn.Sequential();
+mlp1:add(nn.Linear(100,10));
+
+-- make a second mlp
+mlp2=nn.Sequential();
+mlp2:add(nn.Linear(100,10));
+
+-- the second mlp shares the bias of the first
+mlp2:share(mlp1,'bias');
+
+-- we change the bias of the first
+mlp1:get(1).bias[1]=99;
+
+-- and see that the second one's bias has also changed..
+print(mlp2:get(1).bias[1])
+
+```
+
+<a name="nn.Module.clone"></a>
+### clone(mlp,...) ###
+
+Creates a deep copy of (i.e. not just a pointer to) the module,
+including the current state of its parameters (e.g. weight, biases
+etc., if any).
+
+If arguments are provided to the `clone(...)` function it also calls
+[share(...)](#nn.Module.share) with those arguments on the cloned
+module after creating it, hence making a deep copy of this module with
+some shared parameters.
+
+Example:
+```lua
+-- make an mlp
+mlp1=nn.Sequential();
+mlp1:add(nn.Linear(100,10));
+
+-- make a copy that shares the weights and biases
+mlp2=mlp1:clone('weight','bias');
+
+-- we change the bias of the first mlp
+mlp1:get(1).bias[1]=99;
+
+-- and see that the second one's bias has also changed..
+print(mlp2:get(1).bias[1])
+
+```
+
+<a name="nn.Module.type"></a>
+### type(type[, tensorCache]) ###
+
+This function converts all the parameters of a module to the given
+`type`. The `type` can be one of the types defined for
+[torch.Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md).
+
+If tensors (or their storages) are shared between multiple modules in a
+network, this sharing will be preserved after type is called.
+
+To preserve sharing between multiple modules and/or tensors, use
+`nn.utils.recursiveType`:
+
+```lua
+-- make an mlp
+mlp1=nn.Sequential();
+mlp1:add(nn.Linear(100,10));
+
+-- make a second mlp
+mlp2=nn.Sequential();
+mlp2:add(nn.Linear(100,10));
+
+-- the second mlp shares the bias of the first
+mlp2:share(mlp1,'bias');
+
+-- mlp1 and mlp2 will be converted to float, and will share bias
+-- note: tensors can be provided as inputs as well as modules
+nn.utils.recursiveType({mlp1, mlp2}, 'torch.FloatTensor')
+```
+
+<a name="nn.Module.float"></a>
+### float([tensorCache]) ###
+
+Convenience method for calling [module:type('torch.FloatTensor'[, tensorCache])](#nn.Module.type)
+
+<a name="nn.Module.double"></a>
+### double([tensorCache]) ###
+
+Convenience method for calling [module:type('torch.DoubleTensor'[, tensorCache])](#nn.Module.type)
+
+<a name="nn.Module.cuda"></a>
+### cuda([tensorCache]) ###
+
+Convenience method for calling [module:type('torch.CudaTensor'[, tensorCache])](#nn.Module.type)
+
+<a name="nn.statevars.dok"></a>
+### State Variables ###
+
+These state variables are useful objects if one wants to check the guts of
+a `Module`. The object pointer is _never_ supposed to change. However, its
+contents (including its size if it is a Tensor) are supposed to change.
+
+In general state variables are
+[Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md).
+However, some special sub-classes
+like [table layers](table.md#nn.TableLayers) contain something else. Please,
+refer to each module specification for further information.
+
+<a name="nn.Module.output"></a>
+#### output ####
+
+This contains the output of the module, computed with the last call of
+[forward(input)](#nn.Module.forward).
+
+<a name="nn.Module.gradInput"></a>
+#### gradInput ####
+
+This contains the gradients with respect to the inputs of the module, computed with the last call of
+[updateGradInput(input, gradOutput)](#nn.Module.updateGradInput).
+
+### Parameters and gradients w.r.t parameters ###
+
+Some modules contain parameters (the ones that we actually want to
+train!). The name of these parameters, and gradients w.r.t these parameters
+are module dependent.
+
+<a name="nn.Module.parameters"></a>
+### [{weights}, {gradWeights}] parameters() ###
+
+This function should return two tables. One for the learnable
+parameters `{weights}` and another for the gradients of the energy
+wrt to the learnable parameters `{gradWeights}`.
+
+Custom modules should override this function if they use learnable
+parameters that are stored in tensors.
+
+<a name="nn.Module.getParameters"></a>
+### [flatParameters, flatGradParameters] getParameters() ###
+
+This function returns two tensors. One for the flattened learnable
+parameters `flatParameters` and another for the gradients of the energy
+wrt to the learnable parameters `flatGradParameters`.
+
+Custom modules should not override this function. They should instead override [parameters(...)](#nn.Module.parameters) which is, in turn, called by the present function.
+
+This function will go over all the weights and gradWeights and make them view into a single tensor (one for weights and one for gradWeights). Since the storage of every weight and gradWeight is changed, this function should be called only once on a given network.
+
+<a name="nn.Module.training"></a>
+### training() ###
+This sets the mode of the Module (or sub-modules) to `train=true`. This is useful for modules like [Dropout](simple.md#nn.Dropout) that have a different behaviour during training vs evaluation.
+
+<a name="nn.Module.evaluate"></a>
+### evaluate() ###
+This sets the mode of the Module (or sub-modules) to `train=false`. This is useful for modules like [Dropout](simple.md#nn.Dropout) that have a different behaviour during training vs evaluation.
+
+<a name="nn.Module.findModules"></a>
+### findModules(typename) ###
+Find all instances of modules in the network of a certain `typename`.  It returns a flattened list of the matching nodes, as well as a flattened list of the container modules for each matching node.
+
+Modules that do not have a parent container (ie, a top level nn.Sequential for instance) will return their `self` as the container.
+
+This function is very helpful for navigating complicated nested networks.  For example, a didactic example might be; if you wanted to print the output size of all `nn.SpatialConvolution` instances:
+
+```lua
+-- Construct a multi-resolution convolution network (with 2 resolutions):
+model = nn.ParallelTable()
+conv_bank1 = nn.Sequential()
+conv_bank1:add(nn.SpatialConvolution(3,16,5,5))
+conv_bank1:add(nn.Threshold())
+model:add(conv_bank1)
+conv_bank2 = nn.Sequential()
+conv_bank2:add(nn.SpatialConvolution(3,16,5,5))
+conv_bank2:add(nn.Threshold())
+model:add(conv_bank2)
+-- FPROP a multi-resolution sample
+input = {torch.rand(3,128,128), torch.rand(3,64,64)}
+model:forward(input)
+-- Print the size of the Threshold outputs
+conv_nodes = model:findModules('nn.SpatialConvolution')
+for i = 1, #conv_nodes do
+  print(conv_nodes[i].output:size())
+end
+```
+
+Another use might be to replace all nodes of a certain `typename` with another.  For instance, if we wanted to replace all `nn.Threshold` with `nn.Tanh` in the model above:
+
+```lua
+threshold_nodes, container_nodes = model:findModules('nn.Threshold')
+for i = 1, #threshold_nodes do
+  -- Search the container for the current threshold node
+  for j = 1, #(container_nodes[i].modules) do
+    if container_nodes[i].modules[j] == threshold_nodes[i] then
+      -- Replace with a new instance
+      container_nodes[i].modules[j] = nn.Tanh()
+    end
+  end
+end
+```
+
+<a name="nn.Module.listModules"></a>
+### listModules() ###
+
+List all Modules instances in a network. Returns a flattened list of modules,
+including container modules (which will be listed first), self, and any other
+component modules.
+
+For example :
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.Linear(10,20))
+mlp:add(nn.Tanh())
+mlp2 = nn.Parallel()
+mlp2:add(mlp)
+mlp2:add(nn.ReLU())
+for i,module in ipairs(mlp2:listModules()) do
+   print(module)
+end
+```
+
+Which will result in the following output :
+
+```lua
+nn.Parallel {
+  input
+    |`-> (1): nn.Sequential {
+    |      [input -> (1) -> (2) -> output]
+    |      (1): nn.Linear(10 -> 20)
+    |      (2): nn.Tanh
+    |    }
+    |`-> (2): nn.ReLU
+     ... -> output
+}
+nn.Sequential {
+  [input -> (1) -> (2) -> output]
+  (1): nn.Linear(10 -> 20)
+  (2): nn.Tanh
+}
+nn.Linear(10 -> 20)
+nn.Tanh
+nn.ReLU
+```
+
+### clearState() ###
+
+Clears intermediate module states as `output`, `gradInput` and others.
+Useful when serializing networks and running low on memory. Internally calls `set()`
+on tensors so it does not break buffer sharing.
+
+
+<a name="nn.Module.apply"></a>
+### apply(function)
+
+Calls provided function on itself and all child modules. This function takes
+module to operate on as a first argument:
+
+```lua
+model:apply(function(module)
+   module.train = true
+end)
+```
+
+In the example above `train` will be set to to `true` in all modules of `model`.
+This is how the `training()` and `evaluate()` functions are implemented.
+
+
+<a name="nn.Module.replace"></a>
+### replace(function)
+
+Similar to apply takes a function which applied to all modules of a model,
+but uses return value to replace the module. Can be used to replace all
+modules of one type to another or remove certain modules.
+
+For example, can be used to remove `nn.Dropout` layers by replacing them with
+`nn.Identity`:
+
+```lua
+model:replace(function(module)
+   if torch.typename(module) == 'nn.Dropout' then
+      return nn.Identity()
+   else
+      return module
+   end
+end)
+```
diff --git a/doc/overview.md b/doc/overview.md
new file mode 100644
index 0000000..25eb092
--- /dev/null
+++ b/doc/overview.md
@@ -0,0 +1,200 @@
+<a name="nn.overview.dok"></a>
+# Overview #
+
+Each module of a network is composed of [Modules](module.md#nn.Modules) and there
+are several sub-classes of `Module` available: container classes like
+[Sequential](containers.md#nn.Sequential), [Parallel](containers.md#nn.Parallel) and
+[Concat](containers.md#nn.Concat) , which can contain simple layers like
+[Linear](simple.md#nn.Linear), [Mean](simple.md#nn.Mean), [Max](simple.md#nn.Max) and
+[Reshape](simple.md#nn.Reshape), as well as [convolutional layers](convolution.md), and [transfer
+functions](transfer.md) like [Tanh](transfer.md#nn.Tanh).
+
+Loss functions are implemented as sub-classes of
+[Criterion](criterion.md#nn.Criterions). They are helpful to train neural network on
+classical tasks.  Common criterions are the Mean Squared Error
+criterion implemented in [MSECriterion](criterion.md#nn.MSECriterion) and the
+cross-entropy criterion implemented in
+[ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion).
+
+Finally, the [StochasticGradient](training.md#nn.StochasticGradient) class provides a
+high level way to train the neural network of choice, even though it is
+easy with a simple for loop to [train a neural network yourself](training.md#nn.DoItYourself).
+
+## Detailed Overview ##
+This section provides a detailed overview of the neural network package. First the omnipresent [Module](#nn.overview.module) is examined, followed by some examples for [combining modules](#nn.overview.plugandplay) together. The last part explores facilities for [training a neural network](#nn.overview.training), and finally some caveats while training networks with [shared parameters](#nn.overview.sharedparams).
+
+<a name="nn.overview.module"></a>
+### Module ###
+
+A neural network is called a [Module](module.md#nn.Module) (or simply
+_module_ in this documentation) in Torch. `Module` is an abstract
+class which defines four main methods:
+
+  * [forward(input)](module.md#nn.Module.forward) which computes the output of the module given the `input` [Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md).
+  * [backward(input, gradOutput)](module.md#nn.Module.backward) which computes the gradients of the module with respect to its own parameters, and its own inputs.
+  * [zeroGradParameters()](module.md#nn.Module.zeroGradParameters) which zeroes the gradient with respect to the parameters of the module.
+  * [updateParameters(learningRate)](module.md#nn.Module.updateParameters) which updates the parameters after one has computed the gradients with `backward()`
+
+It also declares two members:
+
+  * [output](module.md#nn.Module.output) which is the output returned by `forward()`.
+  * [gradInput](module.md#nn.Module.gradInput) which contains the gradients with respect to the input of the module, computed in a `backward()`.
+
+Two other perhaps less used but handy methods are also defined:
+
+  * [share(mlp,s1,s2,...,sn)](module.md#nn.Module.share) which makes this module share the parameters s1,..sn of the module `mlp`. This is useful if you want to have modules that share the same weights.
+  * [clone(...)](module.md#nn.Module.clone) which produces a deep copy of (i.e. not just a pointer to) this Module, including the current state of its parameters (if any).
+
+Some important remarks:
+
+  * `output` contains only valid values after a [forward(input)](module.md#nn.Module.forward).
+  * `gradInput` contains only valid values after a [backward(input, gradOutput)](module.md#nn.Module.backward).
+  * [backward(input, gradOutput)](module.md#nn.Module.backward) uses certain computations obtained during [forward(input)](module.md#nn.Module.forward). You _must_ call `forward()` before calling a `backward()`, on the _same_ `input`, or your gradients are going to be incorrect!
+
+<a name="nn.overview.plugandplay"></a>
+### Plug and play ###
+
+Building a simple neural network can be achieved by constructing an available layer.
+A linear neural network (perceptron!) is built only in one line:
+```lua
+mlp = nn.Linear(10,1) -- perceptron with 10 inputs
+```
+
+More complex neural networks are easily built using container classes
+[Sequential](containers.md#nn.Sequential) and [Concat](containers.md#nn.Concat). `Sequential` plugs
+layer in a feed-forward fully connected manner. `Concat` concatenates in
+one layer several modules: they take the same inputs, and their output is
+concatenated.
+
+Creating a one hidden-layer multi-layer perceptron is thus just as easy as:
+```lua
+mlp = nn.Sequential()
+mlp:add( nn.Linear(10, 25) ) -- 10 input, 25 hidden units
+mlp:add( nn.Tanh() ) -- some hyperbolic tangent transfer function
+mlp:add( nn.Linear(25, 1) ) -- 1 output
+```
+
+Of course, `Sequential` and `Concat` can contain other
+`Sequential` or `Concat`, allowing you to try the craziest neural
+networks you ever dreamt of!
+
+<a name="nn.overview.training"></a>
+### Training a neural network ###
+
+Once you built your neural network, you have to choose a particular
+[Criterion](criterion.md#nn.Criterions) to train it. A criterion is a class which
+describes the cost to be minimized during training.
+
+You can then train the neural network by using the
+[StochasticGradient](training.md#nn.StochasticGradient) class.
+
+```lua
+ criterion = nn.MSECriterion() -- Mean Squared Error criterion
+ trainer = nn.StochasticGradient(mlp, criterion)
+ trainer:train(dataset) -- train using some examples
+```
+
+StochasticGradient expects as a `dataset` an object which implements
+the operator `dataset[index]` and implements the method
+`dataset:size()`. The `size()` method returns the number of
+examples and `dataset[i]` has to return the i-th example.
+
+An `example` has to be an object which implements the operator
+`example[field]`, where `field` might take the value `1` (input
+features) or `2` (corresponding label which will be given to the
+criterion).  The input is usually a Tensor (except if you use special
+kind of gradient modules, like [table layers](table.md#nn.TableLayers)). The
+label type depends on the criterion.  For example, the
+[MSECriterion](criterion.md#nn.MSECriterion) expects a Tensor, but the
+[ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion) expects an integer number (the
+class).
+
+Such a dataset is easily constructed by using Lua tables, but it could be
+any `C` object for example, as long as the required operators/methods
+are implemented.  [See an example](containers.md#nn.DoItStochasticGradient).
+
+`StochasticGradient` being written in `Lua`, it is extremely easy
+to cut-and-paste it and create a variant to it adapted to your needs
+(if the constraints of `StochasticGradient` do not satisfy you).
+
+<a name="nn.overview.lowlevel"></a>
+#### Low Level Training ####
+
+If you want to program the `StochasticGradient` by hand, you
+essentially need to control the use of forwards and backwards through
+the network yourself.  For example, here is the code fragment one
+would need to make a gradient step given an input `x`, a desired
+output `y`, a network `mlp` and a given criterion `criterion`
+and learning rate `learningRate`:
+
+```lua
+function gradUpdate(mlp, x, y, criterion, learningRate) 
+  local pred = mlp:forward(x)
+  local err = criterion:forward(pred, y)
+  local gradCriterion = criterion:backward(pred, y)
+  mlp:zeroGradParameters()
+  mlp:backward(x, gradCriterion)
+  mlp:updateParameters(learningRate)
+end
+```
+For example, if you wish to use your own criterion you can simply replace
+`gradCriterion` with the gradient vector of your criterion of choice.
+
+<a name="nn.overview.sharedparams"></a>
+### A Note on Sharing Parameters ###
+
+By using `:share(...)` and the Container Modules, one can easily create very
+complex architectures. In order to make sure that the network is going to
+train properly, one needs to pay attention to the way the sharing is applied,
+because it might depend on the optimization procedure.
+
+* If you are using an optimization algorithm that iterates over the modules
+of your network (by calling `:updateParameters` for example), only the
+parameters of the network should be shared.
+* If you use the flattened parameter tensor to optimize the network, 
+obtained by calling `:getParameters`, for example for the package `optim`, 
+then you need to share both the parameters and the gradParameters.
+
+Here is an example for the first case:
+
+```lua
+-- our optimization procedure will iterate over the modules, so only share
+-- the parameters
+mlp = nn.Sequential()
+linear = nn.Linear(2,2)
+linear_clone = linear:clone('weight','bias') -- clone sharing the parameters
+mlp:add(linear)
+mlp:add(linear_clone)
+function gradUpdate(mlp, x, y, criterion, learningRate) 
+  local pred = mlp:forward(x)
+  local err = criterion:forward(pred, y)
+  local gradCriterion = criterion:backward(pred, y)
+  mlp:zeroGradParameters()
+  mlp:backward(x, gradCriterion)
+  mlp:updateParameters(learningRate)
+end
+```
+
+And for the second case:
+
+```lua
+-- our optimization procedure will use all the parameters at once, because
+-- it requires the flattened parameters and gradParameters Tensors. Thus,
+-- we need to share both the parameters and the gradParameters
+mlp = nn.Sequential()
+linear = nn.Linear(2,2)
+-- need to share the parameters and the gradParameters as well
+linear_clone = linear:clone('weight','bias','gradWeight','gradBias')
+mlp:add(linear)
+mlp:add(linear_clone)
+params, gradParams = mlp:getParameters()
+function gradUpdate(mlp, x, y, criterion, learningRate, params, gradParams)
+  local pred = mlp:forward(x)
+  local err = criterion:forward(pred, y)
+  local gradCriterion = criterion:backward(pred, y)
+  mlp:zeroGradParameters()
+  mlp:backward(x, gradCriterion)
+  -- adds the gradients to all the parameters at once
+  params:add(-learningRate, gradParams)
+end
+```
diff --git a/doc/simple.md b/doc/simple.md
new file mode 100644
index 0000000..50e5c9f
--- /dev/null
+++ b/doc/simple.md
@@ -0,0 +1,1406 @@
+<a name="nn.simplelayers.dok"></a>
+# Simple layers #
+Simple Modules are used for various tasks like adapting Tensor methods and providing affine transformations :
+
+  * Parameterized Modules :
+    * [Linear](#nn.Linear) : a linear transformation ;
+    * [SparseLinear](#nn.SparseLinear) : a linear transformation with sparse inputs ;
+    * [Bilinear](#nn.Bilinear) : a bilinear transformation of two inputs ;
+    * [PartialLinear](#nn.PartialLinear) : a linear transformation with sparse inputs with the option of only computing a subset ;
+    * [Add](#nn.Add) : adds a bias term to the incoming data ;
+    * [Mul](#nn.Mul) : multiply a single scalar factor to the incoming data ;
+    * [CMul](#nn.CMul) : a component-wise multiplication to the incoming data ;
+    * [Euclidean](#nn.Euclidean) : the euclidean distance of the input to `k` mean centers ;
+    * [WeightedEuclidean](#nn.WeightedEuclidean) : similar to [Euclidean](#nn.Euclidean), but additionally learns a diagonal covariance matrix ;
+    * [Cosine](#nn.Cosine) : the cosine similarity of the input to `k` mean centers ;
+  * Modules that adapt basic Tensor methods :
+    * [Copy](#nn.Copy) : a [copy](https://github.com/torch/torch7/blob/master/doc/tensor.md#torch.Tensor.copy) of the input with [type](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-or-string-typetype) casting ;
+    * [Narrow](#nn.Narrow) : a [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation over a given dimension ;
+    * [Replicate](#nn.Replicate) : [repeats](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-repeattensorresult-sizes) input `n` times along its first dimension ;
+    * [Reshape](#nn.Reshape) : a [reshape](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchreshaperes-x-m-n) of the inputs ;
+    * [View](#nn.View) : a [view](https://github.com/torch/torch7/blob/master/doc/tensor.md#result-viewresult-tensor-sizes) of the inputs ;
+    * [Contiguous](#nn.Contiguous) : [contiguous](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-contiguous) of the inputs ;
+    * [Select](#nn.Select) : a [select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-selectdim-index) over a given dimension ;
+    * [MaskedSelect](#nn.MaskedSelect) : a [masked select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-maskedselect-index) module performs the torch.maskedSelect operation ;
+    * [Index](#nn.Index) : a [index](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-indexdim-index) over a given dimension ;
+    * [Squeeze](#nn.Squeeze) : [squeezes](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-squeezedim) the input;
+    * [Unsqueeze](#nn.Unsqueeze) : unsqueeze the input, i.e., insert singleton dimension;  
+    * [Transpose](#nn.Transpose) : [transposes](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-transposedim1-dim2) the input ;
+  * Modules that adapt mathematical Tensor methods :
+    * [AddConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#nn.AddConstant) : adding a constant ;
+    * [MulConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#nn.MulConstant) : multiplying a constant ;
+    * [Max](#nn.Max) : a [max](https://github.com/torch/torch7/blob/master/doc/maths.md#torch.max) operation over a given dimension ;
+    * [Min](#nn.Min) : a [min](https://github.com/torch/torch7/blob/master/doc/maths.md#torchminresval-resind-x) operation over a given dimension ;
+    * [Mean](#nn.Mean) : a [mean](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchmeanres-x-dim) operation over a given dimension ;
+    * [Sum](#nn.Sum) : a [sum](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchsumres-x) operation over a given dimension ;
+    * [Exp](#nn.Exp) : an element-wise [exp](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchexpres-x) operation ;
+    * [Log](#nn.Log) : an element-wise [log](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchlogres-x) operation ;
+    * [Abs](#nn.Abs) : an element-wise [abs](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchabsres-x) operation ;
+    * [Power](#nn.Power) : an element-wise [pow](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchpowres-x) operation ;
+    * [Square](#nn.Square) : an element-wise square operation ;
+    * [Sqrt](#nn.Sqrt) : an element-wise [sqrt](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchsqrtres-x) operation ;
+    * [Clamp](#nn.Clamp) : an element-wise [clamp](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchclampres-tensor1-min_value-max_value) operation ;
+    * [Normalize](#nn.Normalize) : normalizes the input to have unit `L_p` norm ;
+    * [MM](#nn.MM) : matrix-matrix multiplication (also supports batches of matrices) ;
+  * Miscellaneous Modules :
+    * [BatchNormalization](#nn.BatchNormalization) : mean/std normalization over the mini-batch inputs (with an optional affine transform) ;
+    * [Identity](#nn.Identity) : forward input as-is to output (useful with [ParallelTable](table.md#nn.ParallelTable)) ;
+    * [Dropout](#nn.Dropout) : masks parts of the `input` using binary samples from a [bernoulli](http://en.wikipedia.org/wiki/Bernoulli_distribution) distribution ;
+    * [SpatialDropout](#nn.SpatialDropout) : same as Dropout but for spatial inputs where adjacent pixels are strongly correlated ;
+    * [VolumetricDropout](#nn.VolumetricDropout) : same as Dropout but for volumetric inputs where adjacent voxels are strongly correlated ;
+    * [Padding](#nn.Padding) : adds padding to a dimension ;
+    * [L1Penalty](#nn.L1Penalty) : adds an L1 penalty to an input (for sparsity) ;
+    * [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ;
+
+<a name="nn.Linear"></a>
+## Linear ##
+
+```lua
+module = nn.Linear(inputDimension, outputDimension, [bias = true])
+```
+
+Applies a linear transformation to the incoming data, i.e. `y = Ax + b`. The `input` tensor given in `forward(input)` must be either a vector (1D tensor) or matrix (2D tensor). If the input is a matrix, then each row is assumed to be an input sample of given batch. The layer can be used without bias by setting `bias = false`.
+
+You can create a layer in the following way:
+
+```lua
+ module = nn.Linear(10, 5)  -- 10 inputs, 5 outputs
+```
+
+Usually this would be added to a network of some kind, e.g.:
+
+```lua
+ mlp = nn.Sequential()
+ mlp:add(module)
+```
+
+The weights and biases (_A_ and _b_) can be viewed with:
+
+```lua
+ print(module.weight)
+ print(module.bias)
+```
+
+The gradients for these weights can be seen with:
+
+```lua
+ print(module.gradWeight)
+ print(module.gradBias)
+```
+
+As usual with `nn` modules, applying the linear transformation is performed with:
+
+```lua
+x = torch.Tensor(10) -- 10 inputs
+y = module:forward(x)
+```
+
+<a name="nn.SparseLinear"></a>
+## SparseLinear ##
+
+```lua
+module = nn.SparseLinear(inputDimension, outputDimension)
+```
+
+Applies a linear transformation to the incoming sparse data, i.e. `y = Ax + b`. The `input` tensor given in `forward(input)` must be a sparse vector represented as 2D tensor of the form torch.Tensor(N, 2) where the pairs represent indices and values.
+The SparseLinear layer is useful when the number of input dimensions is very large and the input data is sparse.
+
+You can create a sparse linear layer in the following way:
+
+```lua
+module = nn.SparseLinear(10000, 2)  -- 10000 inputs, 2 outputs
+```
+
+The sparse linear module may be used as part of a larger network, and apart from the form of the input, [SparseLinear](#nn.SparseLinear) operates in exactly the same way as the [Linear](#nn.Linear) layer.
+
+A sparse input vector may be created as so...
+
+```lua
+x = torch.Tensor({ {1, 0.1}, {2, 0.3}, {10, 0.3}, {31, 0.2} })
+
+ print(x)
+
+  1.0000   0.1000
+  2.0000   0.3000
+ 10.0000   0.3000
+ 31.0000   0.2000
+[torch.Tensor of dimension 4x2]
+```
+
+The first column contains indices, the second column contains values in a vector where all other elements are zeros. The indices should not exceed the stated dimensions of the input to the layer (10000 in the example).
+
+<a name="nn.Bilinear"></a>
+## Bilinear ##
+
+```lua
+module = nn.Bilinear(inputDimension1, inputDimension2, outputDimension, [bias = true])
+```
+
+Applies a bilinear transformation to the incoming data, i.e. `\forall k: y_k = x_1 A_k x_2 + b`. The `input` tensor given in `forward(input)` is a table containing both inputs `x_1` and `x_2`, which are tensors of size `N x inputDimension1`
+and `N x inputDimension2`, respectively. The layer can be trained without biases by setting `bias = false`.
+
+You can create a layer in the following way:
+
+```lua
+ module = nn.Bilinear(10, 5, 3)  -- 10 and 5 inputs, 3 outputs
+```
+
+Input data for this layer would look as follows:
+```lua
+ input = {torch.randn(128, 10), torch.randn(128, 5)}  -- 128 input examples
+ module:forward(input)
+```
+
+<a name="nn.PartialLinear"></a>
+## PartialLinear ##
+
+```lua
+module = nn.PartialLinear(inputSize, outputSize, [bias = true])
+```
+
+PartialLinear is a Linear layer that allows the user to set a collection of
+column indices. When the column indices are set, the layer will behave like a
+Linear layer that only has those columns. Meanwhile, all parameters are
+preserved, so resetting the PartialLinear layer will result in a module that
+behaves just like a regular Linear layer.
+
+This module is useful, for instance, when you want to do forward-backward on
+only a subset of a Linear layer during training but use the full Linear layer
+at test time.
+
+You can create a layer in the following way:
+
+```lua
+ module = nn.PartialLinear(5, 3)  -- 5 inputs, 3 outputs
+```
+
+Input data for this layer would look as follows:
+```lua
+ input = torch.randn(128, 5)  -- 128 input examples
+ module:forward(input)
+```
+
+One can set the partition of indices to compute using the function `setPartition(indices)` where `indices` is a tensor containing the indices to compute.
+```lua
+module = nn.PartialLinear(5, 3)  -- 5 inputs, 3 outputs
+module:setPartition(torch.Tensor({2,4})) -- only compute the 2nd and 4th indices out of a total of 5 indices
+```
+
+One can reset the partition via the `resetPartition()` function that resets the partition to compute all indices, making its behaviour equivalent to `nn.Linear`.
+
+<a name="nn.Dropout"></a>
+## Dropout ##
+
+```lua
+module = nn.Dropout(p)
+```
+
+During training, `Dropout` masks parts of the `input` using binary samples from a [bernoulli](http://en.wikipedia.org/wiki/Bernoulli_distribution) distribution.
+Each `input` element has a probability of `p` of being dropped, i.e having its commensurate output element be zero. This has proven an effective technique for regularization and preventing the co-adaptation of neurons (see [Hinton et al. 2012](http://arxiv.org/abs/1207.0580)).
+
+Furthermore, the outputs are scaled by a factor of `1/(1-p)` during training. This allows the `input` to be simply forwarded as-is during evaluation.
+
+In this example, we demonstrate how the call to [forward](module.md#output-forwardinput) samples different `outputs` to dropout (the zeros) given the same `input`:
+
+```lua
+module = nn.Dropout()
+
+> x = torch.Tensor{{1, 2, 3, 4}, {5, 6, 7, 8}}
+
+> module:forward(x)
+  2   0   0   8
+ 10   0  14   0
+[torch.DoubleTensor of dimension 2x4]
+
+> module:forward(x)
+  0   0   6   0
+ 10   0   0   0
+[torch.DoubleTensor of dimension 2x4]
+```
+
+[Backward](module.md#gradinput-backwardinput-gradoutput) drops out the gradients at the same location:
+
+```lua
+> module:forward(x)
+  0   4   0   0
+ 10  12   0  16
+[torch.DoubleTensor of dimension 2x4]
+
+> module:backward(x, x:clone():fill(1))
+ 0  2  0  0
+ 2  2  0  2
+[torch.DoubleTensor of dimension 2x4]
+```
+
+In both cases the `gradOutput` and `input` are scaled by `1/(1-p)`, which in this case is `2`.
+
+During [evaluation](module.md#evaluate), `Dropout` does nothing more than forward the input such that all elements of the input are considered.
+
+```lua
+> module:evaluate()
+
+> module:forward(x)
+ 1  2  3  4
+ 5  6  7  8
+[torch.DoubleTensor of dimension 2x4]
+```
+
+We can return to training our model by first calling [Module:training()](module.md#training):
+
+```lua
+> module:training()
+
+> return module:forward(x)
+  2   4   6   0
+  0   0   0  16
+[torch.DoubleTensor of dimension 2x4]
+```
+
+When used, `Dropout` should normally be applied to the input of parameterized [Modules](module.md#nn.Module) like [Linear](#nn.Linear) or [SpatialConvolution](convolution.md#nn.SpatialConvolution). A `p` of `0.5` (the default) is usually okay for hidden layers. `Dropout` can sometimes be used successfully on the dataset inputs with a `p` around `0.2`. It sometimes works best following [Transfer](transfer.md) Modules like [ReLU](transfer.md#nn.ReLU). All this depends a great deal on the d [...]
+
+<a name="nn.SpatialDropout"></a>
+## SpatialDropout ##
+
+`module` = `nn.SpatialDropout(p)`
+
+This version performs the same function as ```nn.Dropout```, however it assumes the 2 right-most dimensions of the input are spatial, performs one Bernoulli trial per output feature when training, and extends this dropout value across the entire feature map.
+
+As described in the paper "Efficient Object Localization Using Convolutional Networks" (http://arxiv.org/abs/1411.4280), if adjacent pixels within feature maps are strongly correlated (as is normally the case in early convolution layers) then iid dropout will not regularize the activations and will otherwise just result in an effective learning rate decrease.  In this case, ```nn.SpatialDropout``` will help promote independence between feature maps and should be used instead.
+
+```nn.SpatialDropout``` accepts 3D or 4D inputs.  If the input is 3D then a layout of (features x height x width) is assumed and for 4D (batch x features x height x width) is assumed.
+
+<a name="nn.VolumetricDropout"></a>
+## VolumetricDropout ##
+
+`module` = `nn.VolumetricDropout(p)`
+
+This version performs the same function as ```nn.Dropout```, however it assumes the 3 right-most dimensions of the input are spatial, performs one Bernoulli trial per output feature when training, and extends this dropout value across the entire feature map.
+
+As described in the paper "Efficient Object Localization Using Convolutional Networks" (http://arxiv.org/abs/1411.4280), if adjacent voxels within feature maps are strongly correlated (as is normally the case in early convolution layers) then iid dropout will not regularize the activations and will otherwise just result in an effective learning rate decrease.  In this case, ```nn.VolumetricDropout``` will help promote independence between feature maps and should be used instead.
+
+```nn.VolumetricDropout``` accepts 4D or 5D inputs.  If the input is 4D then a layout of (features x time x height x width) is assumed and for 5D (batch x features x time x height x width) is assumed.
+
+<a name="nn.Abs"></a>
+## Abs ##
+
+```lua
+module = nn.Abs()
+```
+
+```lua
+m = nn.Abs()
+ii = torch.linspace(-5, 5)
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
+gnuplot.grid(true)
+```
+
+![](image/abs.png)
+
+
+<a name='nn.Add'></a>
+## Add ##
+
+```lua
+module = nn.Add(inputDimension, scalar)
+```
+
+Applies a bias term to the incoming data, i.e. `y_i = x_i + b_i`, or if `scalar = true` then uses a single bias term, `y_i = x_i + b`.
+
+Example:
+
+```lua
+y = torch.Tensor(5)
+mlp = nn.Sequential()
+mlp:add(nn.Add(5))
+
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+   return err
+end
+
+for i = 1, 10000 do
+   x = torch.rand(5)
+   y:copy(x);
+   for i = 1, 5 do y[i] = y[i] + i; end
+   err = gradUpdate(mlp, x, y, nn.MSECriterion(), 0.01)
+end
+
+print(mlp:get(1).bias)
+```
+
+gives the output:
+
+```lua
+ 1.0000
+ 2.0000
+ 3.0000
+ 4.0000
+ 5.0000
+[torch.Tensor of dimension 5]
+```
+
+i.e. the network successfully learns the input `x` has been shifted to produce the output `y`.
+
+
+<a name="nn.Mul"></a>
+## Mul ##
+
+```lua
+module = nn.Mul()
+```
+
+Applies a _single_ scaling factor to the incoming data, i.e. `y = w x`, where `w` is a scalar.
+
+Example:
+
+```lua
+y = torch.Tensor(5)
+mlp = nn.Sequential()
+mlp:add(nn.Mul())
+
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+   return err
+end
+
+for i = 1, 10000 do
+   x = torch.rand(5)
+   y:copy(x)
+   y:mul(math.pi)
+   err = gradUpdate(mlp, x, y, nn.MSECriterion(), 0.01)
+end
+
+print(mlp:get(1).weight)
+```
+
+gives the output:
+
+```lua
+ 3.1416
+[torch.Tensor of dimension 1]
+```
+
+i.e. the network successfully learns the input `x` has been scaled by pi.
+
+<a name='nn.CMul'></a>
+## CMul ##
+
+```lua
+module = nn.CMul(size)
+```
+
+Applies a component-wise multiplication to the incoming data, i.e. `y_i = w_i * x_i`. Argument `size` can be one or many numbers (sizes) or a `torch.LongStorage`. For example, `nn.CMul(3,4,5)` is equivalent to `nn.CMul(torch.LongStorage{3,4,5})`.
+
+Example:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.CMul(5))
+
+y = torch.Tensor(5)
+sc = torch.Tensor(5)
+for i = 1, 5 do sc[i] = i; end -- scale input with this
+
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+   return err
+end
+
+for i = 1, 10000 do
+   x = torch.rand(5)
+   y:copy(x)
+   y:cmul(sc)
+   err = gradUpdate(mlp, x, y, nn.MSECriterion(), 0.01)
+end
+
+print(mlp:get(1).weight)
+```
+
+gives the output:
+
+```lua
+ 1.0000
+ 2.0000
+ 3.0000
+ 4.0000
+ 5.0000
+[torch.Tensor of dimension 5]
+```
+
+i.e. the network successfully learns the input `x` has been scaled by those scaling factors to produce the output `y`.
+
+
+<a name="nn.Max"></a>
+## Max ##
+
+```lua
+module = nn.Max(dimension, nInputDim)
+```
+
+Applies a max operation over dimension `dimension`.
+Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output.
+When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dimension` to apply the max operation will be dimension `dimension + 1`.
+
+<a name="nn.Min"></a>
+## Min ##
+
+```lua
+module = nn.Min(dimension, nInputDim)
+```
+
+Applies a min operation over dimension `dimension`.
+Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output.
+When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dimension` to apply the min operation will be dimension `dimension + 1`.
+
+<a name="nn.Mean"></a>
+## Mean ##
+
+```lua
+module = nn.Mean(dimension, nInputDim)
+```
+
+Applies a mean operation over dimension `dimension`.
+Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output.
+When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dimension` to apply the mean operation will be dimension `dimension + 1`.
+This module is based on [nn.Sum](#nn.Sum).
+
+<a name="nn.Sum"></a>
+## Sum ##
+
+```lua
+module = nn.Sum(dimension, nInputDim, sizeAverage)
+```
+
+Applies a sum operation over dimension `dimension`.
+Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output.
+When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dimension` to apply the sum operation will be dimension `dimension + 1`.
+Negative indexing is allowed by providing a negative value to `nInputDim`.
+When `sizeAverage` is provided, the sum is divided by the size of the input in this `dimension`. This is equivalent to the mean operation performed by the [nn.Mean](#nn.Mean) module.
+
+<a name="nn.Euclidean"></a>
+## Euclidean ##
+
+```lua
+module = nn.Euclidean(inputSize,outputSize)
+```
+
+Outputs the Euclidean distance of the input to `outputSize` centers, i.e. this layer has the weights `w_j`,  for `j` = `1`,..,`outputSize`, where `w_j` are vectors of dimension `inputSize`.
+
+The distance `y_j` between center `j` and input `x` is formulated as `y_j = || w_j - x ||`.
+
+<a name="nn.WeightedEuclidean"></a>
+## WeightedEuclidean ##
+
+```lua
+module = nn.WeightedEuclidean(inputSize,outputSize)
+```
+
+This module is similar to [Euclidean](#nn.Euclidean), but additionally learns a separate diagonal covariance matrix across the features of the input space _for each center_.
+
+In other words, for each of the `outputSize` centers `w_j`, there is a diagonal covariance matrix `c_j`, for `j` = `1`,..,`outputSize`, where the `c_j` are stored as vectors of size `inputSize`.
+
+The distance `y_j` between center `j` and input `x` is formulated as `y_j = || c_j * (w_j - x) ||`.
+
+<a name="nn.Cosine"></a>
+## Cosine ##
+
+```lua
+module = nn.Cosine(inputSize,outputSize)
+```
+
+Outputs the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) of the input to `outputSize` centers, i.e. this layer has the weights `w_j`,  for `j` = `1`,..,`outputSize`, where `w_j` are vectors of dimension `inputSize`.
+
+The distance `y_j` between center `j` and input `x` is formulated as `y_j = (x · w_j) / ( || w_j || * || x || )`.
+
+
+<a name="nn.Identity"></a>
+## Identity ##
+
+```lua
+module = nn.Identity()
+```
+
+
+Creates a module that returns whatever is input to it as output.
+This is useful when combined with the module [ParallelTable](table.md#nn.ParallelTable) in case you do not wish to do anything to one of the input Tensors.
+
+Example:
+
+```lua
+mlp = nn.Identity()
+print(mlp:forward(torch.ones(5, 2)))
+```
+
+gives the output:
+
+```lua
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+[torch.Tensor of dimension 5x2]
+```
+
+Here is a more useful example, where one can implement a network which also computes a Criterion using this module:
+
+```lua
+pred_mlp = nn.Sequential()  -- A network that makes predictions given x.
+pred_mlp:add(nn.Linear(5, 4))
+pred_mlp:add(nn.Linear(4, 3))
+
+xy_mlp = nn.ParallelTable() -- A network for predictions and for keeping the
+xy_mlp:add(pred_mlp)        -- true label for comparison with a criterion
+xy_mlp:add(nn.Identity())   -- by forwarding both x and y through the network.
+
+mlp = nn.Sequential()       -- The main network that takes both x and y.
+mlp:add(xy_mlp)             -- It feeds x and y to parallel networks;
+cr = nn.MSECriterion()
+cr_wrap = nn.CriterionTable(cr)
+mlp:add(cr_wrap)            -- and then applies the criterion.
+
+for i = 1, 100 do           -- Do a few training iterations
+   x = torch.ones(5)        -- Make input features.
+   y = torch.Tensor(3)
+   y:copy(x:narrow(1,1,3))  -- Make output label.
+   err = mlp:forward{x,y}   -- Forward both input and output.
+   print(err)               -- Print error from criterion.
+
+   mlp:zeroGradParameters() -- Do backprop...
+   mlp:backward({x, y})
+   mlp:updateParameters(0.05)
+end
+```
+
+<a name="nn.Copy"></a>
+## Copy ##
+
+```lua
+module = nn.Copy(inputType, outputType, [forceCopy, dontCast])
+```
+
+This layer copies the input to output with type casting from `inputType` to `outputType`. Unless `forceCopy` is true, when the first two arguments are the same, the input isn't copied, only transferred as the output. The default `forceCopy` is false.
+When `dontCast` is true, a call to `nn.Copy:type(type)` will not cast the module's `output` and `gradInput` Tensors to the new type. The default is false.
+
+<a name="nn.Narrow"></a>
+## Narrow ##
+
+```lua
+module = nn.Narrow(dimension, offset, length)
+```
+
+Narrow is application of [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation in a module. The module further supports a negative `length` in order to handle inputs with an unknown size.
+
+```lua
+> x = torch.rand(4, 5)
+
+> x
+ 0.3695  0.2017  0.4485  0.4638  0.0513
+ 0.9222  0.1877  0.3388  0.6265  0.5659
+ 0.8785  0.7394  0.8265  0.9212  0.0129
+ 0.2290  0.7971  0.2113  0.1097  0.3166
+[torch.DoubleTensor of size 4x5]
+
+> nn.Narrow(1, 2, 3):forward(x)
+ 0.9222  0.1877  0.3388  0.6265  0.5659
+ 0.8785  0.7394  0.8265  0.9212  0.0129
+ 0.2290  0.7971  0.2113  0.1097  0.3166
+[torch.DoubleTensor of size 3x5]
+
+> nn.Narrow(1, 2, -1):forward(x)
+ 0.9222  0.1877  0.3388  0.6265  0.5659
+ 0.8785  0.7394  0.8265  0.9212  0.0129
+ 0.2290  0.7971  0.2113  0.1097  0.3166
+[torch.DoubleTensor of size 3x5]
+
+> nn.Narrow(1, 2, 2):forward(x)
+ 0.9222  0.1877  0.3388  0.6265  0.5659
+ 0.8785  0.7394  0.8265  0.9212  0.0129
+[torch.DoubleTensor of size 2x5]
+
+> nn.Narrow(1, 2, -2):forward(x)
+ 0.9222  0.1877  0.3388  0.6265  0.5659
+ 0.8785  0.7394  0.8265  0.9212  0.0129
+[torch.DoubleTensor of size 2x5]
+
+> nn.Narrow(2, 2, 3):forward(x)
+ 0.2017  0.4485  0.4638
+ 0.1877  0.3388  0.6265
+ 0.7394  0.8265  0.9212
+ 0.7971  0.2113  0.1097
+[torch.DoubleTensor of size 4x3]
+
+> nn.Narrow(2, 2, -2):forward(x)
+ 0.2017  0.4485  0.4638
+ 0.1877  0.3388  0.6265
+ 0.7394  0.8265  0.9212
+ 0.7971  0.2113  0.1097
+[torch.DoubleTensor of size 4x3]
+```
+
+<a name="nn.Replicate"></a>
+## Replicate ##
+
+```lua
+module = nn.Replicate(nFeature [, dim, ndim])
+```
+
+This class creates an output where the input is replicated `nFeature` times along dimension `dim` (default 1).
+There is no memory allocation or memory copy in this module.
+It sets the [stride](https://github.com/torch/torch7/blob/master/doc/tensor.md#torch.Tensor.stride) along the `dim`th dimension to zero.
+When provided, `ndim` should specify the number of non-batch dimensions.
+This allows the module to replicate the same non-batch dimension `dim` for both batch and non-batch `inputs`.
+
+```lua
+> x = torch.linspace(1, 5, 5)
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of dimension 5]
+
+> m = nn.Replicate(3)
+> o = m:forward(x)
+ 1  2  3  4  5
+ 1  2  3  4  5
+ 1  2  3  4  5
+[torch.DoubleTensor of dimension 3x5]
+
+> x:fill(13)
+ 13
+ 13
+ 13
+ 13
+ 13
+[torch.DoubleTensor of dimension 5]
+
+> print(o)
+ 13  13  13  13  13
+ 13  13  13  13  13
+ 13  13  13  13  13
+[torch.DoubleTensor of dimension 3x5]
+```
+
+
+<a name="nn.Reshape"></a>
+## Reshape ##
+
+```lua
+module = nn.Reshape(dimension1, dimension2, ... [, batchMode])
+```
+
+
+Reshapes an `nxpxqx..`  Tensor into a `dimension1xdimension2x...` Tensor, taking the elements row-wise.
+
+The optional last argument `batchMode`, when `true` forces the first dimension of the input to be considered the batch dimension, and thus keep its size fixed. This is necessary when dealing with batch sizes of one. When `false`, it forces the entire input (including the first dimension) to be reshaped to the input size. Default `batchMode=nil`, which means that the module considers inputs with more elements than the product of the provided sizes, i.e. `dimension1xdimension2x...`, to be batches.
+
+Example:
+
+```lua
+> x = torch.Tensor(4,4)
+> for i = 1, 4 do
+>    for j = 1, 4 do
+>       x[i][j] = (i-1)*4+j
+>    end
+> end
+> print(x)
+
+  1   2   3   4
+  5   6   7   8
+  9  10  11  12
+ 13  14  15  16
+[torch.Tensor of dimension 4x4]
+
+> print(nn.Reshape(2,8):forward(x))
+
+  1   2   3   4   5   6   7   8
+  9  10  11  12  13  14  15  16
+[torch.Tensor of dimension 2x8]
+
+> print(nn.Reshape(8,2):forward(x))
+
+  1   2
+  3   4
+  5   6
+  7   8
+  9  10
+ 11  12
+ 13  14
+ 15  16
+[torch.Tensor of dimension 8x2]
+
+> print(nn.Reshape(16):forward(x))
+
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+[torch.Tensor of dimension 16]
+
+> y = torch.Tensor(1, 4):fill(0)
+> print(y)
+
+ 0  0  0  0
+ [torch.DoubleTensor of dimension 1x4]
+
+> print(nn.Reshape(4):forward(y))
+
+ 0  0  0  0
+ [torch.DoubleTensor of dimension 1x4]
+
+> print(nn.Reshape(4, false):forward(y))
+
+ 0
+ 0
+ 0
+ 0
+ [torch.DoubleTensor of dimension 4]
+
+```
+
+<a name="nn.View"></a>
+## View ##
+
+```lua
+module = nn.View(sizes)
+```
+
+This module creates a new view of the input tensor using the `sizes` passed to the constructor. The parameter `sizes` can either be a `LongStorage` or numbers.
+The method `setNumInputDims()` allows to specify the expected number of dimensions of the inputs of the modules. This makes it possible to use minibatch inputs when using a size `-1` for one of the dimensions.
+The method `resetSize(sizes)` allows to reset the view size of the module after initialization.
+
+Example 1:
+
+```lua
+> x = torch.Tensor(4, 4)
+> for i = 1, 4 do
+>    for j = 1, 4 do
+>       x[i][j] = (i-1)*4+j
+>    end
+> end
+> print(x)
+
+  1   2   3   4
+  5   6   7   8
+  9  10  11  12
+ 13  14  15  16
+[torch.Tensor of dimension 4x4]
+
+> print(nn.View(2, 8):forward(x))
+
+  1   2   3   4   5   6   7   8
+  9  10  11  12  13  14  15  16
+[torch.DoubleTensor of dimension 2x8]
+
+> print(nn.View(torch.LongStorage{8,2}):forward(x))
+
+  1   2
+  3   4
+  5   6
+  7   8
+  9  10
+ 11  12
+ 13  14
+ 15  16
+[torch.DoubleTensor of dimension 8x2]
+
+> print(nn.View(16):forward(x))
+
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+ 10
+ 11
+ 12
+ 13
+ 14
+ 15
+ 16
+[torch.DoubleTensor of dimension 16]
+```
+
+Example 2:
+```lua
+> input = torch.Tensor(2, 3)
+> minibatch = torch.Tensor(5, 2, 3)
+> m = nn.View(-1):setNumInputDims(2)
+> print(#m:forward(input))
+
+ 6
+[torch.LongStorage of size 1]
+
+> print(#m:forward(minibatch))
+
+ 5
+ 6
+[torch.LongStorage of size 2]
+```
+
+<a name="nn.Contiguous"></a>
+## Contiguous ##
+
+Is used to make `input`, `gradOutput` or both contiguous, corresponds to
+`torch.contiguous` function. Only does copy and allocation if `input` or
+`gradOutput` is not contiguous, otherwise passes the same tensor.
+
+<a name="nn.Select"></a>
+## Select ##
+
+```lua
+module = nn.Select(dim, index)
+```
+
+Selects a dimension and index of a  `nxpxqx..`  Tensor.
+
+Example:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.Select(1, 3))
+
+x = torch.randn(10, 5)
+print(x)
+print(mlp:forward(x))
+```
+
+gives the output:
+
+```lua
+ 0.9720 -0.0836  0.0831 -0.2059 -0.0871
+ 0.8750 -2.0432 -0.1295 -2.3932  0.8168
+ 0.0369  1.1633  0.6483  1.2862  0.6596
+ 0.1667 -0.5704 -0.7303  0.3697 -2.2941
+ 0.4794  2.0636  0.3502  0.3560 -0.5500
+-0.1898 -1.1547  0.1145 -1.1399  0.1711
+-1.5130  1.4445  0.2356 -0.5393 -0.6222
+-0.6587  0.4314  1.1916 -1.4509  1.9400
+ 0.2733  1.0911  0.7667  0.4002  0.1646
+ 0.5804 -0.5333  1.1621  1.5683 -0.1978
+[torch.Tensor of dimension 10x5]
+
+ 0.0369
+ 1.1633
+ 0.6483
+ 1.2862
+ 0.6596
+[torch.Tensor of dimension 5]
+```
+
+This can be used in conjunction with [Concat](containers.md#nn.Concat) to emulate the behavior of [Parallel](containers.md#nn.Parallel), or to select various parts of an input Tensor to perform operations on. Here is a fairly complicated example:
+
+```lua
+mlp = nn.Sequential()
+c = nn.Concat(2)
+for i = 1, 10 do
+   local t = nn.Sequential()
+   t:add(nn.Select(1, i))
+   t:add(nn.Linear(3, 2))
+   t:add(nn.Reshape(2, 1))
+   c:add(t)
+end
+mlp:add(c)
+
+pred = mlp:forward(torch.randn(10, 3))
+print(pred)
+
+for i = 1, 10000 do     -- Train for a few iterations
+   x = torch.randn(10, 3)
+   y = torch.ones(2, 10)
+   pred = mlp:forward(x)
+
+   criterion = nn.MSECriterion()
+   err = criterion:forward(pred, y)
+   gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(0.01)
+   print(err)
+end
+```
+
+<a name="nn.MaskedSelect"></a>
+## MaskedSelect ##
+
+```lua
+module = nn.MaskedSelect()
+```
+
+Performs a [torch.MaskedSelect](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-maskedselectmask) on a Tensor.  The mask is supplied as a tabular argument with the input on the forward and backward passes.
+
+Example:
+
+```lua
+ms = nn.MaskedSelect()
+mask = torch.ByteTensor({{1, 0}, {0, 1}})
+input = torch.DoubleTensor({{10, 20}, {30, 40}})
+print(input)
+print(mask)
+out = ms:forward({input, mask})
+print(out)
+gradIn = ms:backward({input, mask}, out)
+print(gradIn[1])
+```
+
+Gives the output:
+
+```lua
+10  20
+30  40
+[torch.DoubleTensor of size 2x2]
+
+1  0
+0  1
+[torch.ByteTensor of size 2x2]
+
+10
+40
+[torch.DoubleTensor of size 2]
+
+10  0
+0  40
+[torch.DoubleTensor of size 2x2]
+```
+
+<a name="nn.Index"></a>
+## Index ##
+
+```lua
+module = nn.Index(dim)
+```
+
+Applies the Tensor [index](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-indexdim-index) operation along the given dimension. So
+
+```lua
+nn.Index(dim):forward{t,i}
+```
+gives the same output as
+```lua
+t:index(dim, i)
+```
+
+<a name="nn.Squeeze"></a>
+## Squeeze ##
+
+```lua
+module = nn.Squeeze([dim, numInputDims])
+```
+Applies the Tensor [squeeze](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-squeezedim) operation. So
+
+```lua
+nn.Squeeze():forward(t)
+```
+gives the same output as
+```lua
+t:squeeze()
+```
+Setting `numInputDims` allows using this module on batches.
+
+<a name="nn.Unsqueeze"></a>
+## Unsqueeze ##
+
+```lua
+module = nn.Unsqueeze(pos [, numInputDims])
+```
+Insert singleton dim (i.e., dimension 1) at position `pos`. 
+For an `input` with `dim = input:dim()`, there are `dim + 1` possible positions to insert the singleton dimension.
+For example, if `input` is `3` dimensional tensor in size `p x q x r`, then the singleton dim can be inserted at the following `4` positions
+```
+pos = 1: 1 x p x q x r
+pos = 2: p x 1 x q x r
+pos = 3: p x q x 1 x r
+pos = 4: p x q x r x 1
+```
+
+Example:
+```lua
+input = torch.Tensor(2, 4, 3) -- input: 2 x 4 x 3
+
+-- insert at head
+m = nn.Unsqueeze(1)
+m:forward(input) -- output: 1 x 2 x 4 x 3
+
+-- insert at tail
+m = nn.Unsqueeze(4)
+m:forward(input) -- output: 2 x 4 x 3 x 1
+
+-- insert in between
+m = nn.Unsqueeze(2)
+m:forward(input) -- output: 2 x 1 x 4 x 3
+
+-- the input size can vary across calls
+input2 = torch.Tensor(3, 5, 7) -- input2: 3 x 5 x 7
+m:forward(input2) -- output: 3 x 1 x 5 x 7
+```
+
+Indicate the expected input feature map dimension by specifying `numInputDims`. 
+This allows the module to work with mini-batch. Example:
+```lua
+b = 5 -- batch size 5
+input = torch.Tensor(b, 2, 4, 3) -- input: b x 2 x 4 x 3
+numInputDims = 3 -- input feature map should be the last 3 dims
+
+m = nn.Unsqueeze(4, numInputDims)
+m:forward(input) -- output: b x 2 x 4 x 3 x 1
+
+m = nn.Unsqueeze(2):setNumInputDims(numInputDims)
+m:forward(input) -- output: b x 2 x 1 x 4 x 3
+```
+
+<a name="nn.Transpose"></a>
+## Transpose ##
+
+```lua
+module = nn.Transpose({dim1, dim2} [, {dim3, dim4}, ...])
+```
+
+Swaps dimension `dim1` with `dim2`, then `dim3` with `dim4`, and so on. So
+
+```lua
+nn.Transpose({dim1, dim2}, {dim3, dim4}):forward(t)
+```
+
+gives the same output as
+
+```lua
+t:transpose(dim1, dim2)
+t:transpose(dim3, dim4)
+```
+
+<a name="nn.Exp"></a>
+## Exp ##
+
+```lua
+module = nn.Exp()
+```
+
+Applies the `exp` function element-wise to the input Tensor, thus outputting a Tensor of the same dimension.
+
+```lua
+ii = torch.linspace(-2, 2)
+m = nn.Exp()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii,go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
+gnuplot.grid(true)
+```
+
+![](image/exp.png)
+
+
+<a name="nn.Log"></a>
+## Log ##
+
+```lua
+module = nn.Log()
+```
+
+Applies the `log` function element-wise to the input Tensor, thus outputting a Tensor of the same dimension.
+
+
+<a name="nn.Square"></a>
+## Square ##
+
+```lua
+module = nn.Square()
+```
+
+Takes the square of each element.
+
+```lua
+ii = torch.linspace(-5, 5)
+m = nn.Square()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
+gnuplot.grid(true)
+```
+
+![](image/square.png)
+
+
+<a name="nn.Sqrt"></a>
+## Sqrt ##
+
+```lua
+module = nn.Sqrt()
+```
+
+Takes the square root of each element.
+
+```lua
+ii = torch.linspace(0, 5)
+m = nn.Sqrt()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
+gnuplot.grid(true)
+```
+
+![](image/sqrt.png)
+
+
+<a name="nn.Power"></a>
+## Power ##
+
+```lua
+module = nn.Power(p)
+```
+
+Raises each element to its `p`-th power.
+
+```lua
+ii = torch.linspace(0, 2)
+m = nn.Power(1.25)
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
+gnuplot.grid(true)
+```
+
+![](image/power.png)
+
+<a name="nn.Clamp"></a>
+## Clamp ##
+
+```lua
+module = nn.Clamp(min_value, max_value)
+```
+
+Clamps all elements into the range `[min_value, max_value]`.
+Output is identical to input in the range, otherwise elements less than `min_value` (or greater than `max_value`) are saturated to `min_value` (or `max_value`).
+
+```lua
+A = torch.randn(2, 5)
+m = nn.Clamp(-0.1, 0.5)
+B = m:forward(A)
+
+print(A)  -- input
+-1.1321  0.0227 -0.4672  0.6519 -0.5380
+ 0.9061 -1.0858  0.3697 -0.8120 -1.6759
+[torch.DoubleTensor of size 2x5]
+
+print(B)  -- output
+-0.1000  0.0227 -0.1000  0.5000 -0.1000
+ 0.5000 -0.1000  0.3697 -0.1000 -0.1000
+[torch.DoubleTensor of size 2x5]
+```
+
+<a name="nn.Normalize"></a>
+## Normalize ##
+
+```lua
+module = nn.Normalize(p, [eps])
+```
+Normalizes the input Tensor to have unit `L_p` norm. The smoothing parameter `eps` prevents division by zero when the input contains all zero elements (default = `1e-10`).
+
+Input can be 1D or 2D (in which case it's considered to be in batch mode).
+
+```lua
+A = torch.randn(3, 5)
+m = nn.Normalize(2)
+B = m:forward(A) -- B is also 3 x 5
+-- take the L2 norm over the second axis:
+print(torch.norm(B, 2, 2)) -- norms is [1, 1, 1]
+```
+
+`Normalize` has a specialized implementation for the `inf` norm, which corresponds to the maximum norm.
+```lua
+A = torch.randn(3,5)
+m = nn.Normalize(math.huge) -- uses maximum/inf norm
+B = m:forward(A)
+maxA = torch.abs(A):max(2)
+print(A,B,maxA)
+```
+
+<a name="nn.MM"></a>
+## MM ##
+
+```lua
+module = nn.MM(transA, transB)
+```
+
+Performs multiplications on one or more pairs of matrices. If `transA` is set to true, the first matrix is transposed before multiplication. If `transB` is set to true, the second matrix is transposed before multiplication. By default, the matrices do not get transposed.
+
+The module also accepts 3D inputs which are interpreted as batches of matrices. When using batches, the first input matrix should be of size `b x m x n` and the second input matrix should be of size `b x n x p` (assuming `transA` and `transB` are not set). If `transA` or `transB` is set, transpose takes place between the second and the third dimensions for the corresponding matrix.
+
+```lua
+model = nn.MM()
+A = torch.randn(b, m, n)
+B = torch.randn(b, n, p)
+C = model:forward({A, B})  -- C will be of size `b x m x p`
+
+model = nn.MM(true, false)
+A = torch.randn(b, n, m)
+B = torch.randn(b, n, p)
+C = model:forward({A, B})  -- C will be of size `b x m x p`
+```
+
+
+<a name="nn.BatchNormalization"></a>
+## BatchNormalization ##
+
+```lua
+module = nn.BatchNormalization(N [, eps] [, momentum] [,affine])
+```
+where `N` is the dimensionality of the input.
+`eps` is a small value added to the standard deviation to avoid divide-by-zero. Defaults to `1e-5`.
+`affine` is a boolean. When set to false, the learnable affine transform is disabled. Defaults to true.
+
+During training, this layer keeps a running estimate of its computed mean and std.
+The running sum is kept with a default momentum of 0.1 (unless overridden).
+During evaluation, this running mean/std is used for normalization.
+
+Implements Batch Normalization as described in [the paper](http://arxiv.org/pdf/1502.03167v3.pdf): "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" by Sergey Ioffe, Christian Szegedy.
+
+The operation implemented is:
+
+```lua
+              x - mean(x)
+y =  ----------------------------- * gamma + beta
+      standard-deviation(x) + eps
+```
+
+where the mean and standard-deviation are calculated per-dimension over the mini-batches and where gamma and beta are learnable parameter vectors of size `N` (where `N` is the input size).
+The learning of gamma and beta is optional.
+The module only accepts 2D inputs.
+
+```lua
+-- with learnable parameters
+model = nn.BatchNormalization(m)
+A = torch.randn(b, m)
+C = model:forward(A)  -- C will be of size `b x m`
+
+-- without learnable parameters
+model = nn.BatchNormalization(m, nil, nil, false)
+A = torch.randn(b, m)
+C = model:forward(A)  -- C will be of size `b x m`
+```
+
+<a name="nn.Padding"></a>
+## Padding ##
+
+```lua
+module = nn.Padding(dim, pad [, nInputDim, value, index])
+```
+
+This module adds `pad` units of padding to dimension `dim` of the input.
+If `pad` is negative, padding is added to the left, otherwise, it is added to the right of the dimension. When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dim` to be padded will
+be dimension `dim + 1`. When `value` is provided, the padding will be filled with that `value`. The default `value` is zero.
+When `index` is provided, padding will be added at that offset from the left or right, depending on the sign of `pad`.
+
+Example 1:
+
+```lua
+module = nn.Padding(1, 2, 1, -1) --pad right x2
+module:forward(torch.randn(3)) --non-batch input
+ 0.2008
+ 0.4848
+-1.0783
+-1.0000
+-1.0000
+[torch.DoubleTensor of dimension 5]
+```
+
+Example 2:
+
+```lua
+module = nn.Padding(1, -2, 1, -1) --pad left x2
+module:forward(torch.randn(2, 3)) --batch input
+-1.0000 -1.0000  1.0203  0.2704 -1.6164
+-1.0000 -1.0000 -0.2219 -0.6529 -1.9218
+[torch.DoubleTensor of dimension 2x5]
+```
+
+Example 3:
+
+```lua
+module = nn.Padding(1, -2, 1, -1, 2) --pad left x2, offset to index 2
+module:forward(torch.randn(2, 3)) --batch input
+ 1.0203 -1.0000 -1.0000  0.2704 -1.6164
+-0.6529 -1.0000 -1.0000 -0.2219 -1.9218
+[torch.DoubleTensor of dimension 2x5]
+```
+
+<a name="nn.L1Penalty"></a>
+## L1Penalty ##
+
+```lua
+penalty = nn.L1Penalty(L1weight, sizeAverage)
+```
+
+L1Penalty is an inline module that in its forward propagation copies the input Tensor directly to the output, and computes an L1 loss of the latent state (input) and stores it in the module's `loss` field.
+During backward propagation: `gradInput = gradOutput + gradLoss`.
+
+This module can be used in autoencoder architectures to apply L1 losses to internal latent state without having to use Identity and parallel containers to carry the internal code to an output criterion.
+
+Example (sparse autoencoder, note: decoder should be normalized):
+
+```lua
+encoder = nn.Sequential()
+encoder:add(nn.Linear(3, 128))
+encoder:add(nn.Threshold())
+decoder = nn.Linear(128, 3)
+
+autoencoder = nn.Sequential()
+autoencoder:add(encoder)
+autoencoder:add(nn.L1Penalty(l1weight))
+autoencoder:add(decoder)
+
+criterion = nn.MSECriterion()  -- To measure reconstruction error
+-- ...
+```
+
+<a name="nn.GradientReversal"></a>
+## GradientReversal ##
+
+```lua
+module = nn.GradientReversal([lambda = 1])
+```
+This module preserves the input, but takes the gradient from the subsequent layer, multiplies it by `-lambda` and passes it to the preceding layer. This can be used to maximise an objective function whilst using gradient descent, as described in "Domain-Adversarial Training of Neural Networks" (http://arxiv.org/abs/1505.07818).
+
+One can also call:
+```lua
+module:setLambda(lambda)
+```
+to set the hyper-parameter `lambda` dynamically during training.
diff --git a/doc/table.md b/doc/table.md
new file mode 100644
index 0000000..a2e23f8
--- /dev/null
+++ b/doc/table.md
@@ -0,0 +1,1214 @@
+<a name="nn.TableLayers"></a>
+# Table Layers #
+
+This set of modules allows the manipulation of `table`s through the layers of a neural network.
+This allows one to build very rich architectures:
+
+  * `table` Container Modules encapsulate sub-Modules:
+    * [`ConcatTable`](#nn.ConcatTable): applies each member module to the same input     [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor) and outputs a `table`;
+    * [`ParallelTable`](#nn.ParallelTable): applies the `i`-th member module to the `i`-th input and outputs a `table`;
+  * Table Conversion Modules convert between `table`s and `Tensor`s or `table`s:
+    * [`SplitTable`](#nn.SplitTable): splits a `Tensor` into a `table` of `Tensor`s;
+    * [`JoinTable`](#nn.JoinTable): joins a `table` of `Tensor`s into a `Tensor`;
+    * [`MixtureTable`](#nn.MixtureTable): mixture of experts weighted by a gater;
+    * [`SelectTable`](#nn.SelectTable): select one element from a `table`;
+    * [`NarrowTable`](#nn.NarrowTable): select a slice of elements from a `table`;
+    * [`FlattenTable`](#nn.FlattenTable): flattens a nested `table` hierarchy;
+  * Pair Modules compute a measure like distance or similarity from a pair (`table`) of input `Tensor`s:
+    * [`PairwiseDistance`](#nn.PairwiseDistance): outputs the `p`-norm. distance between inputs;
+    * [`DotProduct`](#nn.DotProduct): outputs the dot product (similarity) between inputs;
+    * [`CosineDistance`](#nn.CosineDistance): outputs the cosine distance between inputs;
+  * CMath Modules perform element-wise operations on a `table` of `Tensor`s:
+    * [`CAddTable`](#nn.CAddTable): addition of input `Tensor`s;
+    * [`CSubTable`](#nn.CSubTable): subtraction of input `Tensor`s;
+    * [`CMulTable`](#nn.CMulTable): multiplication of input `Tensor`s;
+    * [`CDivTable`](#nn.CDivTable): division of input `Tensor`s;
+  * `Table` of Criteria:
+    * [`CriterionTable`](#nn.CriterionTable): wraps a [Criterion](criterion.md#nn.Criterion) so that it can accept a `table` of inputs.
+
+`table`-based modules work by supporting `forward()` and `backward()` methods that can accept `table`s as inputs.
+It turns out that the usual [`Sequential`](containers.md#nn.Sequential) module can do this, so all that is needed is other child modules that take advantage of such `table`s.
+
+```lua
+mlp = nn.Sequential()
+t = {x, y, z}
+pred = mlp:forward(t)
+pred = mlp:forward{x, y, z}      -- This is equivalent to the line before
+```
+
+<a name="nn.ConcatTable"></a>
+## ConcatTable ##
+
+```lua
+module = nn.ConcatTable()
+```
+
+`ConcatTable` is a container module that applies each member module to the same input [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor) or `table`.
+
+```
+                  +-----------+
+             +----> {member1, |
++-------+    |    |           |
+| input +----+---->  member2, |
++-------+    |    |           |
+   or        +---->  member3} |
+ {input}          +-----------+
+```
+
+### Example 1
+
+```lua
+mlp = nn.ConcatTable()
+mlp:add(nn.Linear(5, 2))
+mlp:add(nn.Linear(5, 3))
+
+pred = mlp:forward(torch.randn(5))
+for i, k in ipairs(pred) do print(i, k) end
+```
+
+which gives the output:
+
+```lua
+1
+-0.4073
+ 0.0110
+[torch.Tensor of dimension 2]
+
+2
+ 0.0027
+-0.0598
+-0.1189
+[torch.Tensor of dimension 3]
+```
+
+### Example 2
+
+```lua
+mlp = nn.ConcatTable()
+mlp:add(nn.Identity())
+mlp:add(nn.Identity())
+
+pred = mlp:forward{torch.randn(2), {torch.randn(3)}}
+print(pred)
+```
+
+which gives the output (using [th](https://github.com/torch/trepl)):
+
+```lua
+{
+  1 :
+    {
+      1 : DoubleTensor - size: 2
+      2 :
+        {
+          1 : DoubleTensor - size: 3
+        }
+    }
+  2 :
+    {
+      1 : DoubleTensor - size: 2
+      2 :
+        {
+          1 : DoubleTensor - size: 3
+        }
+    }
+}
+```
+
+
+<a name="nn.ParallelTable"></a>
+## ParallelTable ##
+
+```lua
+module = nn.ParallelTable()
+```
+
+`ParallelTable` is a container module that, in its `forward()` method, applies the `i`-th member module to the `i`-th input, and outputs a `table` of the set of outputs.
+
+```
++----------+         +-----------+
+| {input1, +---------> {member1, |
+|          |         |           |
+|  input2, +--------->  member2, |
+|          |         |           |
+|  input3} +--------->  member3} |
++----------+         +-----------+
+```
+
+### Example
+
+```lua
+mlp = nn.ParallelTable()
+mlp:add(nn.Linear(10, 2))
+mlp:add(nn.Linear(5, 3))
+
+x = torch.randn(10)
+y = torch.rand(5)
+
+pred = mlp:forward{x, y}
+for i, k in pairs(pred) do print(i, k) end
+```
+
+which gives the output:
+
+```lua
+1
+ 0.0331
+ 0.7003
+[torch.Tensor of dimension 2]
+
+2
+ 0.0677
+-0.1657
+-0.7383
+[torch.Tensor of dimension 3]
+```
+
+
+<a name="nn.SplitTable"></a>
+## SplitTable ##
+
+```lua
+module = SplitTable(dimension, nInputDims)
+```
+
+Creates a module that takes a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor) as input and outputs a `table` of `Tensor`s, splitting the input `Tensor` along the specified `dimension`.
+In the diagram below, `dimension` is equal to `1`.
+
+```
+    +----------+         +-----------+
+    | input[1] +---------> {member1, |
+  +----------+-+         |           |
+  | input[2] +----------->  member2, |
++----------+-+           |           |
+| input[3] +------------->  member3} |
++----------+             +-----------+
+```
+
+The optional parameter `nInputDims` allows specifying the number of dimensions that this module will receive.
+This makes it possible to forward both minibatch and non-minibatch `Tensor`s through the same module.
+
+### Example 1
+
+```lua
+mlp = nn.SplitTable(2)
+x = torch.randn(4, 3)
+pred = mlp:forward(x)
+for i, k in ipairs(pred) do print(i, k) end
+```
+
+gives the output:
+
+```lua
+1
+ 1.3885
+ 1.3295
+ 0.4281
+-1.0171
+[torch.Tensor of dimension 4]
+
+2
+-1.1565
+-0.8556
+-1.0717
+-0.8316
+[torch.Tensor of dimension 4]
+
+3
+-1.3678
+-0.1709
+-0.0191
+-2.5871
+[torch.Tensor of dimension 4]
+```
+
+### Example 2
+
+```lua
+mlp = nn.SplitTable(1)
+pred = mlp:forward(torch.randn(4, 3))
+for i, k in ipairs(pred) do print(i, k) end
+```
+
+gives the output:
+
+```lua
+1
+ 1.6114
+ 0.9038
+ 0.8419
+[torch.Tensor of dimension 3]
+
+2
+ 2.4742
+ 0.2208
+ 1.6043
+[torch.Tensor of dimension 3]
+
+3
+ 1.3415
+ 0.2984
+ 0.2260
+[torch.Tensor of dimension 3]
+
+4
+ 2.0889
+ 1.2309
+ 0.0983
+[torch.Tensor of dimension 3]
+```
+
+### Example 3
+
+```lua
+mlp = nn.SplitTable(1, 2)
+pred = mlp:forward(torch.randn(2, 4, 3))
+for i, k in ipairs(pred) do print(i, k) end
+pred = mlp:forward(torch.randn(4, 3))
+for i, k in ipairs(pred) do print(i, k) end
+```
+
+gives the output:
+
+```lua
+1
+-1.3533  0.7448 -0.8818
+-0.4521 -1.2463  0.0316
+[torch.DoubleTensor of dimension 2x3]
+
+2
+ 0.1130 -1.3904  1.4620
+ 0.6722  2.0910 -0.2466
+[torch.DoubleTensor of dimension 2x3]
+
+3
+ 0.4672 -1.2738  1.1559
+ 0.4664  0.0768  0.6243
+[torch.DoubleTensor of dimension 2x3]
+
+4
+ 0.4194  1.2991  0.2241
+ 2.9786 -0.6715  0.0393
+[torch.DoubleTensor of dimension 2x3]
+
+
+1
+-1.8932
+ 0.0516
+-0.6316
+[torch.DoubleTensor of dimension 3]
+
+2
+-0.3397
+-1.8881
+-0.0977
+[torch.DoubleTensor of dimension 3]
+
+3
+ 0.0135
+ 1.2089
+ 0.5785
+[torch.DoubleTensor of dimension 3]
+
+4
+-0.1758
+-0.0776
+-1.1013
+[torch.DoubleTensor of dimension 3]
+```
+
+The module also supports indexing from the end using negative dimensions. This allows to use this module when the number of dimensions of the input is unknown.
+
+### Example
+
+```lua
+m = nn.SplitTable(-2)
+out = m:forward(torch.randn(3, 2))
+for i, k in ipairs(out) do print(i, k) end
+out = m:forward(torch.randn(1, 3, 2))
+for i, k in ipairs(out) do print(i, k) end
+```
+
+gives the output:
+
+```
+1
+ 0.1420
+-0.5698
+[torch.DoubleTensor of size 2]
+
+2
+ 0.1663
+ 0.1197
+[torch.DoubleTensor of size 2]
+
+3
+ 0.4198
+-1.1394
+[torch.DoubleTensor of size 2]
+
+
+1
+-2.4941
+-1.4541
+[torch.DoubleTensor of size 1x2]
+
+2
+ 0.4594
+ 1.1946
+[torch.DoubleTensor of size 1x2]
+
+3
+-2.3322
+-0.7383
+[torch.DoubleTensor of size 1x2]
+```
+
+### A more complicated example
+
+```lua
+mlp = nn.Sequential()       -- Create a network that takes a Tensor as input
+mlp:add(nn.SplitTable(2))
+c = nn.ParallelTable()      -- The two Tensor slices go through two different Linear
+c:add(nn.Linear(10, 3))     -- Layers in Parallel
+c:add(nn.Linear(10, 7))
+mlp:add(c)                  -- Outputing a table with 2 elements
+p = nn.ParallelTable()      -- These tables go through two more linear layers separately
+p:add(nn.Linear(3, 2))
+p:add(nn.Linear(7, 1))
+mlp:add(p)
+mlp:add(nn.JoinTable(1))    -- Finally, the tables are joined together and output.
+
+pred = mlp:forward(torch.randn(10, 2))
+print(pred)
+
+for i = 1, 100 do           -- A few steps of training such a network..
+   x = torch.ones(10, 2)
+   y = torch.Tensor(3)
+   y:copy(x:select(2, 1):narrow(1, 1, 3))
+   pred = mlp:forward(x)
+
+   criterion = nn.MSECriterion()
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(0.05)
+
+   print(err)
+end
+```
+
+
+<a name="nn.JoinTable"></a>
+## JoinTable ##
+
+```lua
+module = JoinTable(dimension, nInputDims)
+```
+
+Creates a module that takes a `table` of `Tensor`s as input and outputs a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor) by joining them together along dimension `dimension`.
+In the diagram below `dimension` is set to `1`.
+
+```
++----------+             +-----------+
+| {input1, +-------------> output[1] |
+|          |           +-----------+-+
+|  input2, +-----------> output[2] |
+|          |         +-----------+-+
+|  input3} +---------> output[3] |
++----------+         +-----------+
+```
+
+The optional parameter `nInputDims` allows specifying the number of dimensions that this module will receive. This makes it possible to forward both minibatch and non-minibatch `Tensor`s through the same module.
+
+### Example 1
+
+```lua
+x = torch.randn(5, 1)
+y = torch.randn(5, 1)
+z = torch.randn(2, 1)
+
+print(nn.JoinTable(1):forward{x, y})
+print(nn.JoinTable(2):forward{x, y})
+print(nn.JoinTable(1):forward{x, z})
+```
+
+gives the output:
+
+```lua
+ 1.3965
+ 0.5146
+-1.5244
+-0.9540
+ 0.4256
+ 0.1575
+ 0.4491
+ 0.6580
+ 0.1784
+-1.7362
+[torch.DoubleTensor of dimension 10x1]
+
+ 1.3965  0.1575
+ 0.5146  0.4491
+-1.5244  0.6580
+-0.9540  0.1784
+ 0.4256 -1.7362
+[torch.DoubleTensor of dimension 5x2]
+
+ 1.3965
+ 0.5146
+-1.5244
+-0.9540
+ 0.4256
+-1.2660
+ 1.0869
+[torch.Tensor of dimension 7x1]
+```
+
+### Example 2
+
+```lua
+module = nn.JoinTable(2, 2)
+
+x = torch.randn(3, 1)
+y = torch.randn(3, 1)
+
+mx = torch.randn(2, 3, 1)
+my = torch.randn(2, 3, 1)
+
+print(module:forward{x, y})
+print(module:forward{mx, my})
+```
+
+gives the output:
+
+```lua
+ 0.4288  1.2002
+-1.4084 -0.7960
+-0.2091  0.1852
+[torch.DoubleTensor of dimension 3x2]
+
+(1,.,.) =
+  0.5561  0.1228
+ -0.6792  0.1153
+  0.0687  0.2955
+
+(2,.,.) =
+  2.5787  1.8185
+ -0.9860  0.6756
+  0.1989 -0.4327
+[torch.DoubleTensor of dimension 2x3x2]
+```
+
+### A more complicated example
+
+```lua
+mlp = nn.Sequential()         -- Create a network that takes a Tensor as input
+c = nn.ConcatTable()          -- The same Tensor goes through two different Linear
+c:add(nn.Linear(10, 3))       -- Layers in Parallel
+c:add(nn.Linear(10, 7))
+mlp:add(c)                    -- Outputing a table with 2 elements
+p = nn.ParallelTable()        -- These tables go through two more linear layers
+p:add(nn.Linear(3, 2))        -- separately.
+p:add(nn.Linear(7, 1))
+mlp:add(p)
+mlp:add(nn.JoinTable(1))      -- Finally, the tables are joined together and output.
+
+pred = mlp:forward(torch.randn(10))
+print(pred)
+
+for i = 1, 100 do             -- A few steps of training such a network..
+   x = torch.ones(10)
+   y = torch.Tensor(3); y:copy(x:narrow(1, 1, 3))
+   pred = mlp:forward(x)
+
+   criterion= nn.MSECriterion()
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(0.05)
+
+   print(err)
+end
+```
+
+
+<a name='nn.MixtureTable'></a>
+## MixtureTable ##
+
+`module` = `MixtureTable([dim])`
+
+Creates a module that takes a `table` `{gater, experts}` as input and outputs
+the mixture of `experts` (a `Tensor` or `table` of `Tensor`s) using a
+`gater` `Tensor`. When `dim` is provided, it specifies the dimension of
+the `experts` `Tensor` that will be interpolated (or mixed). Otherwise,
+the `experts` should take the form of a `table` of `Tensor`s. This
+Module works for `experts` of dimension 1D or more, and for a
+1D or 2D `gater`, i.e. for single examples or mini-batches.
+
+Considering an `input = {G, E}` with a single example, then
+the mixture of experts `Tensor` `E` with
+gater `Tensor` `G` has the following form:
+```lua
+output = G[1]*E[1] + G[2]*E[2] + ... + G[n]*E[n]
+```
+where `dim = 1`, `n = E:size(dim) = G:size(dim)` and `G:dim() == 1`.
+Note that `E:dim() >= 2`, such that `output:dim() = E:dim() - 1`.
+
+Example 1:
+Using this Module, an arbitrary mixture of `n` 2-layer experts
+by a 2-layer gater could be constructed as follows:
+```lua
+experts = nn.ConcatTable()
+for i = 1, n do
+   local expert = nn.Sequential()
+   expert:add(nn.Linear(3, 4))
+   expert:add(nn.Tanh())
+   expert:add(nn.Linear(4, 5))
+   expert:add(nn.Tanh())
+   experts:add(expert)
+end
+
+gater = nn.Sequential()
+gater:add(nn.Linear(3, 7))
+gater:add(nn.Tanh())
+gater:add(nn.Linear(7, n))
+gater:add(nn.SoftMax())
+
+trunk = nn.ConcatTable()
+trunk:add(gater)
+trunk:add(experts)
+
+moe = nn.Sequential()
+moe:add(trunk)
+moe:add(nn.MixtureTable())
+```
+Forwarding a batch of 2 examples gives us something like this:
+```lua
+> =moe:forward(torch.randn(2, 3))
+-0.2152  0.3141  0.3280 -0.3772  0.2284
+ 0.2568  0.3511  0.0973 -0.0912 -0.0599
+[torch.DoubleTensor of dimension 2x5]
+```
+
+Example 2:
+In the following, the `MixtureTable` expects `experts` to be a `Tensor` of
+`size = {1, 4, 2, 5, n}`:
+```lua
+experts = nn.Concat(5)
+for i = 1, n do
+   local expert = nn.Sequential()
+   expert:add(nn.Linear(3, 4))
+   expert:add(nn.Tanh())
+   expert:add(nn.Linear(4, 4*2*5))
+   expert:add(nn.Tanh())
+   expert:add(nn.Reshape(4, 2, 5, 1))
+   experts:add(expert)
+end
+
+gater = nn.Sequential()
+gater:add(nn.Linear(3, 7))
+gater:add(nn.Tanh())
+gater:add(nn.Linear(7, n))
+gater:add(nn.SoftMax())
+
+trunk = nn.ConcatTable()
+trunk:add(gater)
+trunk:add(experts)
+
+moe = nn.Sequential()
+moe:add(trunk)
+moe:add(nn.MixtureTable(5))
+```
+Forwarding a batch of 2 examples gives us something like this:
+```lua
+> =moe:forward(torch.randn(2, 3)):size()
+ 2
+ 4
+ 2
+ 5
+[torch.LongStorage of size 4]
+
+```
+
+<a name="nn.SelectTable"></a>
+## SelectTable ##
+
+`module` = `SelectTable(index)`
+
+Creates a module that takes a `table` as input and outputs the element at index `index` (positive or negative). 
+This can be either a `table` or a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor).
+
+The gradients of the non-`index` elements are zeroed `Tensor`s of the same size. This is true regardless of the
+depth of the encapsulated `Tensor` as the function used internally to do so is recursive.
+
+Example 1:
+```lua
+> input = {torch.randn(2, 3), torch.randn(2, 1)}
+> =nn.SelectTable(1):forward(input)
+-0.3060  0.1398  0.2707
+ 0.0576  1.5455  0.0610
+[torch.DoubleTensor of dimension 2x3]
+
+> =nn.SelectTable(-1):forward(input)
+ 2.3080
+-0.2955
+[torch.DoubleTensor of dimension 2x1]
+
+> =table.unpack(nn.SelectTable(1):backward(input, torch.randn(2, 3)))
+-0.4891 -0.3495 -0.3182
+-2.0999  0.7381 -0.5312
+[torch.DoubleTensor of dimension 2x3]
+
+0
+0
+[torch.DoubleTensor of dimension 2x1]
+
+```
+
+Example 2:
+```lua
+> input = {torch.randn(2, 3), {torch.randn(2, 1), {torch.randn(2, 2)}}}
+
+> =nn.SelectTable(2):forward(input)
+{
+  1 : DoubleTensor - size: 2x1
+  2 :
+    {
+      1 : DoubleTensor - size: 2x2
+    }
+}
+
+> =table.unpack(nn.SelectTable(2):backward(input, {torch.randn(2, 1), {torch.randn(2, 2)}}))
+0 0 0
+0 0 0
+[torch.DoubleTensor of dimension 2x3]
+
+{
+  1 : DoubleTensor - size: 2x1
+  2 :
+    {
+      1 : DoubleTensor - size: 2x2
+    }
+}
+
+> gradInput = nn.SelectTable(1):backward(input, torch.randn(2, 3))
+
+> =gradInput
+{
+  1 : DoubleTensor - size: 2x3
+  2 :
+    {
+      1 : DoubleTensor - size: 2x1
+      2 :
+        {
+          1 : DoubleTensor - size: 2x2
+        }
+    }
+}
+
+> =gradInput[1]
+-0.3400 -0.0404  1.1885
+ 1.2865  0.4107  0.6506
+[torch.DoubleTensor of dimension 2x3]
+
+> gradInput[2][1]
+0
+0
+[torch.DoubleTensor of dimension 2x1]
+
+> gradInput[2][2][1]
+0 0
+0 0
+[torch.DoubleTensor of dimension 2x2]
+
+```
+
+<a name="nn.NarrowTable"></a>
+## NarrowTable ##
+
+`module` = `NarrowTable(offset [, length])`
+
+Creates a module that takes a `table` as input and outputs the subtable 
+starting at index `offset` having `length` elements (defaults to 1 element).
+The elements can be either a `table` or a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor).
+
+The gradients of the elements not included in the subtable are zeroed `Tensor`s of the same size. 
+This is true regardless of the depth of the encapsulated `Tensor` as the function used internally to do so is recursive.
+
+Example:
+```lua
+> input = {torch.randn(2, 3), torch.randn(2, 1), torch.randn(1, 2)}
+> =nn.NarrowTable(2,2):forward(input)
+{
+  1 : DoubleTensor - size: 2x1
+  2 : DoubleTensor - size: 1x2
+}
+
+> =nn.NarrowTable(1):forward(input)
+{
+  1 : DoubleTensor - size: 2x3
+}
+
+> =table.unpack(nn.NarrowTable(1,2):backward(input, {torch.randn(2, 3), torch.randn(2, 1)}))
+ 1.9528 -0.1381  0.2023
+ 0.2297 -1.5169 -1.1871
+[torch.DoubleTensor of size 2x3]
+
+-1.2023
+-0.4165
+[torch.DoubleTensor of size 2x1]
+
+ 0  0
+[torch.DoubleTensor of size 1x2]
+
+```
+
+<a name="nn.FlattenTable"></a>
+## FlattenTable ##
+
+`module` = `FlattenTable()`
+
+Creates a module that takes an arbitrarily deep `table` of `Tensor`s (potentially nested) as input and outputs a `table` of `Tensor`s, where the output `Tensor` in index `i` is the `Tensor` with post-order DFS index `i` in the input `table`.
+
+This module is particularly useful in combination with nn.Identity() to create networks that can append to their input `table`.
+
+Example:
+```lua
+x = {torch.rand(1), {torch.rand(2), {torch.rand(3)}}, torch.rand(4)}
+print(x)
+print(nn.FlattenTable():forward(x))
+```
+gives the output:
+```lua
+{
+  1 : DoubleTensor - size: 1
+  2 :
+    {
+      1 : DoubleTensor - size: 2
+      2 :
+        {
+          1 : DoubleTensor - size: 3
+        }
+    }
+  3 : DoubleTensor - size: 4
+}
+{
+  1 : DoubleTensor - size: 1
+  2 : DoubleTensor - size: 2
+  3 : DoubleTensor - size: 3
+  4 : DoubleTensor - size: 4
+}
+```
+
+<a name="nn.PairwiseDistance"></a>
+## PairwiseDistance ##
+
+`module` = `PairwiseDistance(p)` creates a module that takes a `table` of two vectors as input and outputs the distance between them using the `p`-norm.
+
+Example:
+```lua
+mlp_l1 = nn.PairwiseDistance(1)
+mlp_l2 = nn.PairwiseDistance(2)
+x = torch.Tensor({1, 2, 3})
+y = torch.Tensor({4, 5, 6})
+print(mlp_l1:forward({x, y}))
+print(mlp_l2:forward({x, y}))
+```
+gives the output:
+```lua
+ 9
+[torch.Tensor of dimension 1]
+
+ 5.1962
+[torch.Tensor of dimension 1]
+```
+
+A more complicated example:
+```lua
+-- imagine we have one network we are interested in, it is called "p1_mlp"
+p1_mlp= nn.Sequential(); p1_mlp:add(nn.Linear(5, 2))
+
+-- But we want to push examples towards or away from each other
+-- so we make another copy of it called p2_mlp
+-- this *shares* the same weights via the set command, but has its own set of temporary gradient storage
+-- that's why we create it again (so that the gradients of the pair don't wipe each other)
+p2_mlp= nn.Sequential(); p2_mlp:add(nn.Linear(5, 2))
+p2_mlp:get(1).weight:set(p1_mlp:get(1).weight)
+p2_mlp:get(1).bias:set(p1_mlp:get(1).bias)
+
+-- we make a parallel table that takes a pair of examples as input. they both go through the same (cloned) mlp
+prl = nn.ParallelTable()
+prl:add(p1_mlp)
+prl:add(p2_mlp)
+
+-- now we define our top level network that takes this parallel table and computes the pairwise distance between
+-- the pair of outputs
+mlp= nn.Sequential()
+mlp:add(prl)
+mlp:add(nn.PairwiseDistance(1))
+
+-- and a criterion for pushing together or pulling apart pairs
+crit = nn.HingeEmbeddingCriterion(1)
+
+-- lets make two example vectors
+x = torch.rand(5)
+y = torch.rand(5)
+
+
+-- Use a typical generic gradient update function
+function gradUpdate(mlp, x, y, criterion, learningRate)
+local pred = mlp:forward(x)
+local err = criterion:forward(pred, y)
+local gradCriterion = criterion:backward(pred, y)
+mlp:zeroGradParameters()
+mlp:backward(x, gradCriterion)
+mlp:updateParameters(learningRate)
+end
+
+-- push the pair x and y together, notice how then the distance between them given
+-- by  print(mlp:forward({x, y})[1]) gets smaller
+for i = 1, 10 do
+gradUpdate(mlp, {x, y}, 1, crit, 0.01)
+print(mlp:forward({x, y})[1])
+end
+
+
+-- pull apart the pair x and y, notice how then the distance between them given
+-- by  print(mlp:forward({x, y})[1]) gets larger
+
+for i = 1, 10 do
+gradUpdate(mlp, {x, y}, -1, crit, 0.01)
+print(mlp:forward({x, y})[1])
+end
+
+```
+
+<a name="nn.DotProduct"></a>
+## DotProduct ##
+
+`module` = `DotProduct()` creates a module that takes a `table` of two vectors (or matrices if in batch mode) as input and outputs the dot product between them.
+
+Example:
+```lua
+mlp = nn.DotProduct()
+x = torch.Tensor({1, 2, 3})
+y = torch.Tensor({4, 5, 6})
+print(mlp:forward({x, y}))
+```
+gives the output:
+```lua
+ 32
+[torch.Tensor of dimension 1]
+```
+
+
+A more complicated example:
+```lua
+
+-- Train a ranking function so that mlp:forward({x, y}, {x, z}) returns a number
+-- which indicates whether x is better matched with y or z (larger score = better match), or vice versa.
+
+mlp1 = nn.Linear(5, 10)
+mlp2 = mlp1:clone('weight', 'bias')
+
+prl = nn.ParallelTable();
+prl:add(mlp1); prl:add(mlp2)
+
+mlp1 = nn.Sequential()
+mlp1:add(prl)
+mlp1:add(nn.DotProduct())
+
+mlp2 = mlp1:clone('weight', 'bias')
+
+mlp = nn.Sequential()
+prla = nn.ParallelTable()
+prla:add(mlp1)
+prla:add(mlp2)
+mlp:add(prla)
+
+x = torch.rand(5);
+y = torch.rand(5)
+z = torch.rand(5)
+
+
+print(mlp1:forward{x, x})
+print(mlp1:forward{x, y})
+print(mlp1:forward{y, y})
+
+
+crit = nn.MarginRankingCriterion(1);
+
+-- Use a typical generic gradient update function
+function gradUpdate(mlp, x, y, criterion, learningRate)
+   local pred = mlp:forward(x)
+   local err = criterion:forward(pred, y)
+   local gradCriterion = criterion:backward(pred, y)
+   mlp:zeroGradParameters()
+   mlp:backward(x, gradCriterion)
+   mlp:updateParameters(learningRate)
+end
+
+inp = {{x, y}, {x, z}}
+
+math.randomseed(1)
+
+-- make the pair x and y have a larger dot product than x and z
+
+for i = 1, 100 do
+   gradUpdate(mlp, inp, 1, crit, 0.05)
+   o1 = mlp1:forward{x, y}[1];
+   o2 = mlp2:forward{x, z}[1];
+   o = crit:forward(mlp:forward{{x, y}, {x, z}}, 1)
+   print(o1, o2, o)
+end
+
+print "________________**"
+
+-- make the pair x and z have a larger dot product than x and y
+
+for i = 1, 100 do
+   gradUpdate(mlp, inp, -1, crit, 0.05)
+   o1 = mlp1:forward{x, y}[1];
+   o2 = mlp2:forward{x, z}[1];
+   o = crit:forward(mlp:forward{{x, y}, {x, z}}, -1)
+   print(o1, o2, o)
+end
+```
+
+
+<a name="nn.CosineDistance"></a>
+## CosineDistance ##
+
+`module` = `CosineDistance()` creates a module that takes a `table` of two vectors (or matrices if in batch mode) as input and outputs the cosine distance between them.
+
+Examples:
+```lua
+mlp = nn.CosineDistance()
+x = torch.Tensor({1, 2, 3})
+y = torch.Tensor({4, 5, 6})
+print(mlp:forward({x, y}))
+```
+gives the output:
+```lua
+ 0.9746
+[torch.Tensor of dimension 1]
+```
+`CosineDistance` also accepts batches:
+```lua
+mlp = nn.CosineDistance()
+x = torch.Tensor({{1,2,3},{1,2,-3}})
+y = torch.Tensor({{4,5,6},{-4,5,6}})
+print(mlp:forward({x,y}))
+```
+gives the output:
+```lua
+ 0.9746
+-0.3655
+[torch.DoubleTensor of size 2]
+```
+
+A more complicated example:
+```lua
+
+-- imagine we have one network we are interested in, it is called "p1_mlp"
+p1_mlp= nn.Sequential(); p1_mlp:add(nn.Linear(5, 2))
+
+-- But we want to push examples towards or away from each other
+-- so we make another copy of it called p2_mlp
+-- this *shares* the same weights via the set command, but has its own set of temporary gradient storage
+-- that's why we create it again (so that the gradients of the pair don't wipe each other)
+p2_mlp= p1_mlp:clone('weight', 'bias')
+
+-- we make a parallel table that takes a pair of examples as input. they both go through the same (cloned) mlp
+prl = nn.ParallelTable()
+prl:add(p1_mlp)
+prl:add(p2_mlp)
+
+-- now we define our top level network that takes this parallel table and computes the cosine distance between
+-- the pair of outputs
+mlp= nn.Sequential()
+mlp:add(prl)
+mlp:add(nn.CosineDistance())
+
+
+-- lets make two example vectors
+x = torch.rand(5)
+y = torch.rand(5)
+
+-- Grad update function..
+function gradUpdate(mlp, x, y, learningRate)
+    local pred = mlp:forward(x)
+    if pred[1]*y < 1 then
+        gradCriterion = torch.Tensor({-y})
+        mlp:zeroGradParameters()
+        mlp:backward(x, gradCriterion)
+        mlp:updateParameters(learningRate)
+    end
+end
+
+-- push the pair x and y together, the distance should get larger..
+for i = 1, 1000 do
+ gradUpdate(mlp, {x, y}, 1, 0.1)
+ if ((i%100)==0) then print(mlp:forward({x, y})[1]);end
+end
+
+
+-- pull apart the pair x and y, the distance should get smaller..
+
+for i = 1, 1000 do
+ gradUpdate(mlp, {x, y}, -1, 0.1)
+ if ((i%100)==0) then print(mlp:forward({x, y})[1]);end
+end
+```
+
+
+
+<a name="nn.CriterionTable"></a>
+## CriterionTable ##
+
+`module` = `CriterionTable(criterion)`
+
+Creates a module that wraps a Criterion module so that it can accept a `table` of inputs. Typically the `table` would contain two elements: the input and output `x` and `y` that the Criterion compares.
+
+Example:
+```lua
+mlp = nn.CriterionTable(nn.MSECriterion())
+x = torch.randn(5)
+y = torch.randn(5)
+print(mlp:forward{x, x})
+print(mlp:forward{x, y})
+```
+gives the output:
+```lua
+0
+1.9028918413199
+```
+
+Here is a more complex example of embedding the criterion into a network:
+```lua
+
+function table.print(t)
+ for i, k in pairs(t) do print(i, k); end
+end
+
+mlp = nn.Sequential();                          -- Create an mlp that takes input
+  main_mlp = nn.Sequential();		      -- and output using ParallelTable
+  main_mlp:add(nn.Linear(5, 4))
+  main_mlp:add(nn.Linear(4, 3))
+ cmlp = nn.ParallelTable();
+ cmlp:add(main_mlp)
+ cmlp:add(nn.Identity())
+mlp:add(cmlp)
+mlp:add(nn.CriterionTable(nn.MSECriterion())) -- Apply the Criterion
+
+for i = 1, 20 do                                 -- Train for a few iterations
+ x = torch.ones(5);
+ y = torch.Tensor(3); y:copy(x:narrow(1, 1, 3))
+ err = mlp:forward{x, y}                         -- Pass in both input and output
+ print(err)
+
+ mlp:zeroGradParameters();
+ mlp:backward({x, y} );
+ mlp:updateParameters(0.05);
+end
+```
+
+<a name="nn.CAddTable"></a>
+## CAddTable ##
+
+`module` = `CAddTable([inplace])`
+
+Takes a `table` of `Tensor`s and outputs summation of all `Tensor`s. If `inplace` is `true`, the sum is written to the first `Tensor`.
+
+```lua
+ii = {torch.ones(5), torch.ones(5)*2, torch.ones(5)*3}
+=ii[1]
+ 1
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of dimension 5]
+
+return ii[2]
+ 2
+ 2
+ 2
+ 2
+ 2
+[torch.DoubleTensor of dimension 5]
+
+return ii[3]
+ 3
+ 3
+ 3
+ 3
+ 3
+[torch.DoubleTensor of dimension 5]
+
+m = nn.CAddTable()
+=m:forward(ii)
+ 6
+ 6
+ 6
+ 6
+ 6
+[torch.DoubleTensor of dimension 5]
+```
+
+
+<a name="nn.CSubTable"></a>
+## CSubTable ##
+
+Takes a `table` with two `Tensor` and returns the component-wise
+subtraction between them.
+
+```lua
+m = nn.CSubTable()
+=m:forward({torch.ones(5)*2.2, torch.ones(5)})
+ 1.2000
+ 1.2000
+ 1.2000
+ 1.2000
+ 1.2000
+[torch.DoubleTensor of dimension 5]
+```
+
+<a name="nn.CMulTable"></a>
+## CMulTable ##
+
+Takes a `table` of `Tensor`s and outputs the multiplication of all of them.
+
+```lua
+ii = {torch.ones(5)*2, torch.ones(5)*3, torch.ones(5)*4}
+m = nn.CMulTable()
+=m:forward(ii)
+ 24
+ 24
+ 24
+ 24
+ 24
+[torch.DoubleTensor of dimension 5]
+
+```
+
+<a name="nn.CDivTable"></a>
+## CDivTable ##
+
+Takes a `table` with two `Tensor` and returns the component-wise
+division between them.
+
+```lua
+m = nn.CDivTable()
+=m:forward({torch.ones(5)*2.2, torch.ones(5)*4.4})
+ 0.5000
+ 0.5000
+ 0.5000
+ 0.5000
+ 0.5000
+[torch.DoubleTensor of dimension 5]
+```
+
diff --git a/doc/testing.md b/doc/testing.md
new file mode 100644
index 0000000..424c492
--- /dev/null
+++ b/doc/testing.md
@@ -0,0 +1,69 @@
+# Testing #
+For those who want to implement their own modules, we suggest using
+the `nn.Jacobian` class for testing the derivatives of their class,
+together with the [torch.Tester](https://github.com/torch/torch7/blob/master/doc/tester.md) class. The sources
+of the `nn` package contain sufficiently many examples of such tests.
+
+
+## nn.Jacobian ##
+
+
+<a name="nn.Jacobian.testJacobian"></a>
+### testJacobian(module, input, minval, maxval, perturbation) ###
+
+Test the jacobian of a module w.r.t. its input. 
+
+`module` takes as its input a random tensor shaped the same as `input`.  
+`minval` and `maxval` specify the range of the random tensor ([-2, 2] by default).  
+`perturbation` is used as finite difference (1e-6 by default).
+
+Returns the L-inf distance between the jacobian computed by backpropagation and by finite difference.
+
+
+<a name="nn.Jacobian.testJacobianParameters"></a>
+### testJacobianParameters(module, input, param, dparam, minval, maxval, perturbation) ###
+
+Test the jacobian of a module w.r.t. its parameters (instead of its input).
+
+The input and parameters of `module` are random tensors shaped the same as `input` and `param`.  
+`minval` and `maxval` specify the range of the random tensors ([-2, 2] by default).  
+`dparam` points to the gradient w.r.t. parameters.  
+`perturbation` is used as finite difference (1e-6 by default).
+
+Returns the L-inf distance between the jacobian computed by backpropagation and by finite difference.
+
+
+<a name="nn.Jacobian.testJacobianUpdateParameters"></a>
+### testJacobianUpdateParameters(module, input, param, minval, maxval, perturbation) ###
+
+Test the amount of update of a module to its parameters.
+
+The input and parameters of `module` are random tensors shaped the same as `input` and `param`.  
+`minval` and `maxval` specify the range of the random tensors ([-2, 2] by default).  
+`perturbation` is used as finite difference (1e-6 by default).
+
+Returns the L-inf distance between the update computed by backpropagation and by finite difference.
+
+
+<a name="nn.Jacobian.forward"></a>
+### forward(module, input, param, perturbation) ###
+
+Compute the jacobian by finite difference.
+
+`module` has parameters `param` and input `input`.  
+If provided, `param` is regarded as independent variables, otherwise `input` is the independent variables.  
+`perturbation` is used as finite difference (1e-6 by default).
+
+Returns the jacobian computed by finite difference.
+
+
+<a name="nn.Jacobian.backward"></a>
+### backward(module, input, param, dparam) ###
+
+Compute the jacobian by backpropagation.
+
+`module` has parameters `param` and input `input`.  
+If provided, `param` is regarded as independent variables, otherwise `input` is the independent variables.  
+`dparam` is the gradient w.r.t. parameters; it must be present as long as `param` is present.  
+
+Returns the jacobian computed by backpropagation.
diff --git a/doc/training.md b/doc/training.md
new file mode 100644
index 0000000..d21bcc7
--- /dev/null
+++ b/doc/training.md
@@ -0,0 +1,294 @@
+<a name="nn.traningneuralnet.dok"></a>
+# Training a neural network #
+
+Training a neural network is easy with a [simple `for` loop](#nn.DoItYourself).  Typically however we would
+use the `optim` optimizer, which implements some cool functionalities, like Nesterov momentum,
+[adagrad](https://github.com/torch/optim/blob/master/doc/index.md#x-adagradopfunc-x-config-state) and
+[adam](https://github.com/torch/optim/blob/master/doc/index.md#x-adamopfunc-x-config-state).
+
+We will demonstrate using a for-loop first, to show the low-level view of what happens in training, and then
+we will show how to train using `optim`.
+
+<a name="nn.DoItYourself"></a>
+## Example of manual training of a neural network ##
+
+We show an example here on a classical XOR problem.
+
+__Neural Network__
+
+We create a simple neural network with one hidden layer.
+```lua
+require "nn"
+mlp = nn.Sequential();  -- make a multi-layer perceptron
+inputs = 2; outputs = 1; HUs = 20; -- parameters
+mlp:add(nn.Linear(inputs, HUs))
+mlp:add(nn.Tanh())
+mlp:add(nn.Linear(HUs, outputs))
+```
+
+__Loss function__
+
+We choose the Mean Squared Error criterion:
+```lua
+criterion = nn.MSECriterion()
+```
+
+__Training__
+
+We create data _on the fly_ and feed it to the neural network.
+
+```lua
+for i = 1,2500 do
+  -- random sample
+  local input= torch.randn(2);     -- normally distributed example in 2d
+  local output= torch.Tensor(1);
+  if input[1]*input[2] > 0 then  -- calculate label for XOR function
+    output[1] = -1
+  else
+    output[1] = 1
+  end
+
+  -- feed it to the neural network and the criterion
+  criterion:forward(mlp:forward(input), output)
+
+  -- train over this example in 3 steps
+  -- (1) zero the accumulation of the gradients
+  mlp:zeroGradParameters()
+  -- (2) accumulate gradients
+  mlp:backward(input, criterion:backward(mlp.output, output))
+  -- (3) update parameters with a 0.01 learning rate
+  mlp:updateParameters(0.01)
+end
+```
+
+__Test the network__
+
+```lua
+x = torch.Tensor(2)
+x[1] =  0.5; x[2] =  0.5; print(mlp:forward(x))
+x[1] =  0.5; x[2] = -0.5; print(mlp:forward(x))
+x[1] = -0.5; x[2] =  0.5; print(mlp:forward(x))
+x[1] = -0.5; x[2] = -0.5; print(mlp:forward(x))
+```
+
+You should see something like:
+```lua
+> x = torch.Tensor(2)
+> x[1] =  0.5; x[2] =  0.5; print(mlp:forward(x))
+
+-0.6140
+[torch.Tensor of dimension 1]
+
+> x[1] =  0.5; x[2] = -0.5; print(mlp:forward(x))
+
+ 0.8878
+[torch.Tensor of dimension 1]
+
+> x[1] = -0.5; x[2] =  0.5; print(mlp:forward(x))
+
+ 0.8548
+[torch.Tensor of dimension 1]
+
+> x[1] = -0.5; x[2] = -0.5; print(mlp:forward(x))
+
+-0.5498
+[torch.Tensor of dimension 1]
+```
+
+<a name="nn.TrainingWithOptim"></a>
+## Training using optim ##
+
+[optim](https://github.com/torch/optim) is the standard way of training Torch7 neural networks.
+
+`optim` is a quite general optimizer, for minimizing any function with respect to a set
+of parameters.  In our case, our
+function will be the loss of our network, given an input, and a set of weights.  The goal of training 
+a neural net is to
+optimize the weights to give the lowest loss over our training set of input data.  So, we are going to use optim
+to minimize the loss with respect to the weights, over our training set.  We will feed the data to 
+`optim` in minibatches.  For this particular example, we will use just one minibatch, but in your own training
+you will almost certainly want to break your training set into minibatches, and feed each minibatch to `optim`,
+one by one.
+
+We need to give `optim` a function that will output the loss and the derivative of the loss with respect to the
+weights, given the current weights, as a function parameter.  The function will have access to our training minibatch, and use this
+to calculate the loss, for this minibatch.  Typically, the function would be defined inside our loop over
+batches, and therefore have access to the current minibatch data.
+
+Here's how this looks:
+
+__Neural Network__
+
+We create a simple neural network with one hidden layer.
+```lua
+require 'nn'
+
+local model = nn.Sequential();  -- make a multi-layer perceptron
+local inputs = 2; local outputs = 1; local HUs = 20; -- parameters
+model:add(nn.Linear(inputs, HUs))
+model:add(nn.Tanh())
+model:add(nn.Linear(HUs, outputs))
+```
+
+__Criterion__
+
+We choose the Mean Squared Error loss criterion:
+```lua
+local criterion = nn.MSECriterion()
+```
+
+We are using an `nn.MSECriterion` because we are training on a regression task, predicting float target values.
+For a classification task, we would add an `nn.LogSoftMax()` layer to the end of our
+network, and use a `nn.ClassNLLCriterion` loss criterion.
+
+__Dataset__
+
+We will just create one minibatch of 128 examples.  In your own networks, you'd want to break down your
+rather larger dataset into multiple minibatches, of around 32-512 examples each.
+
+```lua
+local batchSize = 128
+local batchInputs = torch.Tensor(batchSize, inputs)
+local batchLabels = torch.DoubleTensor(batchSize)
+
+for i=1,batchSize do
+  local input = torch.randn(2)     -- normally distributed example in 2d
+  local label = 1
+  if input[1]*input[2]>0 then     -- calculate label for XOR function
+    label = -1;
+  end
+  batchInputs[i]:copy(input)
+  batchLabels[i] = label
+end
+```
+
+__Flatten Parameters__
+
+`optim` expects the parameters that are to be optimized, and their gradients, to be one-dimensional tensors.
+But, our network model contains probably multiple modules, typically multiple convolutional layers, and each
+of these layers has their own weight and bias tensors.  How to handle this?
+
+It is simple: we can call a standard method `:getParameters()`, that is defined for any network module.  When
+we call this method, the following magic will happen:
+- a new tensor will be created, large enough to hold all the weights and biases of the entire network model
+- the model weight and bias tensors are replaced with views onto the new contiguous parameter tensor
+- and the exact same thing will happen for all the gradient tensors: replaced with views onto one single
+contiguous gradient tensor
+
+We can call this method as follows:
+```lua
+local params, gradParams = model:getParameters()
+```
+
+These flattened tensors have the following characteristics:
+- to `optim`, the parameters it needs to optimize are all contained in one single one-dimensional tensor
+- when `optim` optimizes the parameters in this large one-dimensional tensor, it is implicitly optimizing
+the weights and biases in our network model, since those are now simply views onto this large one-dimensional
+parameter tensor.
+
+It will look something like this:
+
+![Parameter Flattening](image/parameterflattening.png?raw=true "Parameter Flattening")
+
+Note that flattening the parameters redefines the weight and bias tensors for all the network modules
+in our network model.  Therefore, any pre-existing references to the original model layer weight and bias tensors
+will no longer point to the model weight and bias tensors, after flattening.
+
+__Training__
+
+Now that we have created our model, our training set, and prepared the flattened network parameters,
+we can run training, using `optim`.  `optim` provides [various training algorithms](https://github.com/torch/optim/blob/master/doc/index.md).  We
+will use the stochastic gradient descent algorithm [sgd](https://github.com/torch/optim/blob/master/doc/index.md#x-sgdopfunc-x-state).  We
+need to provide the learning rate, via an optimization state table:
+
+```lua
+local optimState = {learningRate=0.01}
+```
+
+We define an evaluation function, inside our training loop, and use `optim.sgd` to run training:
+```lua
+require 'optim'
+
+for epoch=1,50 do
+  -- local function we give to optim
+  -- it takes current weights as input, and outputs the loss
+  -- and the gradient of the loss with respect to the weights
+  -- gradParams is calculated implicitly by calling 'backward',
+  -- because the model's weight and bias gradient tensors
+  -- are simply views onto gradParams
+  local function feval(params)
+    gradParams:zero()
+
+    local outputs = model:forward(batchInputs)
+    local loss = criterion:forward(outputs, batchLabels)
+    local dloss_doutput = criterion:backward(outputs, batchLabels)
+    model:backward(batchInputs, dloss_doutput)
+
+    return loss,gradParams
+  end
+  optim.sgd(feval, params, optimState)
+end
+```
+__Test the network__
+
+For the prediction task, we will also typically use minibatches, although we can run prediction sample by
+sample too.  In this example, we will predict sample by sample.  To run prediction on a minibatch, simply
+pass in a tensor with one additional dimension, which represents the sample index.
+
+```lua
+x = torch.Tensor(2)
+x[1] =  0.5; x[2] =  0.5; print(model:forward(x))
+x[1] =  0.5; x[2] = -0.5; print(model:forward(x))
+x[1] = -0.5; x[2] =  0.5; print(model:forward(x))
+x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
+```
+
+You should see something like:
+```lua
+> x = torch.Tensor(2)
+> x[1] =  0.5; x[2] =  0.5; print(model:forward(x))
+
+-0.3490
+[torch.Tensor of dimension 1]
+
+> x[1] =  0.5; x[2] = -0.5; print(model:forward(x))
+
+ 1.0561
+[torch.Tensor of dimension 1]
+
+> x[1] = -0.5; x[2] =  0.5; print(model:forward(x))
+
+ 0.8640
+[torch.Tensor of dimension 1]
+
+> x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
+
+-0.2941
+[torch.Tensor of dimension 1]
+```
+
+If we were running on a GPU, we would probably want to predict using minibatches, because this will
+hide the latencies involved in transferring data from main memory to the GPU.  To predict
+on a minibatch, we could do something like:
+
+```lua
+local x = torch.Tensor({
+  {0.5, 0.5},
+  {0.5, -0.5},
+  {-0.5, 0.5},
+  {-0.5, -0.5}
+})
+print(model:forward(x))
+```
+You should see something like:
+```lua
+> print(model:forward(x))
+ -0.3490
+ 1.0561
+ 0.8640
+ -0.2941
+[torch.Tensor of size 4]
+```
+
+That's it! For minibatched prediction, the output tensor contains one value for each of our input data samples.
+
diff --git a/doc/transfer.md b/doc/transfer.md
new file mode 100644
index 0000000..c1dfc80
--- /dev/null
+++ b/doc/transfer.md
@@ -0,0 +1,382 @@
+<a name="nn.transfer.dok"></a>
+# Transfer Function Layers #
+Transfer functions are normally used to introduce a non-linearity after a parameterized layer like [Linear](simple.md#nn.Linear) and  [SpatialConvolution](convolution.md#nn.SpatialConvolution). Non-linearities allow dividing the problem space into more complex regions than what a simple logistic regressor would permit.
+
+<a name="nn.HardTanh"></a>
+## HardTanh ##
+
+Applies the `HardTanh` function element-wise to the input Tensor,
+thus outputting a Tensor of the same dimension.
+
+`HardTanh` is defined as:
+
+  * `f(x)` = `1, if x >`  `1,`
+  * `f(x)` = `-1, if x <`  `-1,`
+  * `f(x)` = `x,` `otherwise.`
+
+The range of the linear region `[-1 1]` can be adjusted by specifying arguments in declaration, for example `nn.HardTanh(min_value, max_value)`.
+Otherwise, `[min_value max_value]` is set to `[-1 1]` by default.
+
+
+```lua
+ii=torch.linspace(-2,2)
+m=nn.HardTanh()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/htanh.png)
+
+
+<a name="nn.HardShrink"></a>
+## HardShrink ##
+
+`module = nn.HardShrink(lambda)`
+
+Applies the hard shrinkage function element-wise to the input
+[Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md). The output is the same size as the input.
+
+`HardShrinkage` operator is defined as:
+
+  * `f(x) = x, if x > lambda`
+  * `f(x) = x, if x < -lambda`
+  * `f(x) = 0, otherwise`
+
+```lua
+ii=torch.linspace(-2,2)
+m=nn.HardShrink(0.85)
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/hshrink.png)
+
+<a name="nn.SoftShrink"></a>
+## SoftShrink ##
+
+`module = nn.SoftShrink(lambda)`
+
+Applies the soft shrinkage function element-wise to the input
+[Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md). The output is the same size as the input.
+
+`SoftShrinkage` operator is defined as:
+
+  * `f(x) = x-lambda, if x > lambda`
+  * `f(x) = x+lambda, if x < -lambda`
+  * `f(x) = 0, otherwise`
+
+```lua
+ii=torch.linspace(-2,2)
+m=nn.SoftShrink(0.85)
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/sshrink.png)
+
+
+<a name="nn.SoftMax"></a>
+## SoftMax ##
+
+Applies the `Softmax` function to an n-dimensional input Tensor,
+rescaling them so that the elements of the n-dimensional output Tensor
+lie in the range (0,1) and sum to 1.
+
+`Softmax` is defined as `f_i(x)` = `exp(x_i-shift) / sum_j exp(x_j-shift)`,
+where `shift` = `max_i x_i`.
+
+
+```lua
+ii=torch.exp(torch.abs(torch.randn(10)))
+m=nn.SoftMax()
+oo=m:forward(ii)
+gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'})
+gnuplot.grid(true)
+```
+![](image/softmax.png)
+
+Note that this module doesn't work directly with [ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion), which expects the `nn.Log` to be computed between the `SoftMax` and itself. Use [LogSoftMax](#nn.LogSoftMax) instead (it's faster).
+
+<a name="nn.SoftMin"></a>
+## SoftMin ##
+
+Applies the `Softmin` function to an n-dimensional input Tensor,
+rescaling them so that the elements of the n-dimensional output Tensor
+lie in the range (0,1) and sum to 1.
+
+`Softmin` is defined as `f_i(x)` = `exp(-x_i-shift) / sum_j exp(-x_j-shift)`,
+where `shift` = `max_i -x_i`.
+
+
+```lua
+ii=torch.exp(torch.abs(torch.randn(10)))
+m=nn.SoftMin()
+oo=m:forward(ii)
+gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'})
+gnuplot.grid(true)
+```
+![](image/softmin.png)
+
+<a name="nn.SoftPlus"></a>
+### SoftPlus ###
+
+Applies the `SoftPlus` function to an n-dimensional input Tensor.
+`SoftPlus` is a smooth approximation to the [ReLU](#nn.ReLU) function and can be used to constrain the output of a machine to always be positive. For numerical stability the implementation reverts to the linear function for inputs above a certain value (20 by default).
+
+`SoftPlus` is defined as `f_i(x)` = `1/beta * log(1 + exp(beta * x_i))`.
+
+```lua
+ii=torch.linspace(-3,3)
+m=nn.SoftPlus()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/softplus.png)
+
+<a name="nn.SoftSign"></a>
+## SoftSign ##
+
+Applies the `SoftSign` function to an n-dimensional input Tensor.
+
+`SoftSign` is defined as `f_i(x) = x_i / (1+|x_i|)`
+
+```lua
+ii=torch.linspace(-5,5)
+m=nn.SoftSign()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/softsign.png)
+
+<a name="nn.LogSigmoid"></a>
+## LogSigmoid ##
+
+Applies the `LogSigmoid` function to an n-dimensional input Tensor.
+
+`LogSigmoid` is defined as `f_i(x)` = `log(1/(1+ exp(-x_i)))`.
+
+
+```lua
+ii=torch.randn(10)
+m=nn.LogSigmoid()
+oo=m:forward(ii)
+go=torch.ones(10)
+gi=m:backward(ii,go)
+gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'},{'gradInput',gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/logsigmoid.png)
+
+
+<a name="nn.LogSoftMax"></a>
+## LogSoftMax ##
+
+Applies the `LogSoftmax` function to an n-dimensional input Tensor.
+
+`LogSoftmax` is defined as `f_i(x)` = `log(1/a exp(x_i))`,
+where  `a` = `sum_j exp(x_j)`.
+
+```lua
+ii=torch.randn(10)
+m=nn.LogSoftMax()
+oo=m:forward(ii)
+go=torch.ones(10)
+gi=m:backward(ii,go)
+gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'},{'gradInput',gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/logsoftmax.png)
+
+<a name="nn.Sigmoid"></a>
+## Sigmoid ##
+
+Applies the `Sigmoid` function element-wise to the input Tensor,
+thus outputting a Tensor of the same dimension.
+
+`Sigmoid` is defined as `f(x)` = `1/(1+exp(-x))`.
+
+```lua
+ii=torch.linspace(-5,5)
+m=nn.Sigmoid()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/sigmoid.png)
+
+<a name="nn.Tanh"></a>
+## Tanh ##
+
+Applies the `Tanh` function element-wise to the input Tensor,
+thus outputting a Tensor of the same dimension.
+
+`Tanh` is defined as `f(x)` = `(exp(x)-exp(-x))/(exp(x)+exp(-x))`.
+
+```lua
+ii=torch.linspace(-3,3)
+m=nn.Tanh()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/tanh.png)
+
+<a name="nn.ReLU"></a>
+## ReLU ##
+
+Applies the rectified linear unit (`ReLU`) function element-wise to the input Tensor,
+thus outputting a Tensor of the same dimension.
+
+`ReLU` is defined as `f(x)` = `max(0,x)`
+
+Can optionally do its operation in-place without using extra state memory:
+```lua
+m=nn.ReLU(true) -- true = in-place, false = keeping separate state.
+```
+
+```lua
+ii=torch.linspace(-3,3)
+m=nn.ReLU()
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/relu.png)
+
+<a name="nn.PReLU"></a>
+## PReLU ##
+
+Applies parametric ReLU, whose parameter varies the slope of the negative part:
+
+`PReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`
+
+When called without a number argument, as ```nn.PReLU()```, it uses the shared version, meaning
+it has only one parameter. Otherwise, if called as ```nn.PReLU(nOutputPlane)```, it has ```nOutputPlane```
+parameters, one for each input map. The output dimension is always equal to the input dimension.
+Note that weight decay should not be used on it. For reference see [Delving Deep into Rectifiers](http://arxiv.org/abs/1502.01852).
+
+![](image/prelu.png)
+
+<a name="nn.RReLU"></a>
+## RReLU ##
+
+Applies the randomized leaky rectified linear unit (RReLU) element-wise to the input tensor, thus outputting a tensor of the same dimension. Informally the RReLU is also known as 'insanity' layer.
+
+`RReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`, where `a` ~ `U(l,u)`.
+
+In training mode negative inputs are multiplied by a factor `a` drawn from a uniform random distribution `U(l, u)`. In evaluation mode a RReLU behaves like a LeakyReLU with a constant mean factor `a` = `(l+u)/2`.
+
+Syntax:
+```lua
+m=nn.RReLU(
+   l,       -- minimum factor for negative inputs, default: 1/8;
+   u,       -- maximum factor for negative inputs, default: 1/3;
+   inplace  -- if true the result will be written to the input tensor, default: false;
+)
+```
+If `l == u` a RReLU effectively becomes a LeakyReLU. Regardless of operating in in-place mode a RReLU will internally allocate an input-sized `noise` tensor to store random factors for negative inputs. The backward() operation assumes that forward() has been called before.
+
+For reference see [Empirical Evaluation of Rectified Activations in Convolutional Network](http://arxiv.org/abs/1505.00853).
+```lua
+ii=torch.linspace(-3, 3)
+m=nn.RReLU()
+oo=m:forward(ii):clone()
+gi=m:backward(ii,torch.ones(100))
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/rrelu.png)
+
+<a name="nn.ELU"></a>
+## ELU ##
+
+Applies the exponential linear unit (ELU), whose parameter `a` sets the convergence value of the exponential function below zero:
+
+`ELU` is defined as `f(x)` = `max(0,x) + min(0,a*(exp(x)-1))`
+
+It is called with the parameter a as ```nn.ELU(a)``` with the default value `a=1`. The output dimension is always equal to input dimension.
+
+For reference see [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](http://arxiv.org/abs/1511.07289).
+```lua
+require 'nn'
+require 'gnuplot'
+
+xs = torch.linspace(-3,3,200)
+go = torch.ones(xs:size(1))
+function f(a) return nn.ELU(a):forward(xs) end
+function df(a) local m = nn.ELU(a) m:forward(xs) return m:backward(xs, go) end
+
+gnuplot.plot({'fw ELU, alpha=0.1', xs,  f(0.1), '-'},
+             {'fw ELU, alpha=1.0', xs,  f(1.0), '-'},
+             {'bw ELU, alpha=0.1', xs, df(0.1), '-'},
+             {'bw ELU, alpha=1.0', xs, df(1.0), '-'})
+gnuplot.grid(true)
+```
+![](image/elu.png)
+
+<a name="nn.LeakyReLU"></a>
+## LeakyReLU ##
+
+Applies Leaky ReLU, whose parameter `a` sets the slope of the negative part:
+
+`LeakyReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`
+
+Can optionally do its operation in-place without using extra state memory:
+
+```lua
+m=nn.LeakyReLU(a,true) -- true = in-place, false = keeping separate state.
+```
+
+<a name="nn.SpatialSoftMax"></a>
+## SpatialSoftMax ##
+
+Applies [SoftMax](#nn.SoftMax) over features to each spatial location (height x width of planes).
+The module accepts 1D (vector), 2D (batch of vectors), 3D (vectors in space) or 4D (batch of vectors in space) tensor as input.
+Functionally it is equivalent to [SoftMax](#nn.SoftMax) when 1D or 2D input is used.
+The output dimension is always the same as input dimension.
+
+```lua
+ii=torch.randn(4,8,16,16)  -- batchSize x features x height x width
+m=nn.SpatialSoftMax()
+oo = m:forward(ii)
+```
+
+<a name="nn.AddConstant"></a>
+## AddConstant ##
+
+Adds a (non-learnable) scalar constant.  This module is sometimes useful for debugging purposes:  `f(x)` = `x + k`, where `k` is a scalar.
+
+Can optionally do its operation in-place without using extra state memory:
+```lua
+m=nn.AddConstant(k,true) -- true = in-place, false = keeping separate state.
+```
+In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [MulConstant](#nn.MulConstant).
+
+<a name="nn.MulConstant"></a>
+## MulConstant ##
+
+Multiplies input tensor by a (non-learnable) scalar constant.  This module is sometimes useful for debugging purposes:  `f(x)` = `k * x`, where `k` is a scalar.
+
+Can optionally do its operation in-place without using extra state memory:
+```lua
+m=nn.MulConstant(k,true) -- true = in-place, false = keeping separate state.
+```
+In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [AddConstant](#nn.AddConstant).
diff --git a/hessian.lua b/hessian.lua
new file mode 100644
index 0000000..4d3afa3
--- /dev/null
+++ b/hessian.lua
@@ -0,0 +1,391 @@
+----------------------------------------------------------------------
+-- hessian.lua: this file appends extra methods to modules in nn,
+-- to estimate diagonal elements of the Hessian. This is useful
+-- to condition learning rates individually.
+----------------------------------------------------------------------
+nn.hessian = {}
+
+----------------------------------------------------------------------
+-- Hessian code is still experimental,
+-- and deactivated by default
+----------------------------------------------------------------------
+function nn.hessian.enable()
+
+   local function accDiagHessianParameters(module, input, diagHessianOutput, gw, hw)
+      if #gw ~= #hw then
+         error('Number of gradients is not equal to number of hessians')
+      end
+      module.inputSq = module.inputSq or input.new()
+      module.inputSq:resizeAs(input)
+      torch.cmul(module.inputSq, input, input)
+      -- replace gradients with hessian
+      for i=1,#gw do
+         local gwname = gw[i]
+         local hwname = hw[i]
+         local gwval = module[gwname]
+         local hwval = module[hwname]
+         if hwval == nil then
+            module[hwname] = gwval.new():resizeAs(gwval)
+            hwval = module[hwname]
+         end
+         module[gwname] = hwval
+         module[hwname] = gwval
+      end
+      local oldOutput = module.output
+      module.output = module.output.new():resizeAs(oldOutput)
+      module.forward(module, module.inputSq)
+      module.accGradParameters(module, module.inputSq, diagHessianOutput, 1)
+      -- put back gradients
+      for i=1,#gw do
+         local gwname = gw[i]
+         local hwname = hw[i]
+         local gwval = module[gwname]
+         local hwval = module[hwname]
+         module[gwname] = hwval
+         module[hwname] = gwval
+      end
+      module.output = oldOutput
+   end
+   nn.hessian.accDiagHessianParameters = accDiagHessianParameters
+
+   local function updateDiagHessianInput(module, input, diagHessianOutput, w, wsq)
+      if #w ~= #wsq then
+         error('Number of weights is not equal to number of weights squares')
+      end
+      module.diagHessianInput = module.diagHessianInput or input.new()
+      module.diagHessianInput:resizeAs(input)
+
+      local gi = module.gradInput
+      module.gradInput = module.diagHessianInput
+      for i=1,#w do
+         local wname = w[i]
+         local wsqname = wsq[i]
+         local wval = module[wname]
+         local wsqval = module[wsqname]
+         if wsqval == nil then
+            module[wsqname] = wval.new()
+            wsqval = module[wsqname]
+         end
+         wsqval:resizeAs(wval)
+         torch.cmul(wsqval, wval, wval)
+         module[wsqname] = wval
+         module[wname] = wsqval
+      end
+      module.updateGradInput(module,input,diagHessianOutput)
+      for i=1,#w do
+         local wname = w[i]
+         local wsqname = wsq[i]
+         local wval = module[wname]
+         local wsqval = module[wsqname]
+         module[wname] = wsqval
+         module[wsqname] = wval
+      end
+      module.gradInput = gi
+   end
+   nn.hessian.updateDiagHessianInput = updateDiagHessianInput
+
+   local function updateDiagHessianInputPointWise(module, input, diagHessianOutput)
+      local tdh = diagHessianOutput.new():resizeAs(diagHessianOutput):fill(1)
+      updateDiagHessianInput(module,input,tdh,{},{})
+      module.diagHessianInput:cmul(module.diagHessianInput)
+      module.diagHessianInput:cmul(diagHessianOutput)
+   end
+   nn.hessian.updateDiagHessianInputPointWise = updateDiagHessianInputPointWise
+
+   local function initDiagHessianParameters(module,gw,hw)
+      module.diagHessianInput = module.diagHessianInput or module.gradInput.new();
+      for i=1,#gw do
+         module[hw[i]] = module[hw[i]] or module[gw[i]].new():resizeAs(module[gw[i]])
+      end
+   end
+   nn.hessian.initDiagHessianParameters = initDiagHessianParameters
+
+   ----------------------------------------------------------------------
+   -- Module
+   ----------------------------------------------------------------------
+   function nn.Module.updateDiagHessianInput(self, input, diagHessianOutput)
+      error(torch.typename(self) .. ':updateDiagHessianInput() is undefined')
+   end
+
+   function nn.Module.accDiagHessianParameters(self, input, diagHessianOutput)
+   end
+
+   function nn.Module.initDiagHessianParameters()
+   end
+
+   ----------------------------------------------------------------------
+   -- Sequential
+   ----------------------------------------------------------------------
+   function nn.Sequential.initDiagHessianParameters(self)
+      for i=1,#self.modules do
+         self.modules[i]:initDiagHessianParameters()
+      end
+   end
+
+   function nn.Sequential.updateDiagHessianInput(self, input, diagHessianOutput)
+      local currentDiagHessianOutput = diagHessianOutput
+      local currentModule = self.modules[#self.modules]
+      for i=#self.modules-1,1,-1 do
+         local previousModule = self.modules[i]
+         currentDiagHessianOutput = currentModule:updateDiagHessianInput(previousModule.output, currentDiagHessianOutput)
+         currentModule = previousModule
+      end
+      currentDiagHessianOutput = currentModule:updateDiagHessianInput(input, currentDiagHessianOutput)
+      self.diagHessianInput = currentDiagHessianOutput
+      return currentDiagHessianOutput
+   end
+
+   function nn.Sequential.accDiagHessianParameters(self, input, diagHessianOutput)
+      local currentDiagHessianOutput = diagHessianOutput
+      local currentModule = self.modules[#self.modules]
+      for i=#self.modules-1,1,-1 do
+         local previousModule = self.modules[i]
+         currentModule:accDiagHessianParameters(previousModule.output, currentDiagHessianOutput)
+         currentDiagHessianOutput = currentModule.diagHessianInput
+         currentModule = previousModule
+      end
+      currentModule:accDiagHessianParameters(input, currentDiagHessianOutput)
+   end
+
+   ----------------------------------------------------------------------
+   -- Criterion
+   ----------------------------------------------------------------------
+   function nn.Criterion.updateDiagHessianInput(self, input, diagHessianOutput)
+      error(torch.typename(self) .. ':updateDiagHessianInput() is undefined')
+   end
+
+   ----------------------------------------------------------------------
+   -- MSECriterion
+   ----------------------------------------------------------------------
+   function nn.MSECriterion.updateDiagHessianInput(self, input, target)
+      self.diagHessianInput = self.diagHessianInput or input.new()
+      local val = 2
+      if self.sizeAverage then
+         val = val / input:nElement()
+      end
+      self.diagHessianInput:resizeAs(input):fill(val)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- WeightedMSECriterion
+   ----------------------------------------------------------------------
+   function nn.WeightedMSECriterion.updateDiagHessianInput(self,input,target)
+      return nn.MSECriterion.updateDiagHessianInput(self,input,target)
+   end
+
+   ----------------------------------------------------------------------
+   -- L1Cost
+   ----------------------------------------------------------------------
+   function nn.L1Cost.updateDiagHessianInput(self,input)
+      self.diagHessianInput = self.diagHessianInput or input.new()
+      self.diagHessianInput:resizeAs(input)
+      self.diagHessianInput:fill(1)
+      self.diagHessianInput[torch.eq(input,0)] = 0
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- Linear
+   ----------------------------------------------------------------------
+   function nn.Linear.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.Linear.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.Linear.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+
+   ----------------------------------------------------------------------
+   -- SpatialConvolution
+   ----------------------------------------------------------------------
+   function nn.SpatialConvolution.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.SpatialConvolution.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.SpatialConvolution.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+   
+   ----------------------------------------------------------------------
+   -- SpatialConvolutionLocal
+   ----------------------------------------------------------------------
+   function nn.SpatialConvolutionLocal.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.SpatialConvolutionLocal.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.SpatialConvolutionLocal.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+
+   ----------------------------------------------------------------------
+   -- SpatialFullConvolution
+   ----------------------------------------------------------------------
+   function nn.SpatialFullConvolution.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.SpatialFullConvolution.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.SpatialFullConvolution.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+
+   ----------------------------------------------------------------------
+   -- SpatialConvolutionMap
+   ----------------------------------------------------------------------
+   function nn.SpatialConvolutionMap.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight','bias'}, {'weightSq','biasSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.SpatialConvolutionMap.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.SpatialConvolutionMap.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+
+   ----------------------------------------------------------------------
+   -- SpatialFullConvolutionMap
+   ----------------------------------------------------------------------
+   function nn.SpatialFullConvolutionMap.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+      return self.diagHessianInput
+   end
+
+   function nn.SpatialFullConvolutionMap.accDiagHessianParameters(self, input, diagHessianOutput)
+      accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+   end
+
+   function nn.SpatialFullConvolutionMap.initDiagHessianParameters(self)
+      initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+   end
+
+   ----------------------------------------------------------------------
+   -- Tanh
+   ----------------------------------------------------------------------
+   function nn.Tanh.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- TanhShrink
+   ----------------------------------------------------------------------
+   function nn.TanhShrink.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInputPointWise(self.tanh, input, diagHessianOutput)
+      self.diagHessianInput = self.diagHessianInput or input.new():resizeAs(input)
+      torch.add(self.diagHessianInput, self.tanh.diagHessianInput, diagHessianOutput)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- Square
+   ----------------------------------------------------------------------
+   function nn.Square.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- Sqrt
+   ----------------------------------------------------------------------
+   function nn.Sqrt.updateDiagHessianInput(self, input, diagHessianOutput)
+      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- Reshape
+   ----------------------------------------------------------------------
+   function nn.Reshape.updateDiagHessianInput(self, input, diagHessianOutput)
+      self.diagHessianInput = self.diagHessianInput or input.new()
+      diagHessianOutput = diagHessianOutput:contiguous()
+      self.diagHessianInput:set(diagHessianOutput):resizeAs(input)
+      return self.diagHessianInput
+   end
+
+   ----------------------------------------------------------------------
+   -- Parameters manipulation:
+   -- we modify these functions such that they return hessian coefficients
+   ----------------------------------------------------------------------
+   function nn.Module.parameters(self)
+      if self.weight and self.bias then
+         return {self.weight, self.bias}, {self.gradWeight, self.gradBias}, {self.diagHessianWeight, self.diagHessianBias}
+      elseif self.weight then
+         return {self.weight}, {self.gradWeight}, {self.diagHessianWeight}
+      elseif self.bias then
+         return {self.bias}, {self.gradBias}, {self.diagHessianBias}
+      else
+         return
+      end
+   end
+
+   function nn.Module.getParameters(self)
+      -- get parameters
+      local parameters,gradParameters,hessianParameters = self:parameters()
+      -- flatten parameters and gradients
+      local flatParameters = nn.Module.flatten(parameters)
+      collectgarbage()
+      local flatGradParameters = nn.Module.flatten(gradParameters)
+      collectgarbage()
+      local flatHessianParameters
+      if hessianParameters and hessianParameters[1] then
+         flatHessianParameters = nn.Module.flatten(hessianParameters)
+         collectgarbage()
+      end
+
+      -- return new flat vector that contains all discrete parameters
+      return flatParameters, flatGradParameters, flatHessianParameters
+   end
+
+   function nn.Sequential.parameters(self)
+      local function tinsert(to, from)
+         if type(from) == 'table' then
+            for i=1,#from do
+               tinsert(to,from[i])
+            end
+         else
+            table.insert(to,from)
+         end
+      end
+      local w = {}
+      local gw = {}
+      local ggw = {}
+      for i=1,#self.modules do
+         local mw,mgw,mggw = self.modules[i]:parameters()
+         if mw then
+            tinsert(w,mw)
+            tinsert(gw,mgw)
+            tinsert(ggw,mggw)
+         end
+      end
+      return w,gw,ggw
+   end
+
+   ----------------------------------------------------------------------
+   -- Avoid multiple calls to enable()
+   ----------------------------------------------------------------------
+   function nn.hessian.enable()
+   end
+end
diff --git a/init.lua b/init.lua
new file mode 100644
index 0000000..516f29b
--- /dev/null
+++ b/init.lua
@@ -0,0 +1,173 @@
+require('torch')
+
+nn = {} -- define the global nn table
+
+require('nn.THNN')
+
+require('nn.utils')
+
+
+require('nn.ErrorMessages')
+require('nn.Module')
+
+require('nn.Container')
+require('nn.Concat')
+require('nn.Parallel')
+require('nn.Sequential')
+require('nn.DepthConcat')
+
+require('nn.Linear')
+require('nn.Bilinear')
+require('nn.PartialLinear')
+require('nn.SparseLinear')
+require('nn.Reshape')
+require('nn.View')
+require('nn.Contiguous')
+require('nn.Select')
+require('nn.Narrow')
+require('nn.Index')
+require('nn.Squeeze')
+require('nn.Unsqueeze')
+require('nn.Replicate')
+require('nn.Transpose')
+require('nn.BatchNormalization')
+require('nn.Padding')
+require('nn.GradientReversal')
+require('nn.MaskedSelect')
+
+require('nn.Copy')
+require('nn.Min')
+require('nn.Max')
+require('nn.Sum')
+require('nn.Mean')
+require('nn.CMul')
+require('nn.Mul')
+require('nn.MulConstant')
+require('nn.Add')
+require('nn.AddConstant')
+require('nn.Dropout')
+require('nn.SpatialDropout')
+require('nn.VolumetricDropout')
+
+require('nn.CAddTable')
+require('nn.CDivTable')
+require('nn.CMulTable')
+require('nn.CSubTable')
+
+require('nn.Euclidean')
+require('nn.WeightedEuclidean')
+require('nn.PairwiseDistance')
+require('nn.CosineDistance')
+require('nn.DotProduct')
+require('nn.Normalize')
+require('nn.Cosine')
+
+require('nn.Exp')
+require('nn.Log')
+require('nn.HardTanh')
+require('nn.Clamp')
+require('nn.LogSigmoid')
+require('nn.LogSoftMax')
+require('nn.Sigmoid')
+require('nn.SoftMax')
+require('nn.SoftMin')
+require('nn.SoftPlus')
+require('nn.SoftSign')
+require('nn.Tanh')
+require('nn.TanhShrink')
+require('nn.Abs')
+require('nn.Power')
+require('nn.Square')
+require('nn.Sqrt')
+require('nn.HardShrink')
+require('nn.SoftShrink')
+require('nn.Threshold')
+require('nn.ReLU')
+require('nn.PReLU')
+require('nn.LeakyReLU')
+require('nn.SpatialSoftMax')
+require('nn.RReLU')
+require('nn.ELU')
+
+require('nn.LookupTable')
+require('nn.SpatialConvolution')
+require('nn.SpatialConvolutionLocal')
+require('nn.SpatialFullConvolution')
+require('nn.SpatialFullConvolutionMap')
+require('nn.SpatialConvolutionMM')
+require('nn.SpatialConvolutionMap')
+require('nn.SpatialDilatedConvolution')
+require('nn.SpatialSubSampling')
+require('nn.SpatialMaxPooling')
+require('nn.SpatialMaxUnpooling')
+require('nn.SpatialFractionalMaxPooling')
+require('nn.SpatialLPPooling')
+require('nn.SpatialAveragePooling')
+require('nn.SpatialAdaptiveMaxPooling')
+require('nn.TemporalConvolution')
+require('nn.TemporalSubSampling')
+require('nn.TemporalMaxPooling')
+require('nn.SpatialSubtractiveNormalization')
+require('nn.SpatialDivisiveNormalization')
+require('nn.SpatialContrastiveNormalization')
+require('nn.SpatialCrossMapLRN')
+require('nn.SpatialZeroPadding')
+require('nn.SpatialReflectionPadding')
+require('nn.SpatialReplicationPadding')
+require('nn.SpatialUpSamplingNearest')
+require('nn.SpatialBatchNormalization')
+
+require('nn.VolumetricConvolution')
+require('nn.VolumetricFullConvolution')
+require('nn.VolumetricMaxPooling')
+require('nn.VolumetricMaxUnpooling')
+require('nn.VolumetricAveragePooling')
+require('nn.VolumetricBatchNormalization')
+
+require('nn.ParallelTable')
+require('nn.Identity')
+require('nn.ConcatTable')
+require('nn.SplitTable')
+require('nn.JoinTable')
+require('nn.SelectTable')
+require('nn.MixtureTable')
+require('nn.CriterionTable')
+require('nn.FlattenTable')
+require('nn.NarrowTable')
+
+require('nn.Criterion')
+require('nn.MSECriterion')
+require('nn.SmoothL1Criterion')
+require('nn.MarginCriterion')
+require('nn.SoftMarginCriterion')
+require('nn.AbsCriterion')
+require('nn.ClassNLLCriterion')
+require('nn.SpatialClassNLLCriterion')
+require('nn.ClassSimplexCriterion')
+require('nn.DistKLDivCriterion')
+require('nn.MultiCriterion')
+require('nn.L1HingeEmbeddingCriterion')
+require('nn.HingeEmbeddingCriterion')
+require('nn.CosineEmbeddingCriterion')
+require('nn.MarginRankingCriterion')
+require('nn.MultiMarginCriterion')
+require('nn.MultiLabelMarginCriterion')
+require('nn.MultiLabelSoftMarginCriterion')
+require('nn.L1Cost')
+require('nn.L1Penalty')
+require('nn.WeightedMSECriterion')
+require('nn.BCECriterion')
+require('nn.CrossEntropyCriterion')
+require('nn.ParallelCriterion')
+
+require('nn.StochasticGradient')
+
+require('nn.MM')
+require('nn.MV')
+
+require('nn.Jacobian')
+require('nn.SparseJacobian')
+require('nn.hessian')
+require('nn.test')
+
+return nn
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..ac30ce1
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1,5 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+SET(THNN_INSTALL_LIB_SUBDIR "${Torch_INSTALL_LUA_CPATH_SUBDIR}")
+SET(THNN_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}")
+ADD_SUBDIRECTORY(THNN)
\ No newline at end of file
diff --git a/lib/THNN/CMakeLists.txt b/lib/THNN/CMakeLists.txt
new file mode 100644
index 0000000..b221d59
--- /dev/null
+++ b/lib/THNN/CMakeLists.txt
@@ -0,0 +1,65 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+
+IF(NOT Torch_FOUND)
+  FIND_PACKAGE(Torch REQUIRED)
+ENDIF()
+
+IF(NOT THNN_INSTALL_LIB_SUBDIR)
+  SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
+ENDIF()
+
+# Flags
+# When using MSVC
+IF(MSVC)
+  # we want to respect the standard, and we are bored of those **** .
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ENDIF(MSVC)
+
+IF (CMAKE_VERSION VERSION_LESS "3.1")
+  SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
+ELSE ()
+  SET(CMAKE_C_STANDARD 99)
+ENDIF ()
+
+# OpenMP support?
+SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+  IF (DARWIN_VERSION GREATER 9)
+    SET(APPLE_OPENMP_SUCKS 1)
+  ENDIF (DARWIN_VERSION GREATER 9)
+  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+    OUTPUT_VARIABLE GCC_VERSION)
+  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
+    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+  ENDIF ()
+ENDIF ()
+
+IF (WITH_OPENMP)
+  FIND_PACKAGE(OpenMP)
+  IF(OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
+
+SET(src init.c)
+ADD_LIBRARY(THNN MODULE init.c)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+### Torch packages supposes libraries prefix is "lib"
+SET_TARGET_PROPERTIES(THNN PROPERTIES
+  PREFIX "lib"
+  IMPORT_PREFIX "lib")
+TARGET_LINK_LIBRARIES(THNN TH)
+
+INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR})
diff --git a/lib/THNN/README.md b/lib/THNN/README.md
new file mode 100644
index 0000000..e6c6160
--- /dev/null
+++ b/lib/THNN/README.md
@@ -0,0 +1,32 @@
+# THNN
+
+THNN is a library that gathers nn's C implementations of neural network modules. It's entirely free of Lua dependency and therefore can be used in any application that has a C FFI. Please note that it only contains quite low level functions, and an object oriented C/C++ wrapper will be created soon as another library.
+
+There is also a CUDA counterpart of THNN (THCUNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN).
+
+## Links
+
+* [API reference](doc/api_reference.md)
+* [Style guidelines](doc/style_guidelines.md)
+
+## Motivation
+
+Torch's neural network package (nn) provided many optimized C implementations of modules, but the source files contained Lua specific code and headers so they couldn't be easily compiled and included anywhere else.
+
+THNN is based on the same code, but is written in pure C, so it can be easily included in other code. **Future C implementations should be committed to THNN.**
+
+## API
+
+THNN is a purely functional library. It provides 2-3 functions for each module, that perform the most important operations:
+
+* **updateOutput** - applies the module to an input
+* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input
+* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters
+
+For information on argument types please check the [API reference](doc/api_reference.md).
+
+## Developer docs
+
+* [Style guidelines](doc/style_guidelines.md)
+
+This section will be expanded when the FFI refactoring is finished.
diff --git a/lib/THNN/THNN.h b/lib/THNN/THNN.h
new file mode 100644
index 0000000..9efcd46
--- /dev/null
+++ b/lib/THNN/THNN.h
@@ -0,0 +1,25 @@
+#ifndef THNN_H
+#define THNN_H
+
+#include <stdbool.h>
+#include <TH.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME)
+
+#define THIndexTensor THLongTensor
+#define THIndexTensor_(NAME) THLongTensor_ ## NAME
+
+#define THIntegerTensor THIntTensor
+#define THIntegerTensor_(NAME) THIntTensor_ ## NAME
+
+typedef long THIndex_t;
+typedef int THInteger_t;
+typedef void THNNState;
+
+#include "generic/THNN.h"
+#include <THGenerateFloatTypes.h>
+
+#endif
\ No newline at end of file
diff --git a/lib/THNN/doc/api_reference.md b/lib/THNN/doc/api_reference.md
new file mode 100644
index 0000000..830cc3d
--- /dev/null
+++ b/lib/THNN/doc/api_reference.md
@@ -0,0 +1,1509 @@
+# API docs
+
+This document only describes the THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc).
+
+### Note on function names
+
+Please remember that, because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions:
+
+* `void THNN_FloatAbs_updateOutput(...)`
+* `void THNN_DoubleAbs_updateOutput(...)`
+
+In these docs such a function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to the developer to add a type prefix. `real` is an alias for that type.
+
+### Argument types
+
+Some arguments have additional tags placed in square brackets:
+* **[OUT]** - This is the output argument. It will be reshaped if needed.
+* **[OPTIONAL]** - This argument is optional and can be safely set to NULL
+* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call.
+* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output).
+
+## Module list
+
+These are all modules implemented in THNN:
+
+* [Abs](#abs)
+* [AbsCriterion](#abscriterion)
+* [ClassNLLCriterion](#classnllcriterion)
+* [DistKLDivCriterion](#distkldivcriterion)
+* [ELU](#elu)
+* [HardShrink](#hardshrink)
+* [HardTanh](#hardtanh)
+* [L1Cost](#l1cost)
+* [LeakyReLU](#leakyrelu)
+* [LogSigmoid](#logsigmoid)
+* [LogSoftMax](#logsoftmax)
+* [LookupTable](#lookuptable)
+* [MSECriterion](#msecriterion)
+* [MarginCriterion](#margincriterion)
+* [MultiLabelMarginCriterion](#multilabelmargincriterion)
+* [MultiMarginCriterion](#multimargincriterion)
+* [PReLU](#prelu)
+* [RReLU](#rrelu)
+* [Sigmoid](#sigmoid)
+* [SmoothL1Criterion](#smoothl1criterion)
+* [SoftMax](#softmax)
+* [SoftPlus](#softplus)
+* [SoftShrink](#softshrink)
+* [SparseLinear](#sparselinear)
+* [SpatialAdaptiveMaxPooling](#spatialadaptivemaxpooling)
+* [SpatialAveragePooling](#spatialaveragepooling)
+* [SpatialBatchNormalization](#spatialbatchnormalization)
+* [SpatialConvolutionLocal](#spatialconvolutionlocal)
+* [SpatialConvolutionMM](#spatialconvolutionmm)
+* [SpatialConvolutionMap](#spatialconvolutionmap)
+* [SpatialFractionalMaxPooling](#spatialfractionalmaxpooling)
+* [SpatialFullConvolution](#spatialfullconvolution)
+* [SpatialFullConvolutionMap](#spatialfullconvolutionmap)
+* [SpatialMaxPooling](#spatialmaxpooling)
+* [SpatialMaxUnpooling](#spatialmaxunpooling)
+* [SpatialSubSampling](#spatialsubsampling)
+* [SpatialUpSamplingNearest](#spatialupsamplingnearest)
+* [Sqrt](#sqrt)
+* [Square](#square)
+* [Tanh](#tanh)
+* [Threshold](#threshold)
+* [VolumetricAveragePooling](#volumetricaveragepooling)
+* [VolumetricConvolution](#volumetricconvolution)
+* [VolumetricConvolutionMM](#volumetricconvolutionmm)
+* [VolumetricFullConvolution](#volumetricfullconvolution)
+* [VolumetricMaxPooling](#volumetricmaxpooling)
+* [VolumetricMaxUnpooling](#volumetricmaxunpooling)
+
+## Abs
+```C
+void THNN_Abs_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** Abs output
+<br/>
+```C
+void THNN_Abs_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+## AbsCriterion
+```C
+void THNN_AbsCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - tensor with target values
+<br/>
+`THTensor *output` - **[OUT]** a one-element tensor with loss
+<br/>
+`bool sizeAverage` - if true, the loss will be divided by batch size
+<br/>
+```C
+void THNN_AbsCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - tensor with target values
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`bool sizeAverage` - if true, the gradient will be normalized by batch size
+<br/>
+## ClassNLLCriterion
+```C
+void THNN_ClassNLLCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor (1D/2D)
+<br/>
+`THIndexTensor *target` - tensor containing indexes of target classes
+<br/>
+`THTensor *output` - **[OUT]** a one-element tensor with loss
+<br/>
+`bool sizeAverage` - if true, the loss will be normalized by batch size and class weights
+<br/>
+`THTensor *weights` - **[OPTIONAL]** class weights
+<br/>
+`THTensor *total_weight` - **[BUFFER]**
+<br/>
+```C
+void THNN_ClassNLLCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor (1D/2D)
+<br/>
+`THIndexTensor *target` - tensor containing indexes of target classes
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`bool sizeAverage` - if true, the loss will be normalized by batch size and class weights
+<br/>
+`THTensor *weights` - **[OPTIONAL]** class weights
+<br/>
+`THTensor *total_weight` - **[BUFFER]**
+<br/>
+## DistKLDivCriterion
+```C
+void THNN_DistKLDivCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - target tensor
+<br/>
+`THTensor *output` - **[OUT]** a one-element tensor containing the loss
+<br/>
+`bool sizeAverage` - if true, the loss will be normalized **by total number of elements**
+<br/>
+```C
+void THNN_DistKLDivCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - target tensor
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`bool sizeAverage` - if true, the loss will be normalized **by total number of elements**
+<br/>
+## ELU
+```C
+void THNN_ELU_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real alpha);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** ELU output
+<br/>
+`real alpha` - an ELU parameter (as in paper)
+<br/>
+```C
+void THNN_ELU_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output,
+          real alpha);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`THTensor *output` - output from a forward pass
+<br/>
+`real alpha` - an ELU parameter (as in paper)
+<br/>
+## HardShrink
+```C
+void THNN_HardShrink_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real lambda);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** output tensor
+<br/>
+`real lambda` - HardShrink parameter
+<br/>
+```C
+void THNN_HardShrink_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real lambda);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`real lambda` - HardShrink parameter
+<br/>
+## HardTanh
+```C
+void THNN_HardTanh_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real min_val,
+          real max_val);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** output tensor
+<br/>
+`real min_val` - lower threshold
+<br/>
+`real max_val` - upper threshold
+<br/>
+```C
+void THNN_HardTanh_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real min_val,
+          real max_val);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. the input
+<br/>
+`real min_val` - lower threshold
+<br/>
+`real max_val` - upper threshold
+<br/>
+## L1Cost
+```C
+void THNN_L1Cost_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** output tensor
+<br/>
+```C
+void THNN_L1Cost_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t the input
+<br/>
+## LeakyReLU
+```C
+void THNN_LeakyReLU_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real negval,
+          bool inplace);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - **[MODIFIED]** input tensor
+<br/>
+`THTensor *output` - **[OUT]** output tensor
+<br/>
+`real negval` - negative part slope
+<br/>
+`bool inplace` - if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+<br/>
+```C
+void THNN_LeakyReLU_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real negval,
+          bool inplace);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - **[MODIFIED]** gradient w.r.t. module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. the input
+<br/>
+`real negval` - negative part slope
+<br/>
+`bool inplace` - if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+<br/>
+## LogSigmoid
+```C
+void THNN_LogSigmoid_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *buffer);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - output tensor
+<br/>
+`THTensor *buffer` - **[BUFFER]**
+<br/>
+```C
+void THNN_LogSigmoid_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *buffer);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`THTensor *buffer` - **[BUFFER]**
+<br/>
+## LogSoftMax
+```C
+void THNN_LogSoftMax_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** output tensor
+<br/>
+```C
+void THNN_LogSoftMax_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. module's output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`THTensor *output` - module's output
+<br/>
+## LookupTable
+```C
+void THNN_LookupTable_accGradParameters(
+          THNNState *state,
+          THIndexTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THIntegerTensor *count,
+          THTensor *sorted,
+          THTensor *indices,
+          bool scaleGradByFreq,
+          int paddingValue,
+          real scale);
+```
+## MSECriterion
+```C
+void THNN_MSECriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+```
+```C
+void THNN_MSECriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+```
+## MarginCriterion
+```C
+void THNN_MarginCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          real margin);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - target tensor (should contain only 1s and -1s)
+<br/>
+`THTensor *output` - **[OUT]** a one-element tensor containing the loss
+<br/>
+`bool sizeAverage` - if true, the loss is normalized by **total number of elements**
+<br/>
+`real margin` - a margin that is required for the loss to be 0
+<br/>
+```C
+void THNN_MarginCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          real margin);
+```
+`THNNState *state` - library's state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *target` - target tensor (should contain only 1s and -1s)
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. module's input
+<br/>
+`bool sizeAverage` - if true, the gradient is normalized by **total number of elements**
+<br/>
+`real margin` - a margin that is required for the loss to be 0
+<br/>
+## MultiLabelMarginCriterion
+```C
+void THNN_MultiLabelMarginCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+```
+```C
+void THNN_MultiLabelMarginCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+```
+## MultiMarginCriterion
+```C
+void THNN_MultiMarginCriterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          int p,
+          THTensor* weights);
+```
+```C
+void THNN_MultiMarginCriterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          int p,
+          THTensor *weights);
+```
+## PReLU
+```C
+void THNN_PReLU_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THIndex_t nOutputPlane);
+```
+```C
+void THNN_PReLU_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THIndex_t nOutputPlane);
+```
+```C
+void THNN_PReLU_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradWeight,
+          THTensor *gradWeightBuf,
+          THTensor *gradWeightBuf2,
+          THIndex_t nOutputPlane,
+          real scale);
+```
+## RReLU
+```C
+void THNN_RReLU_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace,
+          THGenerator *generator);
+```
+```C
+void THNN_RReLU_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace);
+```
+## Sigmoid
+```C
+void THNN_Sigmoid_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+```C
+void THNN_Sigmoid_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+```
+## SmoothL1Criterion
+```C
+void THNN_SmoothL1Criterion_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+```
+```C
+void THNN_SmoothL1Criterion_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+```
+## SoftMax
+```C
+void THNN_SoftMax_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+```C
+void THNN_SoftMax_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+```
+## SoftPlus
+```C
+void THNN_SoftPlus_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real beta,
+          real threshold);
+```
+```C
+void THNN_SoftPlus_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output,
+          real beta,
+          real threshold);
+```
+## SoftShrink
+```C
+void THNN_SoftShrink_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real lambda);
+```
+```C
+void THNN_SoftShrink_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real lambda);
+```
+## SparseLinear
+```C
+void THNN_SparseLinear_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *shardBuffer);
+```
+```C
+void THNN_SparseLinear_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight);
+```
+```C
+void THNN_SparseLinear_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale);
+```
+```C
+void THNN_SparseLinear_zeroGradParameters(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput);
+```
+```C
+void THNN_SparseLinear_updateParameters(
+          THNNState *state,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput,
+          real learningRate);
+```
+## SpatialAdaptiveMaxPooling
+```C
+void THNN_SpatialAdaptiveMaxPooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int owidth, int oheight);
+```
+```C
+void THNN_SpatialAdaptiveMaxPooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices);
+```
+## SpatialAveragePooling
+```C
+void THNN_SpatialAveragePooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode,
+          bool count_include_pad);
+```
+```C
+void THNN_SpatialAveragePooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode,
+          bool count_include_pad);
+```
+## SpatialBatchNormalization
+```C
+void THNN_SpatialBatchNormalization_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *running_mean,
+          THTensor *running_var,
+          THTensor *save_mean,
+          THTensor *save_std,
+          bool train,
+          double momentum,
+          double eps);
+```
+```C
+void THNN_SpatialBatchNormalization_backward(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *save_mean,
+          THTensor *save_std,
+          double scale);
+```
+## SpatialConvolutionLocal
+```C
+void THNN_SpatialConvolutionLocal_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight);
+```
+```C
+void THNN_SpatialConvolutionLocal_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight);
+```
+```C
+void THNN_SpatialConvolutionLocal_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight,
+          real scale);
+```
+## SpatialConvolutionMM
+```C
+void THNN_SpatialConvolutionMM_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+```
+```C
+void THNN_SpatialConvolutionMM_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+```
+```C
+void THNN_SpatialConvolutionMM_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          real scale);
+```
+## SpatialConvolutionMap
+```C
+void THNN_SpatialConvolutionMap_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** convolution output
+<br/>
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *bias` - 1D bias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+```C
+void THNN_SpatialConvolutionMap_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *bias` - 1D bias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+```C
+void THNN_SpatialConvolutionMap_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH,
+          real scale);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradWeight` - 3D gradWeight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *gradBias` - 1D gradBias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+`real scale` - scaling factor
+<br/>
+## SpatialFractionalMaxPooling
+```C
+void THNN_SpatialFractionalMaxPooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int outputW, int outputH,
+          int poolSizeW, int poolSizeH,
+          THTensor *indices,
+          THTensor *randomSamples);
+```
+```C
+void THNN_SpatialFractionalMaxPooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int outputW, int outputH,
+          int poolSizeW, int poolSizeH,
+          THTensor *indices);
+```
+## SpatialFullConvolution
+```C
+void THNN_SpatialFullConvolution_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH);
+```
+```C
+void THNN_SpatialFullConvolution_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH);
+```
+```C
+void THNN_SpatialFullConvolution_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH,
+          real scale);
+```
+## SpatialFullConvolutionMap
+```C
+void THNN_SpatialFullConvolutionMap_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *output` - **[OUT]** convolution output
+<br/>
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *bias` - 1D bias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+```C
+void THNN_SpatialFullConvolutionMap_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradInput` - **[OUT]** gradient w.r.t. input
+<br/>
+`THTensor *weight` - 3D weight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *bias` - 1D bias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+```C
+void THNN_SpatialFullConvolutionMap_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH,
+          real scale);
+```
+`THNNState *state` - library state
+<br/>
+`THTensor *input` - input tensor
+<br/>
+`THTensor *gradOutput` - gradient w.r.t. output
+<br/>
+`THTensor *gradWeight` - 3D gradWeight tensor (connTable:size(1) x kH x kW)
+<br/>
+`THTensor *gradBias` - 1D gradBias tensor (nOutputPlane)
+<br/>
+`THTensor *connTable` - connection table
+<br/>
+`int nInputPlane` - number of input planes
+<br/>
+`int nOutputPlane` - number of output planes
+<br/>
+`int dW, int dH` - stride
+<br/>
+`real scale` - scaling factor
+<br/>
+## SpatialMaxPooling
+```C
+void THNN_SpatialMaxPooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode);
+```
+```C
+void THNN_SpatialMaxPooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode);
+```
+## SpatialMaxUnpooling
+```C
+void THNN_SpatialMaxUnpooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int owidth, int oheight);
+```
+```C
+void THNN_SpatialMaxUnpooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int owidth, int oheight);
+```
+## SpatialSubSampling
+```C
+void THNN_SpatialSubSampling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          int kW, int kH,
+          int dW, int dH);
+```
+```C
+void THNN_SpatialSubSampling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          int kW, int kH,
+          int dW, int dH);
+```
+```C
+void THNN_SpatialSubSampling_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          int kW, int kH,
+          int dW, int dH,
+          real scale);
+```
+## SpatialUpSamplingNearest
+```C
+void THNN_SpatialUpSamplingNearest_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int scale_factor);
+```
+```C
+void THNN_SpatialUpSamplingNearest_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int scale_factor);
+```
+## Sqrt
+```C
+void THNN_Sqrt_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real eps);
+```
+```C
+void THNN_Sqrt_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+```
+## Square
+```C
+void THNN_Square_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+```C
+void THNN_Square_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+```
+## Tanh
+```C
+void THNN_Tanh_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+```
+```C
+void THNN_Tanh_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+```
+## Threshold
+```C
+void THNN_Threshold_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real threshold,
+          real val,
+          bool inplace);
+```
+```C
+void THNN_Threshold_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real threshold,
+          bool inplace);
+```
+## VolumetricAveragePooling
+```C
+void THNN_VolumetricAveragePooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH);
+```
+```C
+void THNN_VolumetricAveragePooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH);
+```
+## VolumetricConvolution
+```C
+void THNN_VolumetricConvolution_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricConvolution_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricConvolution_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          real scale);
+```
+## VolumetricConvolutionMM
+```C
+void THNN_VolumetricConvolutionMM_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricConvolutionMM_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricConvolutionMM_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          real scale);
+```
+## VolumetricFullConvolution
+```C
+void THNN_VolumetricFullConvolution_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricFullConvolution_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricFullConvolution_accGradParameters(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          real scale);
+```
+## VolumetricMaxPooling
+```C
+void THNN_VolumetricMaxPooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          bool ceilMode);
+```
+```C
+void THNN_VolumetricMaxPooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+## VolumetricMaxUnpooling
+```C
+void THNN_VolumetricMaxUnpooling_updateOutput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int oT, int oW, int oH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
+```C
+void THNN_VolumetricMaxUnpooling_updateGradInput(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int oT, int oW, int oH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+```
diff --git a/lib/THNN/doc/generate_reference.lua b/lib/THNN/doc/generate_reference.lua
new file mode 100644
index 0000000..0f75474
--- /dev/null
+++ b/lib/THNN/doc/generate_reference.lua
@@ -0,0 +1,106 @@
+--[[
+  This script regenerates api_reference.md based on comments placed in THNN.h.
+  Run it from the doc/ directory: it reads ../generic/THNN.h and overwrites
+  api_reference.md in the current directory.
+  NOTE: string:split used below is a torch extension, not stock Lua.
+]]--
+
+local header = [[
+# API docs
+
+This document only describes a THNN API. For a thorough review of all modules present here please refer to [nn's docs](http://github.com/torch/nn/tree/master/doc).
+
+### Note on function names
+
+Please remember, that because C doesn't support function overloading, functions taking different tensor types have different names. So e.g. for an Abs module, there are actually two updateOutput functions:
+
+* `void THNN_FloatAbs_updateOutput(...)`
+* `void THNN_DoubleAbs_updateOutput(...)`
+
+In these docs such function will be referred to as `void THNN_Abs_updateOutput(...)`, and it's up to developer to add a type prefix. `real` is an alias for that type.
+
+### Argument types
+
+Some arguments have additional tags placed in square brackets:
+* **[OUT]** - This is the output argument. It will be reshaped if needed.
+* **[OPTIONAL]** - This argument is optional and can be safely set to NULL
+* **[BUFFER]** - A buffer. `updateGradInput` and `accGradParameters` should get the same buffers that were used in `updateOutput` call.
+* **[MODIFIED]** - Some functions accept an `inplace` flag. If set to true, this argument might be modified (in addition to the output).
+
+## Module list
+
+These are all modules implemented in THNN:
+
+]]
+
+-- Read the whole header file; fail loudly if it cannot be opened.
+local hfile = assert(io.open('../generic/THNN.h', 'r'))
+local lines = hfile:read('*a'):split('\n')
+hfile:close()
+
+-- Parse input: collect each TH_API declaration, grouped by module name.
+local declarations = {}
+local current_declaration
+local declaration_module
+for i,line in ipairs(lines) do
+   if line:sub(1, 6) == 'TH_API' then
+     current_declaration = ''
+     declaration_module = line:match('THNN_%((.+)_.+%)')
+   end
+
+   if current_declaration then
+      current_declaration = current_declaration .. line .. '\n'
+   end
+
+   -- A declaration ends on the line containing ');'.
+   if line:match('%);') then
+     current_declaration = current_declaration:sub(1, -2) -- remove a trailing newline
+     declarations[declaration_module] = declarations[declaration_module] or {}
+     table.insert(declarations[declaration_module], current_declaration)
+     current_declaration = nil
+     declaration_module = nil
+   end
+end
+declarations["unfolded"] = nil
+
+-- Sort modules (kept local: the original leaked `modules` as a global)
+local modules = {}
+for k,_ in pairs(declarations) do table.insert(modules, k) end
+table.sort(modules)
+
+-- Create an index
+local outfile = assert(io.open('api_reference.md', 'w'))
+outfile:write(header)
+for i, name in ipairs(modules) do
+    outfile:write(string.format('* [%s](#%s)\n', name, name:lower()))
+end
+outfile:write('\n')
+
+-- Write proper docs
+for i,name in ipairs(modules) do
+    outfile:write('## ' .. name ..'\n')
+
+    for i,declaration in ipairs(declarations[name]) do
+
+        -- Write source code
+        outfile:write('```C' .. '\n')
+        local declaration_lines = declaration:split('\n')
+        for i, line in ipairs(declaration_lines) do
+            if i == 1 then
+                line = line:gsub('TH_API ', ''):gsub('%(', ''):gsub('%)', '') .. '(' -- remove macro junk
+            else
+                line = line:gsub('%s*//.*$', '') -- remove the comment
+            end
+            outfile:write(line .. '\n')
+        end
+        outfile:write('```' .. '\n')
+
+        -- Describe arguments
+        table.remove(declaration_lines, 1)
+        for i,line in ipairs(declaration_lines) do
+            local param, comment = line:match('^%s*(.*),%s*// (.*)$')
+            if param == nil then param, comment = line:match('^%s*(.*)%);%s*// (.*)$') end
+
+            if param ~= nil then
+                comment = comment:gsub('%[', '%*%*%['):gsub('%]', '%]%*%*') -- use bold font for tags
+                outfile:write(string.format('`%s` - %s\n<br/>\n', param, comment))
+            end
+        end
+    end
+end
+outfile:close()
diff --git a/lib/THNN/doc/style_guidelines.md b/lib/THNN/doc/style_guidelines.md
new file mode 100644
index 0000000..a725454
--- /dev/null
+++ b/lib/THNN/doc/style_guidelines.md
@@ -0,0 +1,59 @@
+## API design guidelines
+
+Functions should return `void`.
+
+All functions should accept arguments in the following order. `...` represents any module-specific parameters or buffers, regardless of whether they are used for writing or reading. Arguments in `...` below should be ordered like this:
+```
+[weight], [bias], [any buffers], [additional arguments], [optional arguments]
+```
+
+### Modules
+```
+updateOutput: state, input, output, ...
+updateGradInput: state, input, gradOutput, gradInput, ...
+accGradParameters: state, input, gradOutput, [gradWeight], [gradBias], ...
+```
+
+e.g.
+```C
+void THNN_(HardShrink_updateGradInput)(
+          THNNState* state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real lambda)
+```
+
+### Criterions
+```
+updateOutput: state, input, target, output, ...
+updateGradInput: state, input, target, gradInput, ...
+```
+
+e.g.
+
+```C
+void THNN_(ClassNLLCriterion_updateOutput)(
+          THNNState* state,
+          THTensor *input,
+          THLongTensor *target,
+          THTensor *output,
+          THTensor *weights,
+          THTensor *total_weight,
+          bool sizeAverage)
+```
+
+## Code style guide
+
+```C
+void THNN_Linear_updateOutput(
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias);
+//<- 10 ->
+```
+
+All arguments should start on a new line after the function name, and they should be indented using 10 spaces.
+
+Use 2 spaces for block indentation.
diff --git a/lib/THNN/generic/Abs.c b/lib/THNN/generic/Abs.c
new file mode 100644
index 0000000..c5e36ff
--- /dev/null
+++ b/lib/THNN/generic/Abs.c
@@ -0,0 +1,27 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Abs.c"
+#else
+
+// Abs forward: output = |input|, element-wise.
+// output is resized to the shape of input.
+void THNN_(Abs_updateOutput)(
+          THNNState *state,    // library state (unused here)
+          THTensor *input,     // input tensor
+          THTensor *output)    // [OUT] result tensor
+{
+  THTensor_(resizeAs)(output, input);
+  THTensor_(abs)(output, input);
+}
+
+// Abs backward: gradInput = gradOutput * sign(input).
+// sign(0) is taken as +1 here (the `z >= 0` branch).
+void THNN_(Abs_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput)   // [OUT] resized to input's shape
+{
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+    real z = *input_data;
+    *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1);
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/AbsCriterion.c b/lib/THNN/generic/AbsCriterion.c
new file mode 100644
index 0000000..e87bb5b
--- /dev/null
+++ b/lib/THNN/generic/AbsCriterion.c
@@ -0,0 +1,39 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/AbsCriterion.c"
+#else
+
+// L1 (absolute error) criterion forward:
+//   output[0] = sum_i |input_i - target_i|,
+// divided by the number of elements when sizeAverage is true.
+void THNN_(AbsCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,      // [OUT] 1-element tensor holding the loss
+          bool sizeAverage)
+{
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    sum += fabs(*input_data - *target_data);
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+// L1 criterion backward: gradInput_i = norm * sign(input_i - target_i),
+// with norm = 1/nElement when sizeAverage, else 1. A zero difference falls
+// into the `>= 0` branch and yields +norm. No upstream gradOutput is taken;
+// the gradient is w.r.t. the scalar loss directly.
+void THNN_(AbsCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,   // [OUT] resized to input's shape
+          bool sizeAverage)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/BatchNormalization.c b/lib/THNN/generic/BatchNormalization.c
new file mode 100644
index 0000000..bf36d30
--- /dev/null
+++ b/lib/THNN/generic/BatchNormalization.c
@@ -0,0 +1,144 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BatchNormalization.c"
+#else
+
+// Batch normalization forward over feature dimension 1 (one feature plane
+// per iteration, parallelized with OpenMP):
+//   train=true : compute the batch mean and inverse std, save them in
+//                save_mean/save_std, and blend them into running_mean /
+//                running_var using `momentum` (running_var gets the
+//                unbiased estimate, sum/(n-1));
+//   train=false: use running_mean/running_var (invstd = 1/sqrt(var + eps)).
+// Then: output = (input - mean) * invstd * weight + bias, where weight and
+// bias are [OPTIONAL] per-feature tensors (defaulting to 1 and 0).
+// NOTE: save_std actually stores the *inverse* std (invstd).
+void THNN_(BatchNormalization_updateOutput)(
+  THNNState *state, THTensor *input, THTensor *output,
+  THTensor *weight, THTensor *bias,
+  THTensor *running_mean, THTensor *running_var,
+  THTensor *save_mean, THTensor *save_std,
+  bool train, double momentum, double eps)
+{
+  long nInput = THTensor_(size)(input, 1);
+  long f,n = THTensor_(nElement)(input) / nInput;
+
+  #pragma omp parallel for
+  for (f = 0; f < nInput; ++f) {
+    THTensor *in = THTensor_(newSelect)(input, 1, f);
+    THTensor *out = THTensor_(newSelect)(output, 1, f);
+
+    real mean, invstd;
+
+    if (train) {
+      // compute mean per input
+      accreal sum = 0;
+      TH_TENSOR_APPLY(real, in, sum += *in_data;);
+
+      mean = (real) sum / n;
+      THTensor_(set1d)(save_mean, f, (real) mean);
+
+      // compute variance per input
+      sum = 0;
+      TH_TENSOR_APPLY(real, in,
+        sum += (*in_data - mean) * (*in_data - mean););
+
+      if (sum == 0 && eps == 0.0) {
+        // degenerate case: constant input and no smoothing -> invstd = 0
+        invstd = 0;
+      } else {
+        invstd = (real) (1 / sqrt(sum/n + eps));
+      }
+      THTensor_(set1d)(save_std, f, (real) invstd);
+
+      // update running averages
+      THTensor_(set1d)(running_mean, f,
+        (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
+
+      accreal unbiased_var = sum / (n - 1);
+      THTensor_(set1d)(running_var, f,
+        (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
+    } else {
+      mean = THTensor_(get1d)(running_mean, f);
+      invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+    }
+
+    // compute output
+    real w = weight ? THTensor_(get1d)(weight, f) : 1;
+    real b = bias ? THTensor_(get1d)(bias, f) : 0;
+
+    TH_TENSOR_APPLY2(real, in, real, out,
+      *out_data = (real) (((*in_data - mean) * invstd) * w + b););
+
+    THTensor_(free)(out);
+    THTensor_(free)(in);
+  }
+}
+
+// Batch normalization backward. Each of the following is [OPTIONAL] (NULL
+// skips that computation):
+//   gradInput  - gradient w.r.t. the input
+//   gradWeight - gradient w.r.t. weight, accumulated scaled by `scale`
+//   gradBias   - gradient w.r.t. bias, accumulated scaled by `scale`
+// In training mode the saved batch statistics are used (save_mean and
+// save_std, where save_std holds the *inverse* std); in evaluation mode the
+// running statistics are used instead.
+void THNN_(BatchNormalization_backward)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
+  THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
+  THTensor *running_mean, THTensor *running_var,
+  THTensor *save_mean, THTensor *save_std,
+  bool train, double scale, double eps)
+{
+  long nInput = THTensor_(size)(input, 1);
+  long f,n = THTensor_(nElement)(input) / nInput;
+
+  #pragma omp parallel for
+  for (f = 0; f < nInput; ++f) {
+    THTensor *in = THTensor_(newSelect)(input, 1, f);
+    THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
+    real w = weight ? THTensor_(get1d)(weight, f) : 1;
+    real mean, invstd;
+    if (train) {
+      mean = THTensor_(get1d)(save_mean, f);
+      invstd = THTensor_(get1d)(save_std, f);
+    } else {
+      mean = THTensor_(get1d)(running_mean, f);
+      invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+    }
+
+    // sum over all gradOutput in feature plane
+    accreal sum = 0;
+    TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
+
+    // dot product of Q(X) and gradOutput
+    accreal dotp = 0;
+    TH_TENSOR_APPLY2(real, in, real, gradOut,
+      dotp += (*in_data - mean) * (*gradOut_data););
+
+    if (gradInput) {
+      THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
+
+      if (train) {
+        // when in training mode
+        // Q(X) = X - E[x] ; i.e. input centered to zero mean
+        // Y = Q(X) / σ    ; i.e. BN output before weight and bias
+        // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+        // projection of gradOutput on to output scaled by std
+        real k = (real) dotp * invstd * invstd / n;
+        TH_TENSOR_APPLY2(real, gradIn, real, in,
+          *gradIn_data = (*in_data - mean) * k;);
+
+        accreal gradMean = sum / n;
+        TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+          *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+      } else {
+        // when in evaluation mode
+        // Q(X) = X - running_mean  ; i.e. input centered to zero mean
+        // Y = Q(X) / running_std    ; i.e. BN output before weight and bias
+        // dL/dX = dL/dY * w / running_std
+        TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+          *gradIn_data = *gradOut_data * invstd * w;);
+      }
+
+      THTensor_(free)(gradIn);
+    }
+
+    if (gradWeight) {
+      real val = THTensor_(get1d)(gradWeight, f);
+      THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
+    }
+
+    if (gradBias) {
+      real val = THTensor_(get1d)(gradBias, f);
+      THTensor_(set1d)(gradBias, f, val + scale * sum);
+    }
+
+    THTensor_(free)(gradOut);
+    THTensor_(free)(in);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/ClassNLLCriterion.c b/lib/THNN/generic/ClassNLLCriterion.c
new file mode 100644
index 0000000..eb02f7c
--- /dev/null
+++ b/lib/THNN/generic/ClassNLLCriterion.c
@@ -0,0 +1,140 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
+#else
+
+// Negative log-likelihood criterion forward for 1D (single sample) or 2D
+// (batch) log-probability inputs. Targets are 1-based class indices.
+//   output[0]       <- (weighted) sum of -input[target] over the batch,
+//                      divided by total_weight[0] when sizeAverage is true.
+//   total_weight[0] <- [BUFFER] sum of the per-sample class weights (1 per
+//                      sample when `weights` is NULL); consumed later by
+//                      updateGradInput.
+// `weights` is [OPTIONAL] per-class rescaling weights.
+void THNN_(ClassNLLCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight)
+{
+  int n_dims = THTensor_(nDimension)(input);
+  int n_classes = THTensor_(size)(input, n_dims - 1);
+
+  if (THIndexTensor_(nDimension)(target) > 1) {
+    THError("multi-target not supported");
+  }
+  if (THTensor_(nDimension)(input) > 2) {
+    THError("input tensor should be 1D or 2D");
+  }
+
+  input = THTensor_(newContiguous)(input);
+  target = THIndexTensor_(newContiguous)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+  real *input_data = THTensor_(data)(input);
+  THIndex_t *target_data = THIndexTensor_(data)(target);
+  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+  real *output_data = THTensor_(data)(output);
+  real *total_weight_data = THTensor_(data)(total_weight);
+
+  output_data[0] = total_weight_data[0] = 0.0;
+
+  if (THTensor_(nDimension)(input) == 1) {
+    int cur_target = target_data[0] - 1;  // convert 1-based target to 0-based index
+    THAssert(cur_target >= 0 && cur_target < n_classes);
+    total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
+    output_data[0] = -input_data[cur_target] * total_weight_data[0];
+  } else if (THTensor_(nDimension)(input) == 2) {
+    int batch_size = THTensor_(size)(input, 0);
+    THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+    int n_target = THTensor_(size)(input, 1);
+
+    int i;
+    for (i = 0; i < batch_size; i++) {
+      int cur_target = target_data[i] - 1;  // 1-based -> 0-based
+      THAssert(cur_target >= 0 && cur_target < n_classes);
+
+      real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+      total_weight_data[0] += cur_weight;
+      output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+    }
+  }
+
+  if (sizeAverage && total_weight_data[0]) {
+    output_data[0] /= total_weight_data[0];
+  }
+
+  if (weights) {
+    THTensor_(free)(weights);
+  }
+  THTensor_(free)(input);
+  THIndexTensor_(free)(target);
+}
+
+// Negative log-likelihood criterion backward. Writes -weight[class] (or -1)
+// into gradInput at each sample's target index, divided by total_weight[0]
+// (the [BUFFER] filled by updateOutput) when sizeAverage is true. Returns
+// early, leaving gradInput untouched, when total_weight[0] <= 0.
+// NOTE(review): gradInput is neither resized nor zeroed here — the caller is
+// expected to supply a contiguous, zero-filled tensor of input's shape.
+void THNN_(ClassNLLCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight)
+{
+  int n_dims = THTensor_(nDimension)(input);
+  int n_classes = THTensor_(size)(input, n_dims - 1);
+
+  if (!THTensor_(isContiguous)(gradInput)) {
+    THError("gradInput must be contiguous");
+  }
+
+  real *total_weight_data = THTensor_(data)(total_weight);
+
+  if (!(*total_weight_data > 0)) {
+    return;
+  }
+
+  if (THIndexTensor_(nDimension)(target) > 1) {
+    THError("multi-target not supported");
+  }
+
+  if (THTensor_(nDimension)(input) > 2) {
+    THError("input tensor should be 1D or 2D");
+  }
+
+  target = THIndexTensor_(newContiguous)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+  THIndex_t *target_data = THIndexTensor_(data)(target);
+  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+  real *gradInput_data = THTensor_(data)(gradInput);
+
+  if (THTensor_(nDimension)(input) == 1) {
+    int cur_target = target_data[0] - 1;  // 1-based -> 0-based
+    THAssert(cur_target >= 0 && cur_target < n_classes);
+
+    // with sizeAverage the forward loss was already divided by the sample's
+    // weight, so the gradient reduces to -1 in that case
+    gradInput_data[cur_target] =
+      (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+
+  } else if (THTensor_(nDimension)(input) == 2) {
+    int batch_size = THTensor_(size)(input, 0);
+    THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+    int n_target = THTensor_(size)(input, 1);
+
+    int i;
+    for (i = 0; i < batch_size; i++){
+      int cur_target = target_data[i] - 1;  // 1-based -> 0-based
+
+      THAssert(cur_target >= 0 && cur_target < n_classes);
+
+      gradInput_data[i * n_target + cur_target] =
+        -(weights ? weights_data[cur_target] : 1.0f);
+
+      if (sizeAverage && *total_weight_data) {
+        gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+      }
+    }
+  }
+
+  THIndexTensor_(free)(target);
+  if (weights) {
+    THTensor_(free)(weights);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/DistKLDivCriterion.c b/lib/THNN/generic/DistKLDivCriterion.c
new file mode 100644
index 0000000..507324d
--- /dev/null
+++ b/lib/THNN/generic/DistKLDivCriterion.c
@@ -0,0 +1,39 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
+#else
+
+// KL-divergence criterion forward (input holds log-probabilities):
+//   output[0] = sum_i t_i * (log(t_i) - x_i)  over elements with t_i > 0,
+// divided by the number of elements when sizeAverage is true.
+void THNN_(DistKLDivCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,      // [OUT] 1-element tensor holding the loss
+          bool sizeAverage)
+{
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+// KL-divergence criterion backward: gradInput_i = -norm * t_i where t_i > 0,
+// and 0 elsewhere; norm = 1/nElement when sizeAverage, else 1.
+void THNN_(DistKLDivCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,   // [OUT] resized to input's shape
+          bool sizeAverage)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/ELU.c b/lib/THNN/generic/ELU.c
new file mode 100644
index 0000000..8303de0
--- /dev/null
+++ b/lib/THNN/generic/ELU.c
@@ -0,0 +1,51 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ELU.c"
+#else
+
+// ELU forward: f(x) = x for x > 0, alpha * (exp(x) - 1) otherwise.
+// When inplace is true, input is [MODIFIED] in place and output is made to
+// share its storage via THTensor_(set).
+void THNN_(ELU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real alpha,
+          bool inplace)
+{
+  if(inplace) {
+    TH_TENSOR_APPLY(real, input,
+      if(*input_data <= 0) {
+        *input_data = (exp(*input_data) - 1) * alpha;
+      }
+    );
+    THTensor_(set)(output, input);
+  } else {
+    THTensor_(resizeAs)(output, input);
+    TH_TENSOR_APPLY2(real, input, real, output,
+      *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data;
+    );
+  }
+}
+
+// ELU backward, computed from the saved forward `output`:
+//   dL/dx = dL/dy                    where output > 0
+//   dL/dx = dL/dy * (output + alpha) otherwise (equals dL/dy * alpha*exp(x))
+// When inplace is true, gradOutput is [MODIFIED] in place and gradInput is
+// made to share its storage.
+void THNN_(ELU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output,
+          real alpha,
+          bool inplace)
+{
+  if(inplace) {
+    TH_TENSOR_APPLY2(real, gradOutput, real, output,
+      if(*output_data <= 0) {
+        *gradOutput_data *= *output_data + alpha;
+      }
+    );
+    THTensor_(set)(gradInput, gradOutput);
+  } else {
+    THTensor_(resizeAs)(gradInput, output);
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+      *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data;
+    );
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/HardShrink.c b/lib/THNN/generic/HardShrink.c
new file mode 100644
index 0000000..689f565
--- /dev/null
+++ b/lib/THNN/generic/HardShrink.c
@@ -0,0 +1,39 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardShrink.c"
+#else
+
+// HardShrink forward: identity outside [-lambda, lambda], zero inside
+// (boundary values included in the zeroed region).
+void THNN_(HardShrink_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,      // [OUT] resized to input's shape
+          real lambda)
+{
+  THTensor_(resizeAs)(output, input);
+
+  TH_TENSOR_APPLY2(real, output, real, input,
+    if (*input_data > lambda)
+      *output_data = *input_data;
+    else if (*input_data < -lambda)
+      *output_data = *input_data;
+    else
+      *output_data = 0;
+  );
+}
+
+// HardShrink backward: the gradient passes through where |input| > lambda
+// and is zeroed inside [-lambda, lambda], matching the forward pass.
+void THNN_(HardShrink_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,   // [OUT] resized to input's shape
+          real lambda)
+{
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+    if (*input_data > lambda || *input_data < -lambda)
+      *gradInput_data = *gradOutput_data;
+    else
+      *gradInput_data = 0;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
new file mode 100644
index 0000000..9764ec0
--- /dev/null
+++ b/lib/THNN/generic/HardTanh.c
@@ -0,0 +1,84 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardTanh.c"
+#else
+
+// HardTanh forward: clamp each element of input into [min_val, max_val].
+// Contiguous multi-dimensional tensors take a flat OpenMP loop; otherwise
+// the generic strided TH_TENSOR_APPLY2 path is used.
+void THNN_(HardTanh_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,      // [OUT] resized to input's shape
+          real min_val,
+          real max_val)
+{
+  THTensor_(resizeAs)(output, input);
+  
+  if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+  {
+    TH_TENSOR_APPLY2(real, output, real, input,
+      if (*input_data < min_val)
+        *output_data = min_val;
+      else if (*input_data <= max_val)
+        *output_data = *input_data;
+      else
+        *output_data = max_val;
+    );
+  }
+  else
+  {
+    // fast path: raw pointer loop over contiguous data
+    real* ptr_output = THTensor_(data)(output);
+    real* ptr_input  = THTensor_(data)(input);
+    long i;
+
+#pragma omp parallel for private(i)
+    for (i = 0; i < THTensor_(nElement)(input); i++)
+    {
+      if (ptr_input[i] < min_val)
+        ptr_output[i] = min_val;
+      else if (ptr_input[i] <= max_val)
+        ptr_output[i] = ptr_input[i];
+      else
+        ptr_output[i] = max_val;
+    }
+  }
+}
+
+// HardTanh backward: the gradient passes through where
+// min_val <= input <= max_val and is zero where the forward pass clamped.
+// Contiguous tensors take the flat OpenMP fast path.
+void THNN_(HardTanh_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,   // [OUT] resized to input's shape
+          real min_val,
+          real max_val)
+{
+  THTensor_(resizeAs)(gradInput, input);
+
+  if (input->nDimension == 1 ||
+    !THTensor_(isContiguous)(input) ||
+    !THTensor_(isContiguous)(gradOutput) ||
+    !THTensor_(isContiguous)(gradInput))
+  {
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+      if (*input_data < min_val || *input_data > max_val)
+        *gradInput_data = 0;
+      else
+        *gradInput_data = *gradOutput_data;
+    );
+  }
+  else
+  {
+    // fast path: raw pointer loop over contiguous data
+    real* ptr_gradOutput = THTensor_(data)(gradOutput);
+    real* ptr_gradInput  = THTensor_(data)(gradInput);
+    real* ptr_input      = THTensor_(data)(input);
+    long i;
+
+#pragma omp parallel for private(i)
+    for (i = 0; i < THTensor_(nElement)(input); i++)
+    {
+      if (ptr_input[i] < min_val || ptr_input[i] > max_val)
+        ptr_gradInput[i] = 0;
+      else
+        ptr_gradInput[i] = ptr_gradOutput[i];
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/L1Cost.c b/lib/THNN/generic/L1Cost.c
new file mode 100644
index 0000000..86f69a6
--- /dev/null
+++ b/lib/THNN/generic/L1Cost.c
@@ -0,0 +1,36 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/L1Cost.c"
+#else
+
+/* L1 cost: writes sum(|input|) into output[0]. The accumulator is accreal
+ * (double for the float instantiation) to limit rounding error on large
+ * tensors; set1d narrows back to real on store. */
+void THNN_(L1Cost_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output)
+{
+  accreal sum = 0;
+
+  TH_TENSOR_APPLY(real, input, 
+    sum += fabs(*input_data);
+  );
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+/* Backward pass of the L1 cost: gradInput = sign(input), elementwise
+ * (+1 / -1 / 0). gradOutput is accepted for interface uniformity but the
+ * body does not read it. */
+void THNN_(L1Cost_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput)
+{
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY2(real, gradInput, real, input,
+    if (*input_data > 0)
+      *gradInput_data = 1;
+    else if (*input_data < 0)
+      *gradInput_data = -1;
+    else
+      *gradInput_data = 0;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/LeakyReLU.c b/lib/THNN/generic/LeakyReLU.c
new file mode 100644
index 0000000..5276989
--- /dev/null
+++ b/lib/THNN/generic/LeakyReLU.c
@@ -0,0 +1,54 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LeakyReLU.c"
+#else
+
+/* LeakyReLU forward: output = input for positive entries, negval * input
+ * otherwise. With inplace=true the input tensor itself is overwritten and
+ * output is made an alias of it via THTensor_(set). */
+void THNN_(LeakyReLU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real negval,
+          bool inplace)
+{
+  if (inplace)
+  {
+    TH_TENSOR_APPLY(real, input,
+      if (*input_data <= 0)
+        *input_data *= negval;
+    );
+    THTensor_(set)(output, input);
+  }
+  else
+  {
+    THTensor_(resizeAs)(output, input);
+    TH_TENSOR_APPLY2(real, output, real, input,
+      *output_data = *input_data > 0 ? *input_data : *input_data * negval;
+    );
+  }
+}
+
+/* LeakyReLU backward: scales gradOutput by negval where input <= 0.
+ * NOTE(review): the inplace branch mutates gradOutput directly and then
+ * aliases gradInput to it — callers of the inplace mode must not reuse
+ * gradOutput afterwards. */
+void THNN_(LeakyReLU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real negval,
+          bool inplace)
+{
+  if (inplace)
+  {
+    TH_TENSOR_APPLY2(real, gradOutput, real, input,
+      if (*input_data <= 0)
+        *gradOutput_data *= negval;
+    );
+    THTensor_(set)(gradInput, gradOutput);
+  }
+  else
+  {
+    THTensor_(resizeAs)(gradInput, input);
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+      *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval;
+    );
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/LogSigmoid.c b/lib/THNN/generic/LogSigmoid.c
new file mode 100644
index 0000000..20932f1
--- /dev/null
+++ b/lib/THNN/generic/LogSigmoid.c
@@ -0,0 +1,35 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSigmoid.c"
+#else
+
+/* LogSigmoid forward: output = -log(1 + exp(-input)) = log(sigmoid(input)).
+ * buffer caches z = exp(-input) per element so the backward pass can reuse
+ * it without recomputing the exponential. */
+void THNN_(LogSigmoid_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *buffer)
+{
+  THTensor_(resizeAs)(output, input);
+  THTensor_(resizeAs)(buffer, input);
+
+  TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
+    real z = exp(-*input_data);
+    *buffer_data = z;
+    *output_data = -log(1. + z);
+  );
+}
+
+/* LogSigmoid backward: with z = exp(-input) saved in buffer by the forward
+ * pass, d/dx log(sigmoid(x)) = z / (1 + z), so
+ * gradInput = gradOutput * z / (1 + z). Requires updateOutput to have run
+ * first so that buffer is populated and correctly sized. */
+void THNN_(LogSigmoid_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *buffer)
+{
+  THTensor_(resizeAs)(gradInput, buffer);
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
+    real z = *buffer_data;
+    *gradInput_data = *gradOutput_data * z / (1. + z);
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/LogSoftMax.c b/lib/THNN/generic/LogSoftMax.c
new file mode 100644
index 0000000..3160d8a
--- /dev/null
+++ b/lib/THNN/generic/LogSoftMax.c
@@ -0,0 +1,110 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSoftMax.c"
+#else
+
+/* LogSoftMax forward over the last dimension of a vector (dim) or matrix
+ * (nframe x dim). Uses the standard max-subtraction trick:
+ *   logsum = max + log(sum exp(x - max)); output = x - logsum,
+ * which avoids overflow in exp for large inputs. Frames are processed in
+ * parallel with OpenMP; per-frame scratch variables are declared at function
+ * scope and listed in the pragma's private clause (C89-style). */
+void THNN_(LogSoftMax_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output)
+{
+  real *input_data, *output_data;
+  long nframe = 0, dim = 0;
+  long t, d;
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+  }
+  else if (input->nDimension == 2)
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+  }
+  else
+  {
+    THArgCheck(0, 2, "vector or matrix expected");
+  }
+
+  /* newContiguous may copy; the copy is released at the end. */
+  input = THTensor_(newContiguous)(input);
+  THTensor_(resizeAs)(output, input);
+
+  real *input_data0 = THTensor_(data)(input);
+  real *output_data0 = THTensor_(data)(output);
+
+  accreal logsum;
+  real maxInput;
+  #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
+  for (t = 0; t < nframe; t++)
+  {
+    logsum = 0;
+    maxInput = -THInf;
+    input_data = input_data0 + dim*t;
+    output_data = output_data0 + dim*t;
+
+    for (d = 0; d < dim; d++)
+      maxInput = THMax(maxInput, input_data[d]);
+
+    for (d = 0; d < dim; d++)
+      logsum += exp(input_data[d] - maxInput);
+    logsum = maxInput + log(logsum);
+
+    for (d = 0; d < dim; d++)
+      output_data[d] = input_data[d] - logsum;
+  }
+
+  THTensor_(free)(input);
+}
+
+/* LogSoftMax backward: per frame,
+ *   gradInput = gradOutput - exp(output) * sum(gradOutput),
+ * which is the Jacobian-vector product of log-softmax. Shapes are taken
+ * from output (the forward result), which must be vector or matrix.
+ * NOTE(review): gradOutput is made contiguous here, but output is read
+ * through raw data pointers without a contiguity check — presumably it is
+ * always the contiguous tensor produced by updateOutput; verify callers. */
+void THNN_(LogSoftMax_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output)
+{
+
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  real *gradInput_data, *gradOutput_data, *output_data;
+  long nframe = 0, dim = 0;
+  long t, d;
+
+  if (output->nDimension == 1)
+  {
+    nframe = 1;
+    dim = output->size[0];
+  }
+  else if (output->nDimension == 2)
+  {
+    nframe = output->size[0];
+    dim = output->size[1];
+  }
+  else
+  {
+    THError("vector or matrix expected");
+  }
+
+  THTensor_(resizeAs)(gradInput, output);
+  real *gradInput_data0 = THTensor_(data)(gradInput);
+  real *output_data0 = THTensor_(data)(output);
+  real *gradOutput_data0 = THTensor_(data)(gradOutput);
+  accreal sum;
+  #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
+  for (t = 0; t < nframe; t++)
+  {
+    sum = 0;
+    gradInput_data = gradInput_data0 + dim*t;
+    output_data = output_data0 + dim*t;
+    gradOutput_data = gradOutput_data0 + dim*t;
+
+    for (d = 0; d < dim; d++)
+      sum += gradOutput_data[d];
+
+    for (d = 0; d < dim; d++)
+      gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum;
+  }
+
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/LookupTable.c b/lib/THNN/generic/LookupTable.c
new file mode 100644
index 0000000..a35ff84
--- /dev/null
+++ b/lib/THNN/generic/LookupTable.c
@@ -0,0 +1,213 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LookupTable.c"
+#else
+
+/* Recomputes per-row occurrence counts for the 1-based indices in input.
+ * First pass zeroes only the counters that will be touched (cheaper than
+ * clearing the whole count buffer); second pass tallies occurrences.
+ * count_data must have room for the largest index referenced. */
+static void THNN_(LookupTable_resetCount)(
+          THInteger_t *count_data,
+          THIndexTensor *input)
+{
+  int i;
+  THIndex_t *input_data = THIndexTensor_(data)(input);
+  long numel = THIndexTensor_(nElement)(input);
+
+  for (i = 0; i<numel; i++)
+  {
+    long k = input_data[i] - 1;
+    count_data[k] = 0;
+  }
+  for (i = 0; i<numel; i++)
+  {
+    long k = input_data[i] - 1;
+    count_data[k]++;
+  }
+}
+
+/* Accumulates gradWeight[k] += scale * gradOutput[i] for every occurrence
+ * of 1-based index k+1 in input, via BLAS axpy per row. Rows equal to
+ * paddingValue are skipped. When scaleGradByFreq is set, each update is
+ * divided by the index's occurrence count so repeated indices contribute
+ * their average rather than their sum. The sorted/indices arguments are
+ * unused here (kept for interface parity with the CUDA implementation —
+ * TODO confirm). */
+void THNN_(LookupTable_accGradParameters)(
+          THNNState *state,
+          THIndexTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THIntegerTensor *count,
+          THTensor *sorted,
+          THTensor *indices,
+          bool scaleGradByFreq,
+          int paddingValue,
+          real scale)
+{
+  long i;
+  THInteger_t *count_data = NULL;
+
+  if (scaleGradByFreq)
+  {
+    THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
+    count_data = THIntegerTensor_(data)(count);
+  }
+
+  if (!THTensor_(isContiguous)(gradWeight))
+    THError("gradWeight must be contiguous");
+  if (!THIndexTensor_(isContiguous)(input))
+    THError("input must be contiguous");
+  if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)
+    THError("input must be a vector or matrix");
+
+  THIndex_t *input_data = THIndexTensor_(data)(input);
+  long numel = THIndexTensor_(nElement)(input);
+  long numw = THTensor_(size)(gradWeight, 0);
+
+  // check that inputs are all within range
+  for (i=0; i<numel; i++)
+    if (input_data[i] < 1 || input_data[i] > numw)
+      THError("input out of range");
+
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  real *gw = THTensor_(data)(gradWeight);
+  real *go = THTensor_(data)(gradOutput);
+  long stride = THTensor_(stride)(gradWeight, 0);
+
+  if (count_data)
+    THNN_(LookupTable_resetCount)(count_data, input);
+
+#ifdef _OPENMP
+  if (numel > 1000)
+  {
+    // The strategy is to parallelize over sections of the vocabulary, so that
+    // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
+    // has to traverse the entire input, but the dominating factor is the axpy
+    // BLAS call.
+    #pragma omp parallel private(i)
+    {
+      int tid = omp_get_thread_num();
+      int nthreads = omp_get_num_threads();
+
+      long start = tid * (numw/nthreads + 1);
+      long end = start + (numw/nthreads + 1);
+      for (i=0; i<numel; i++)
+      {
+        if (input_data[i] != paddingValue)
+        {
+            long k = input_data[i] - 1;
+            /* Each thread only touches its own slice [start, end) of the
+             * vocabulary, so no two threads write the same row. */
+            if (k >= start && k < end)
+            {
+                real scale_ = scale;
+                if (count_data) scale_ /= count_data[k];
+                THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+            }
+        }
+      }
+    }
+
+    THTensor_(free)(gradOutput);
+    return;
+  }
+#endif
+
+  /* Serial fallback (no OpenMP, or input too small to amortize threads). */
+  for (i=0; i<numel; i++)
+  {
+    if (input_data[i] != paddingValue)
+    {
+        long k = input_data[i] - 1;
+        real scale_ = scale;
+        if (count_data) scale_ /= count_data[k];
+        THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+     }
+  }
+
+  THTensor_(free)(gradOutput);
+}
+
+/*
+ * Keep the norm of weight smaller than maxNorm
+ */
+
+/* Rescales one embedding row in place so its p-norm (normType) does not
+ * exceed maxNorm. L1 and L2 get fast special cases; other norms use pow.
+ * The 1e-7 in the denominator keeps the scale strictly below 1 at the
+ * boundary and guards against division issues for tiny norms. */
+static void THNN_(LookupTable_renormRow)(
+          real *row_data,
+          long stride,
+          real maxNorm,
+          real normType)
+{
+  real norm = 0;
+  real new_norm;
+  long j;
+  for (j=0; j<stride; j++)
+  {
+    if (normType == 1) {
+      norm += fabs(row_data[j]);
+    } else if (normType == 2) {
+      norm += row_data[j] * row_data[j];
+    } else {
+      norm += pow(fabs(row_data[j]), normType);
+    }
+  }
+  norm = pow(norm, 1.0 / normType);
+  if (norm > maxNorm)
+  {
+    new_norm = maxNorm / (norm + 1e-7);
+    for (j=0; j<stride; j++) {
+      row_data[j] *= new_norm;
+    }
+  }
+}
+
+/* qsort comparator ordering THIndex_t values ascending.
+ * Returns negative/zero/positive per the qsort contract. The previous
+ * version ("a < b ? -1 : 1") reported compare(x, x) == 1, i.e. an
+ * inconsistent ordering; the C standard requires a consistent total
+ * order, otherwise qsort's behavior is unspecified. */
+static int THNN_(compare_THIndex)(const void* a, const void* b)
+{
+   THIndex_t va = *(const THIndex_t*)a;
+   THIndex_t vb = *(const THIndex_t*)b;
+   return (va < vb) ? -1 : ((va > vb) ? 1 : 0);
+}
+
+/* Renorms every weight row referenced by idx so its normType-norm stays
+ * within maxNorm. NOTE(review): this mutates idx in place — it is sorted
+ * and deduplicated below so each row is renormed exactly once; callers
+ * must not rely on idx's original contents afterwards. */
+void THNN_(LookupTable_renorm)(
+          THNNState *state,
+          THIndexTensor *idx,
+          THTensor *weight,
+          real maxNorm,
+          real normType)
+{
+  if (!THTensor_(isContiguous)(weight))
+    THError("weight must be contiguous");
+  if (!THIndexTensor_(isContiguous)(idx))
+    THError("input must be contiguous");
+  if (THIndexTensor_(nDimension)(idx) != 1)
+    THError("idx must be a vector");
+  if (normType <= 0)
+    THError("non-positive-norm not supported");
+
+  long i;
+  THIndex_t *row_idx = THIndexTensor_(data)(idx);
+  long numel = THIndexTensor_(nElement)(idx);
+
+  long numw = THTensor_(size)(weight, 0);
+  long stride = THTensor_(stride)(weight, 0);
+  real *gw = THTensor_(data)(weight);
+  /* Validate all 1-based indices before touching any row. */
+  for (i=0; i<numel; i++)
+    if (row_idx[i] < 1 || row_idx[i] > numw)
+      THError("input out of range");
+  // get unique indices
+  qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
+  long ptr = 0;
+  for (i=0; i<numel; i++)
+    if (i == 0 || row_idx[i] != row_idx[i-1])
+      row_idx[ptr++] = row_idx[i];
+  numel = ptr;
+
+#ifdef _OPENMP
+  if (numel > 1000)
+  {
+    // The strategy is to parallelize over the rows that appear in
+    // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
+    // This distributes the work evenly to each thread.
+    #pragma omp parallel for private(i)
+    for (i=0; i<numel; i++)
+    {
+      long k = row_idx[i] - 1;
+      THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+    }
+    return;
+  }
+#endif
+  /* Serial fallback for small inputs or builds without OpenMP. */
+  for (i=0; i<numel; i++)
+  {
+    long k = row_idx[i] - 1;
+    THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/MSECriterion.c b/lib/THNN/generic/MSECriterion.c
new file mode 100644
index 0000000..c576e3d
--- /dev/null
+++ b/lib/THNN/generic/MSECriterion.c
@@ -0,0 +1,40 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MSECriterion.c"
+#else
+
+void THNN_(MSECriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage)
+{
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real z = (*input_data - *target_data);
+    sum += z*z;
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+/* MSE backward: gradInput = norm * (input - target), where norm is 2
+ * (plain sum) or 2/N (sizeAverage), matching d/dx (x - t)^2 = 2(x - t). */
+void THNN_(MSECriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage)
+{
+  real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    *gradInput_data = norm * (*input_data - *target_data);
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/MarginCriterion.c b/lib/THNN/generic/MarginCriterion.c
new file mode 100644
index 0000000..792ce7b
--- /dev/null
+++ b/lib/THNN/generic/MarginCriterion.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MarginCriterion.c"
+#else
+
+/* Hinge loss: writes sum(max(0, margin - input * target)) into output[0],
+ * averaged over elements when sizeAverage is set. The accumulator is
+ * accreal (double for the float instantiation) so large tensors do not
+ * lose precision when summed in float — consistent with
+ * L1Cost_updateOutput; set1d narrows back to real on store. */
+void THNN_(MarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          real margin)
+{
+  accreal sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real z = (margin - *input_data * *target_data);
+    sum += z>0 ? z : 0;
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, (real)sum);
+}
+
+/* Hinge loss backward: where the margin is violated
+ * (input * target < margin) the subgradient is -target, scaled by 1/N when
+ * sizeAverage is set; elsewhere the loss is flat and the gradient is 0. */
+void THNN_(MarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          real margin)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    *gradInput_data = (*input_data * *target_data) < margin ? -norm * *target_data : 0;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/MultiLabelMarginCriterion.c b/lib/THNN/generic/MultiLabelMarginCriterion.c
new file mode 100644
index 0000000..4cbb000
--- /dev/null
+++ b/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -0,0 +1,174 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
+#else
+
+/* Multi-label margin loss forward. For each frame, targets are 1-based
+ * class indices; a target value of 0 terminates the target list for that
+ * frame (hence the break on target_idx < 0 after the -1 shift). isTarget
+ * is filled as a 0/1 mask of target classes and is kept for the backward
+ * pass. Loss per frame: sum over (target, non-target) pairs of
+ * max(0, 1 - x[target] + x[d]), divided by dim, and by nframe when
+ * sizeAverage is set. */
+void THNN_(MultiLabelMarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          THTensor *isTarget,
+          bool sizeAverage)
+{
+  real *input_data, *target_data, *isTarget_data;
+  long nframe, dim;
+  long t, d, dt, ddt;
+  real sum;
+
+  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+    THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
+  }
+  else
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
+  }
+
+  THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
+  THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
+
+  target = THTensor_(newContiguous)(target);
+  input = THTensor_(newContiguous)(input);
+  input_data = THTensor_(data)(input);
+  target_data = THTensor_(data)(target);
+
+  THTensor_(resizeAs)(isTarget, target);
+  THTensor_(zero)(isTarget);
+  isTarget_data = THTensor_(data)(isTarget);
+
+  sum = 0;
+  for (t = 0; t < nframe; t++)
+  {
+    /* First pass: mark this frame's target classes in the isTarget mask. */
+    for (ddt = 0; ddt < dim; ddt++)
+    {
+      long target_idx = (long)target_data[ddt]-1;
+      if (target_idx < 0)
+        break;
+      isTarget_data[target_idx] = 1;
+    }
+    /* Second pass: accumulate hinge terms against non-target classes. */
+    for (dt = 0; dt < dim; dt++)
+    {
+      long target_idx = (long)target_data[dt]-1;
+      real input_target;
+      if (target_idx < 0)
+        break;
+
+      input_target = input_data[target_idx];
+      for (d = 0; d < dim; d++)
+      {
+        if (!isTarget_data[d])
+        {
+          real z = 1 - input_target + input_data[d];
+          if (z > 0)
+            sum += z;
+        }
+      }
+    }
+    /* Advance the raw pointers to the next frame's row. */
+    input_data += dim;
+    target_data += dim;
+    isTarget_data += dim;
+  }
+
+  sum /= dim;
+  if (sizeAverage)
+    sum /= nframe;
+
+  THTensor_(set1d)(output, 0, sum);
+
+  THTensor_(free)(input);
+  THTensor_(free)(target);
+}
+
+/* Multi-label margin loss backward. Uses the isTarget mask produced by
+ * updateOutput (validated here to be 0/1 and shape-consistent). For every
+ * violated (target, non-target) pair, the target score gets -g and the
+ * non-target score +g, where g = 1/dim (or 1/(nframe*dim) with
+ * sizeAverage). As in the forward pass, a target value of 0 terminates a
+ * frame's target list. */
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          THTensor *isTarget,
+          bool sizeAverage)
+{
+  real *input_data;
+  real *gradInput_data;
+  real *target_data;
+  real *isTarget_data;
+  long nframe, dim;
+  long t, d, dt;
+  real g;
+
+  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+    THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
+    THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, "inconsistent isTarget size");
+  }
+  else
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
+    THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
+  }
+
+  THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
+  THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
+
+  THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
+  THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
+
+  target = THTensor_(newContiguous)(target);
+  input = THTensor_(newContiguous)(input);
+  isTarget = THTensor_(newContiguous)(isTarget);
+  input_data = THTensor_(data)(input);
+  target_data = THTensor_(data)(target);
+  isTarget_data = THTensor_(data)(isTarget);
+
+  g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+  gradInput_data = THTensor_(data)(gradInput);
+
+  for (t = 0; t < nframe; t++)
+  {
+    for (dt = 0; dt < dim; dt++)
+    {
+      long target_idx = (long)target_data[dt]-1;
+      real input_target;
+      if (target_idx < 0)
+        break;
+
+      input_target = input_data[target_idx];
+      for (d = 0; d < dim; d++)
+      {
+        if (!isTarget_data[d])
+        {
+          real z = 1 - input_target + input_data[d];
+          if (z > 0)
+          {
+            gradInput_data[target_idx] -= g;
+            gradInput_data[d] += g;
+          }
+        }
+      }
+    }
+    /* Advance raw pointers to the next frame's row. */
+    input_data += dim;
+    target_data += dim;
+    isTarget_data += dim;
+    gradInput_data += dim;
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(target);
+  THTensor_(free)(isTarget);
+}
+
+#endif
diff --git a/lib/THNN/generic/MultiMarginCriterion.c b/lib/THNN/generic/MultiMarginCriterion.c
new file mode 100644
index 0000000..2463da1
--- /dev/null
+++ b/lib/THNN/generic/MultiMarginCriterion.c
@@ -0,0 +1,159 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
+#else
+
+/* Multi-class margin loss forward. target holds one 1-based class index
+ * per frame. Per frame: sum over d != target of
+ * max(0, margin - x[target] + x[d]), raised elementwise to p (1 or 2),
+ * optionally weighted by weights[target]; the total is divided by dim and,
+ * with sizeAverage, by nframe. weights may be NULL for the unweighted
+ * case. */
+void THNN_(MultiMarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          int p,
+          THTensor *weights,
+          real margin)
+{
+  real *input_data, *target_data, *weights_data;
+  long nframe, dim;
+  long t, d;
+  real sum;
+
+  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+  }
+  else
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
+  }
+
+  /* Validate every target index up front (1-based, within [1, dim]). */
+  for (t = 0; t < nframe; t++)
+  {
+    real idx = THTensor_(get1d)(target, t);
+    THArgCheck((idx >= 1) && (idx <= dim), 3, "target out of range");
+  }
+
+  input = THTensor_(newContiguous)(input);
+  target = THTensor_(newContiguous)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+  input_data = THTensor_(data)(input);
+  target_data = THTensor_(data)(target);
+  weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+  sum = 0;
+  for (t = 0; t < nframe; t++)
+  {
+    long target_idx = (long)(target_data[t]-1);
+    real input_target = input_data[target_idx];
+    for (d = 0; d < dim; d++)
+    {
+      real z = margin - input_target + input_data[d];
+      if (d == target_idx)
+        continue;
+
+      if (z > 0) {
+        real h = (p==1) ? z : z*z;
+        if(weights_data)
+          h *= weights_data[target_idx];
+        sum += h;
+      }
+    }
+    input_data += dim;
+  }
+
+  sum /= dim;
+  if(sizeAverage)
+    sum /= nframe;
+
+  THTensor_(set1d)(output, 0, sum);
+
+  THTensor_(free)(input);
+  THTensor_(free)(target);
+  if(weights)
+    THTensor_(free)(weights);
+}
+
+/* Multi-class margin loss backward. For each violated class d:
+ * gradInput[d] = h and gradInput[target] -= h, with h = g for p == 1 and
+ * h = 2*g*z for p == 2 (z the margin violation), g = 1/dim or
+ * 1/(nframe*dim) with sizeAverage, optionally scaled by weights[target].
+ * NOTE(review): unlike updateOutput, target indices are not range-checked
+ * here — presumably updateOutput always runs first; verify callers. */
+void THNN_(MultiMarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          int p,
+          THTensor *weights,
+          real margin)
+{
+  real *input_data;
+  real *gradInput_data;
+  real *target_data;
+  real *weights_data;
+  long nframe, dim;
+  long t, d;
+  real g;
+
+  THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+  }
+  else
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
+  }
+
+  g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
+
+  input = THTensor_(newContiguous)(input);
+  target = THTensor_(newContiguous)(target);
+  input_data = THTensor_(data)(input);
+
+  THTensor_(resizeAs)(gradInput, input);
+  gradInput_data = THTensor_(data)(gradInput);
+
+  target_data = THTensor_(data)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+  weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+  for (t = 0; t < nframe; t++)
+  {
+    long target_idx = (long)(target_data[t])-1;
+    real input_target = input_data[target_idx];
+    /* Accumulate the target's gradient locally; its slot doubles as a
+     * non-target slot in the loop below, so it is written last. */
+    real gradInput_target = 0;
+    for (d = 0; d < dim; d++)
+    {
+      real z = margin - input_target + input_data[d];
+      if (d == target_idx)
+        continue;
+
+      if (z > 0)
+      {
+        real h = (p == 1) ? g : 2*g*z;
+        if(weights_data)
+          h *= weights_data[target_idx];
+        gradInput_target -= h;
+        gradInput_data[d] = h;
+      }
+      else
+        gradInput_data[d] = 0;
+    }
+    gradInput_data[target_idx] = gradInput_target;
+
+    input_data += dim;
+    gradInput_data += dim;
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(target);
+  if(weights)
+    THTensor_(free)(weights);
+}
+
+#endif
diff --git a/lib/THNN/generic/PReLU.c b/lib/THNN/generic/PReLU.c
new file mode 100644
index 0000000..b1b2c0f
--- /dev/null
+++ b/lib/THNN/generic/PReLU.c
@@ -0,0 +1,228 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/PReLU.c"
+#else
+
+/* PReLU forward. nOutputPlane == 0 means a single shared slope for all
+ * elements; otherwise one learned slope per channel, with channel geometry
+ * (bs frames, ks spatial elements per channel) derived from the input's
+ * dimensionality: 1-D/2-D vectors/batches, 3-D CHW, 4-D NCHW.
+ * NOTE(review): the switch has no default case, so input with more than
+ * 4 dimensions would leave bs/ks uninitialized — presumably ruled out by
+ * the nOutputPlane size check below; verify upstream callers. */
+void THNN_(PReLU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THIndex_t nOutputPlane)
+{
+  THTensor_(resizeAs)(output, input);
+
+  if (nOutputPlane == 0)
+  {
+    // handle shared parameter case
+    real w = *THTensor_(data)(weight);
+    TH_TENSOR_APPLY2(real, output, real, input,
+      *output_data = (*input_data > 0) ? *input_data : w*(*input_data);
+    );
+  }
+  else
+  {
+    long bs, ks;
+    {
+      long input_ndim = THTensor_(nDimension)(input);
+      switch (input_ndim)
+      {
+        case 1:
+          bs = 1;
+          ks = 1;
+          break;
+        case 2:
+          bs = input->size[0];
+          ks = 1;
+          break;
+        case 3:
+          bs = 1;
+          ks = input->size[1] * input->size[2];
+          break;
+        case 4:
+          bs = input->size[0];
+          ks = input->size[2] * input->size[3];
+          break;
+      }
+
+      /* The channel dimension is size[0] for odd ndim (1/3) and size[1]
+       * for even ndim (2/4) — hence the (ndim + 1) % 2 index. */
+      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
+        THError("wrong number of input planes");
+    }
+
+    real *output_data = THTensor_(data)(output);
+    real *input_data = THTensor_(data)(input);
+    real *weight_data = THTensor_(data)(weight);
+    THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+    for (i = 0; i < bs; ++i)
+    {
+      real* n_input_data = input_data + i*nOutputPlane*ks;
+      real* n_output_data = output_data + i*nOutputPlane*ks;
+      for (j = 0; j < nOutputPlane; ++j)
+      {
+        for (k = 0; k < ks; ++k)
+          n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
+        n_input_data += ks;
+        n_output_data += ks;
+      }
+    }
+  }
+}
+
+/* PReLU backward w.r.t. input: gradInput = gradOutput where input > 0,
+ * else weight * gradOutput — with the slope shared (nOutputPlane == 0) or
+ * per-channel. Channel geometry (bs, ks) mirrors updateOutput; as there,
+ * the switch has no default for ndim > 4 (NOTE(review): presumably ruled
+ * out by the plane check — verify). */
+void THNN_(PReLU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THIndex_t nOutputPlane)
+{
+  THTensor_(resizeAs)(gradInput, input);
+
+  if (nOutputPlane == 0)
+  {
+    real w = THTensor_(data)(weight)[0];
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+       if ((*input_data) > 0)
+         *gradInput_data = *gradOutput_data;
+       else
+         *gradInput_data = w * (*gradOutput_data);
+    );
+  }
+  else
+  {
+    const real *input_data = THTensor_(data)(input);
+    const real *gradOutput_data = THTensor_(data)(gradOutput);
+    const real *weight_data = THTensor_(data)(weight);
+    real *gradInput_data = THTensor_(data)(gradInput);
+
+    long bs, ks;
+    {
+      long input_ndim = THTensor_(nDimension)(input);
+      switch (input_ndim)
+      {
+        case 1:
+          bs = 1;
+          ks = 1;
+          break;
+        case 2:
+          bs = input->size[0];
+          ks = 1;
+          break;
+        case 3:
+          bs = 1;
+          ks = input->size[1] * input->size[2];
+          break;
+        case 4:
+          bs = input->size[0];
+          ks = input->size[2] * input->size[3];
+          break;
+      }
+
+      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
+        THError("wrong number of input planes");
+    }
+
+    THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+    for (i = 0; i < bs; ++i)
+    {
+      const real *n_input_data = input_data + i*nOutputPlane*ks;
+      const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+      real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
+
+      for (j = 0; j < nOutputPlane; ++j)
+      {
+        real w = weight_data[j];
+        for (k = 0; k < ks; ++k)
+        {
+          if (n_input_data[k] > 0)
+            n_gradInput_data[k] = n_gradOutput_data[k];
+          else
+            n_gradInput_data[k] = n_gradOutput_data[k] * w;
+        }
+        n_input_data += ks;
+        n_gradInput_data += ks;
+        n_gradOutput_data += ks;
+      }
+    }
+  }
+}
+
+/* PReLU gradient w.r.t. the slope parameter(s):
+ * gradWeight[j] += scale * sum over non-positive inputs of
+ * input * gradOutput, per channel j (or one scalar when shared).
+ * gradInput, weight, gradWeightBuf and gradWeightBuf2 are not used by this
+ * CPU path (kept for interface parity with the CUDA kernel — TODO
+ * confirm). NOTE(review): the inner gradWeight_data declaration shadows
+ * the one at function scope, and weight_data below is unused. */
+void THNN_(PReLU_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradWeight,
+          THTensor *gradWeightBuf,
+          THTensor *gradWeightBuf2,
+          THIndex_t nOutputPlane,
+          real scale)
+{
+  real *gradWeight_data = THTensor_(data)(gradWeight);
+
+  if (nOutputPlane == 0)
+  {
+    real sum = 0;
+    TH_TENSOR_APPLY2(real, input, real, gradOutput,
+      if ((*input_data) <= 0)
+        sum += (*input_data) * (*gradOutput_data);
+    );
+    gradWeight_data[0] += scale * sum;
+  }
+  else
+  {
+    long bs, ks;
+    {
+      long input_ndim = THTensor_(nDimension)(input);
+      switch (input_ndim)
+      {
+        case 1:
+          bs = 1;
+          ks = 1;
+          break;
+        case 2:
+          bs = input->size[0];
+          ks = 1;
+          break;
+        case 3:
+          bs = 1;
+          ks = input->size[1] * input->size[2];
+          break;
+        case 4:
+          bs = input->size[0];
+          ks = input->size[2] * input->size[3];
+          break;
+      }
+
+      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
+        THError("wrong number of input planes");
+    }
+
+    const real *input_data = THTensor_(data)(input);
+    const real *gradOutput_data = THTensor_(data)(gradOutput);
+    const real *weight_data = THTensor_(data)(weight);
+    real *gradWeight_data = THTensor_(data)(gradWeight);
+
+    THIndex_t i, j, k;
+    for (i = 0; i < bs; ++i)
+    {
+      const real *n_input_data = input_data + i*nOutputPlane*ks;
+      const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+
+      for (j = 0; j < nOutputPlane; ++j)
+      {
+        real sum = 0;
+        for (k = 0; k < ks; ++k)
+          if (n_input_data[k] <= 0)
+            sum += n_gradOutput_data[k] * n_input_data[k];
+        gradWeight_data[j] += scale * sum;
+        n_input_data += ks;
+        n_gradOutput_data += ks;
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/RReLU.c b/lib/THNN/generic/RReLU.c
new file mode 100644
index 0000000..8bf6764
--- /dev/null
+++ b/lib/THNN/generic/RReLU.c
@@ -0,0 +1,127 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/RReLU.c"
+#else
+
+// Randomized leaky ReLU forward pass.
+// train mode: each non-positive element is scaled by a slope r drawn
+//   uniformly from [lower, upper]; r (or 1 for positive elements) is
+//   recorded in `noise` for reuse in the backward pass.
+// eval mode:  non-positive elements use the fixed slope (lower+upper)/2.
+// inplace: `input` is overwritten and `output` is made to alias it.
+void THNN_(RReLU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace,
+          THGenerator *generator)
+{
+  if (train)
+  {
+    // get default random generator
+    THTensor_(resizeAs)(noise, input);
+    if (inplace)
+    {
+      TH_TENSOR_APPLY2(real, input, real, noise,
+        if (*input_data <= 0)
+        {
+          const real r = (real)THRandom_uniform(generator, lower, upper);
+          *input_data = (*input_data) * r;
+          *noise_data = r;
+        }
+        else
+        {
+          // positive inputs pass through; slope 1 recorded for backward
+          *noise_data = 1;
+        }
+      );
+      THTensor_(set)(output, input);
+    }
+    else
+    {
+      THTensor_(resizeAs)(output, input);
+      TH_TENSOR_APPLY3(real, input, real, output, real, noise,
+        if (*input_data <= 0)
+        {
+          const real r = (real)THRandom_uniform(generator, lower, upper);
+          *output_data = (*input_data) * r;
+          *noise_data = r;
+        }
+        else
+        {
+          *output_data = *input_data;
+          *noise_data = 1;
+        }
+      );
+    }
+  }
+  else
+  {
+    // evaluation: deterministic LeakyReLU with the mean slope
+    const real negSlope = (lower + upper) / 2;
+    if (inplace)
+    {
+      TH_TENSOR_APPLY(real, input,
+        if (*input_data <= 0)
+        {
+          *input_data = *input_data * negSlope;
+        }
+      );
+      THTensor_(set)(output, input);
+    }
+    else
+    {
+      THTensor_(resizeAs)(output, input);
+      TH_TENSOR_APPLY2(real, input, real, output,
+        const real r = (*input_data) <= 0 ? negSlope : 1;
+        *output_data = *input_data * r;
+      );
+    }
+  }  
+}
+
+// Randomized leaky ReLU backward pass.
+// train mode (with upper > lower): gradient is the saved per-element
+//   `noise` slopes times gradOutput.
+// otherwise: behaves like LeakyReLU backward with slope (lower+upper)/2.
+// inplace: `gradOutput` is overwritten and `gradInput` aliases it.
+void THNN_(RReLU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace)
+{
+  if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
+  {
+    // multiply the gradient by the noise tensor
+    if (inplace)
+    {
+      THTensor_(cmul)(gradOutput, gradOutput, noise);
+      THTensor_(set)(gradInput, gradOutput);
+    }
+    else
+    {
+      THTensor_(resizeAs)(gradInput, input);
+      THTensor_(cmul)(gradInput, gradOutput, noise);
+    }    
+  }
+  else
+  { 
+    // use constant factor for negative input values
+    const real negSlope = (lower + upper) / 2;
+    if (inplace)
+    {
+      TH_TENSOR_APPLY2(real, gradOutput, real, input,
+        if (*input_data <= 0)
+        {
+          *gradOutput_data = (*gradOutput_data) * negSlope;
+        }
+      );
+      THTensor_(set)(gradInput, gradOutput);
+    }
+    else
+    {
+      THTensor_(resizeAs)(gradInput, input);
+      TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+        *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data);
+      );
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/Sigmoid.c b/lib/THNN/generic/Sigmoid.c
new file mode 100644
index 0000000..0a1b375
--- /dev/null
+++ b/lib/THNN/generic/Sigmoid.c
@@ -0,0 +1,31 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sigmoid.c"
+#else
+
+// Element-wise logistic sigmoid: output = 1 / (1 + exp(-input)).
+void THNN_(Sigmoid_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output)
+{
+  THTensor_(resizeAs)(output, input);
+
+  TH_TENSOR_APPLY2(real, output, real, input,
+    *output_data = 1./(1.+ exp(- *input_data));
+  );
+}
+
+// Sigmoid backward: d(sigmoid)/dx = y * (1 - y), computed from the saved
+// forward `output` so `input` itself is not needed.
+void THNN_(Sigmoid_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output)
+{
+  THTensor_(resizeAs)(gradInput, output);
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+    real z = *output_data;
+    *gradInput_data = *gradOutput_data * (1. - z) * z;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/SmoothL1Criterion.c b/lib/THNN/generic/SmoothL1Criterion.c
new file mode 100644
index 0000000..8b53100
--- /dev/null
+++ b/lib/THNN/generic/SmoothL1Criterion.c
@@ -0,0 +1,45 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
+#else
+
+// Smooth L1 (Huber-like) loss forward:
+//   per element, z = |input - target|;
+//   loss = 0.5*z^2 when z < 1, else z - 0.5.
+// Writes the (optionally size-averaged) scalar sum into output[0].
+void THNN_(SmoothL1Criterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage)
+{
+  real sum = 0;
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real z = fabs(*input_data - *target_data);
+    sum += z < 1 ? 0.5*z*z : z - 0.5;
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+// Smooth L1 loss backward: gradient is clamped to [-1, 1] outside the
+// quadratic region and linear (x) inside it, scaled by 1/N if averaging.
+void THNN_(SmoothL1Criterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    real x = *input_data - *target_data;
+    if (x < -1.)
+     *gradInput_data = - norm;
+    else if (x > 1.)
+     *gradInput_data = norm;
+    else
+     *gradInput_data = norm * x;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/SoftMarginCriterion.c b/lib/THNN/generic/SoftMarginCriterion.c
new file mode 100644
index 0000000..d9b618d
--- /dev/null
+++ b/lib/THNN/generic/SoftMarginCriterion.c
@@ -0,0 +1,40 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
+#else
+
+// Soft margin loss forward: sum of log(1 + exp(-input * target)),
+// optionally divided by the number of elements. Result in output[0].
+void THNN_(SoftMarginCriterion_updateOutput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *target,
+  THTensor *output,
+  bool sizeAverage)
+{
+  real sum;
+
+  sum = 0;
+  TH_TENSOR_APPLY2(real, input, real, target,
+                   real z = log(1. + exp(-*input_data* *target_data));
+                   sum += z;)
+
+  if(sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+// Soft margin loss backward:
+//   d/dx log(1 + exp(-t*x)) = -t * exp(-t*x) / (1 + exp(-t*x)),
+// scaled by 1/N when sizeAverage is set.
+void THNN_(SoftMarginCriterion_updateGradInput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *target,
+  THTensor *gradInput,
+  bool sizeAverage)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+                   real z = exp(-*target_data * *input_data);
+                   *gradInput_data = -norm*(*target_data)*z/(1. + z);)
+}
+
+#endif
diff --git a/lib/THNN/generic/SoftMax.c b/lib/THNN/generic/SoftMax.c
new file mode 100644
index 0000000..8bccefd
--- /dev/null
+++ b/lib/THNN/generic/SoftMax.c
@@ -0,0 +1,149 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMax.c"
+#else
+
+// Softmax forward over the "feature" dimension.
+// Layout convention: 1D = (dim); 2D = (nframe, dim); 3D = (dim, h, w);
+// 4D = (nframe, dim, h, w). `stride` is the spatial extent (h*w), so the
+// softmax is taken independently at each (frame, spatial) position.
+void THNN_(SoftMax_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output)
+{
+  real *input_data, *output_data;
+  long nframe = 0, dim = 0, stride = 0;
+  long t;
+
+  if (input->nDimension == 1)
+  {
+    nframe = 1;
+    dim = input->size[0];
+    stride = 1;
+  }
+  else if (input->nDimension == 2)
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    stride = 1;
+  }
+  else if (input->nDimension == 3)
+  {
+    nframe = 1;
+    dim = input->size[0];
+    stride = input->size[1]*input->size[2];
+  }
+  else if (input->nDimension == 4)
+  {
+    nframe = input->size[0];
+    dim = input->size[1];
+    stride = input->size[2]*input->size[3];
+  }
+  else
+  {
+    THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+  }
+
+  input = THTensor_(newContiguous)(input);
+  THTensor_(resizeAs)(output, input);
+
+  input_data = THTensor_(data)(input);
+  output_data = THTensor_(data)(output);
+
+  // one softmax per (frame, spatial position) pair, parallelized over t
+#pragma omp parallel for private(t)
+  for (t = 0; t < stride*nframe; t++)
+  {
+    real *input_ptr = input_data + (t/stride)*dim*stride + t % stride;
+    real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+
+    real inputMax = -THInf;
+    accreal sum;
+
+    long d;
+    // subtract the max before exponentiating for numerical stability
+    for (d = 0; d < dim; d++)
+    {
+      if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
+    }
+
+    sum = 0;
+    for (d = 0; d < dim; d++)
+    {
+      real z = exp(input_ptr[d*stride] - inputMax);
+      output_ptr[d*stride] = z;
+      sum += z;
+    }
+
+    for (d = 0; d < dim; d++)
+    {
+      output_ptr[d*stride] *= 1/sum;
+    }
+  }
+
+  // release the contiguous copy made above
+  THTensor_(free)(input);
+}
+
+// Softmax backward using the saved forward output:
+//   gradInput_i = y_i * (gradOutput_i - sum_j gradOutput_j * y_j).
+// Dimension/stride handling mirrors updateOutput.
+void THNN_(SoftMax_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output)
+{
+  real *gradInput_data, *gradOutput_data, *output_data;
+  long nframe = 0, dim = 0, stride = 0;
+  long t;
+
+  if (output->nDimension == 1)
+  {
+    nframe = 1;
+    dim = output->size[0];
+    stride = 1;
+  }
+  else if (output->nDimension == 2)
+  {
+    nframe = output->size[0];
+    dim = output->size[1];
+    stride = 1;
+  }
+  else if (output->nDimension == 3)
+  {
+    nframe = 1;
+    dim = output->size[0];
+    stride = output->size[1]*output->size[2];
+  }
+  else if (output->nDimension == 4)
+  {
+    nframe = output->size[0];
+    dim = output->size[1];
+    stride = output->size[2]*output->size[3];
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D tensor expected");
+  }
+
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  output = THTensor_(newContiguous)(output);
+
+  THTensor_(resizeAs)(gradInput, output);
+  gradInput_data = THTensor_(data)(gradInput);
+  output_data = THTensor_(data)(output);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+#pragma omp parallel for private(t)
+  for (t = 0; t < stride*nframe; t++)
+  {
+    real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride;
+    real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+    real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
+
+    long d;
+    // dot product of gradOutput and output for this softmax slice
+    accreal sum = 0;
+    for (d = 0; d < dim; d++)
+      sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
+
+    for (d = 0; d < dim; d++)
+      gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum);
+  }
+
+  // release the contiguous copies made above
+  THTensor_(free)(gradOutput);
+  THTensor_(free)(output);
+}
+
+#endif
diff --git a/lib/THNN/generic/SoftPlus.c b/lib/THNN/generic/SoftPlus.c
new file mode 100644
index 0000000..407413f
--- /dev/null
+++ b/lib/THNN/generic/SoftPlus.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftPlus.c"
+#else
+
+// SoftPlus forward: f(x) = (1/beta) * log(1 + exp(beta*x)).
+// For beta*x > threshold the function is effectively linear, so the input
+// is passed through unchanged to avoid overflow in exp().
+void THNN_(SoftPlus_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real beta,
+          real threshold)
+{
+  THTensor_(resizeAs)(output, input);
+
+  // f(x) = 1/beta * log(1 + exp(beta * x))
+  TH_TENSOR_APPLY2(real, output, real, input,               \
+    *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;
+  );
+}
+
+// SoftPlus backward, expressed via the saved forward output y:
+// gradient factor is (exp(beta*y) - 1) / exp(beta*y); identity above the
+// linear-region threshold (matching the forward pass-through).
+void THNN_(SoftPlus_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output,
+          real beta,
+          real threshold)
+{
+  THTensor_(resizeAs)(gradInput, output);
+  
+  // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+  // SINCE
+  // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+  // THEREFORE:
+  // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+    real z = exp(*output_data * beta);
+    *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/SoftShrink.c b/lib/THNN/generic/SoftShrink.c
new file mode 100644
index 0000000..7bd1cc8
--- /dev/null
+++ b/lib/THNN/generic/SoftShrink.c
@@ -0,0 +1,39 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftShrink.c"
+#else
+
+// SoftShrink forward: shrink values toward zero by lambda;
+// anything within [-lambda, lambda] maps to 0.
+void THNN_(SoftShrink_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real lambda)
+{
+  THTensor_(resizeAs)(output, input);
+  
+  TH_TENSOR_APPLY2(real, output, real, input,
+    if ((*input_data) > lambda)
+     *output_data = *input_data - lambda;
+    else if ((*input_data) < -lambda)
+     *output_data = *input_data + lambda;
+    else
+     *output_data = 0;
+  );
+}
+
+// SoftShrink backward: gradient passes through where |input| > lambda
+// (slope 1), and is zero inside the dead zone.
+void THNN_(SoftShrink_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real lambda)
+{
+  THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+    if ((*input_data) > lambda || (*input_data) < -lambda)
+      *gradInput_data = (*gradOutput_data);
+    else
+      *gradInput_data = 0;
+  );
+}
+
+#endif
diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c
new file mode 100644
index 0000000..807280e
--- /dev/null
+++ b/lib/THNN/generic/SparseLinear.c
@@ -0,0 +1,550 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SparseLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
+#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
+
+// Legacy sparse format: (batchSize, nnz, 2) where the last dim is
+// (1-based column index, value).
+static bool THNN_(checkLegacyInput)(THTensor* t)
+{
+  return t->nDimension == 3 && t->size[2] == 2;
+}
+
+// COO sparse format: (nnz, 3) where each row is
+// (1-based sample index, 1-based column index, value).
+static bool THNN_(checkInput)(THTensor* t)
+{
+  return t->nDimension == 2 && t->size[1] == 3;
+}
+
+// True iff t is a 2D tensor of exactly size0 x size1.
+static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
+{
+  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+// True iff t is a 1D tensor of exactly size0 elements.
+static bool THNN_(checkSize1D)(THTensor* t, long size0)
+{
+  return t->nDimension == 1 && t->size[0] == size0;
+}
+
+// Stride-aware element accessors (work on non-contiguous tensors).
+static void THNN_(set1d)(THTensor *t, long x0, real value) {
+  THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
+}
+static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
+  return THStorage_(get)(t->storage, t->storageOffset +
+                         x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
+}
+static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
+  return THStorage_(get)(t->storage, t->storageOffset +
+                         x0*t->stride[0] + x1*t->stride[1]);
+}
+
+// Sparse linear forward: output = input * weight^T + bias, where `input`
+// is in COO format (nnz x 3: sample, column, value; 1-based indices).
+// NOTE(review): batchSize is read from `output`, so the caller must have
+// pre-sized output to (batchSize, outDim) — confirm against the Lua side.
+// NOTE(review): the CSR construction assumes rows are sorted by sample
+// index — verify this invariant is enforced by callers.
+void THNN_(SparseLinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias)
+{
+  long h, i, j, hp0, hp1;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+  long batchSize = THTensor_(size)(output, 0);
+
+  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+  long nnz = THTensor_(size)(input, 0);
+
+  // csr[h] = index of the first COO entry belonging to sample h
+  THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
+  THLongTensor_zero(csr);
+
+//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+  for (i=0; i<nnz; i++) {
+    hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
+    hp1 = (i+1 == nnz) ?
+            batchSize :
+            (long)(THNN_(get2d)(input, i+1, 0)) - 1;
+    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+      THLongTensor_set1d(csr, h+1, i+1);
+    }
+  }
+
+
+  // output = weight * input + bias
+  THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
+  for (h = 0; h < batchSize; h++) {
+    long i_start = THLongTensor_get1d(csr, h);
+    long i_end = THLongTensor_get1d(csr, h+1);
+    for (i = i_start; i < i_end; i++) {
+      real val = THNN_(get2d)(input, i, 2);
+      if (val == 0) {
+        continue;
+      }
+
+      // accumulate val * weight[:, offset] into output row h
+      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+            val,
+            COL_PTR2(weight, offset), weight->stride[0],
+            ROW_PTR2(output, h), output->stride[1]);
+      } else {
+        THError("index out of bound. updateOutput: %d not between 1 and %d",
+            offset + 1, inDim);
+      }
+    }
+  }
+
+  // add the bias to every output row
+  THTensor* output_row = THTensor_(new)();
+  for (h = 0; h < batchSize; h++) {
+    THTensor_(select)(output_row, output, 0, h);
+    THTensor_(cadd)(output_row, bias, 1.0, output_row);
+  }
+  THTensor_(free)(output_row);
+  THLongTensor_free(csr);
+}
+
+// Legacy-format sparse linear forward. `input` is (batchSize, nnz, 2)
+// with (1-based column index, value) pairs per sample; output is resized
+// to (batchSize, outDim) and computed as input * weight^T + bias.
+void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+
+  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
+  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+  long batchSize = THTensor_(size)(input, 0);
+  long nnz = THTensor_(size)(input, 1);
+  THTensor_(resize2d)(output, batchSize, outDim);
+
+  // output = weight * input + bias
+  THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if (   \
+  batchSize > 1 && batchSize * nnz * outDim > 10000)
+  for (h = 0; h < batchSize; h++) {
+    for (i = 0; i < nnz; i++) {
+      real val = THNN_(get3d)(input, h, i, 1);
+      if (val == 0) {
+        continue;
+      }
+
+      // accumulate val * weight[:, offset] into output row h
+      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+                      val,
+                      COL_PTR2(weight, offset), weight->stride[0],
+                      ROW_PTR2(output, h), output->stride[1]);
+      } else {
+        THError("index out of bound. updateOutput: %d not between 1 and %d",
+                offset + 1, inDim);
+      }
+    }
+  }
+
+  // add the bias to every output row
+  THTensor* output_row = THTensor_(new)();
+  for (h = 0; h < batchSize; h++) {
+    THTensor_(select)(output_row, output, 0, h);
+    THTensor_(cadd)(output_row, bias, 1.0, output_row);
+  }
+  THTensor_(free)(output_row);
+}
+
+// Accumulate parameter gradients for the COO sparse linear layer:
+//   gradWeight += scale * gradOutput^T * input (+ weightDecay * weight)
+//   gradBias   += scale * sum over samples of gradOutput
+// NOTE(review): the CSC construction assumes COO entries are sorted by
+// column index — verify this invariant is enforced by callers.
+void THNN_(SparseLinear_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale)
+{
+  long h, i, col, hp0, hp1;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+
+  THArgCheck(THNN_(checkInput)(input), 2,
+             "input must be in coo format, nnz x 3");
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+             "gradBias size wrong");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+             "gradOutput must be contiguous");
+
+  long nnz = THTensor_(size)(input, 0);
+
+  // csc[c] = index of the first COO entry belonging to column c
+  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
+  THLongTensor_zero(csc);
+
+#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+  for (i = 0; i < nnz; i++) {
+    hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
+    hp1 = (i+1 == nnz) ?
+            inDim :
+            (long)(THNN_(get2d)(input, i+1, 1)) - 1;
+    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+      THLongTensor_set1d(csc, h+1, i+1);
+    }
+  }
+
+  // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
+  for (col = 0; col < inDim; col++) {
+    long i_start = THLongTensor_get1d(csc, col);
+    long i_end = THLongTensor_get1d(csc, col+1);
+    for (i = i_start; i < i_end; i++) {
+      real val = scale * THNN_(get2d)(input, i, 2);
+
+      h = (long)(THNN_(get2d)(input, i, 0)) - 1;
+      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+            val,
+            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+      } else {
+        THError(
+            "index out of bound. accGradParameters: %d not between 1 and %d",
+            offset + 1,
+            inDim);
+      }
+    }
+  }
+
+  // gradBias += gradOutput
+  THTensor* buf = THTensor_(new)();
+  THTensor_(sum)(buf, gradOutput, 0);
+  THTensor_(cadd)(gradBias, gradBias, scale, buf);
+  THTensor_(free)(buf);
+  THLongTensor_free(csc);
+
+  if (weightDecay != 0) {
+    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+  }
+}
+
+// Legacy-format gradient accumulation:
+//   gradWeight += scale * gradOutput^T * input (+ weightDecay * weight)
+//   gradBias   += scale * sum over samples of gradOutput rows
+void THNN_(SparseLinear_legacyAccGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale)
+{
+  long h, i;
+  long outDim = THTensor_(size)(weight, 0);
+  long inDim = THTensor_(size)(weight, 1);
+
+  THArgCheck(THNN_(checkLegacyInput)(input), 2,
+             "input size must be batchsize x nnz x 2");
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+             "gradBias size wrong");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+             "gradOutput must be contiguous");
+
+  long batchSize = THTensor_(size)(input, 0);
+  long nnz = THTensor_(size)(input, 1);
+  THTensor_(resize2d)(gradOutput, batchSize, outDim);
+
+  // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i) schedule(static) if (\
+  batchSize * nnz * outDim > 10000)
+  for (i = 0; i < nnz; i++) {
+    for (h = 0; h < batchSize; h++) {
+      real val = scale * THNN_(get3d)(input, h, i, 1);
+      if (val == 0) {
+        continue;
+      }
+
+      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        THBlas_(axpy)(outDim,
+                      val,
+                      ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+                      COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+      } else {
+        THError(
+          "index out of bound. accGradParameters: %d not between 1 and %d",
+          offset + 1,
+          inDim);
+      }
+    }
+  }
+
+  // gradBias += gradOutput
+  THTensor* gradOutput_row = THTensor_(new)();
+  for (h = 0; h < batchSize; h++) {
+    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
+    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
+  }
+  THTensor_(free)(gradOutput_row);
+
+  if (weightDecay != 0) {
+    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+  }
+}
+
+// Apply one SGD step touching only the weight columns that received
+// gradient from `lastInput` (COO, nnz x 3):
+//   bias   -= learningRate * gradBias
+//   weight[:, c] -= learningRate * gradWeight[:, c]  for each active column c
+// Active columns are deduplicated via sort so each is updated once.
+void THNN_(SparseLinear_updateParameters)(
+          THNNState *state,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput,
+          real learningRate)
+{
+  long i;
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+  THArgCheck(THNN_(checkInput)(lastInput), 6,
+             "input must be in coo format, nnz x 3");
+
+  long nnz = THTensor_(size)(lastInput, 0);
+
+  // collect unique offsets of non-0 val in input
+  THTensor* offsets = THTensor_(newWithSize1d)(nnz);
+  long cnt = 0;
+  for (i = 0; i < nnz; i++) {
+    real val = THNN_(get2d)(lastInput, i, 2);
+    if (val == 0) {
+      continue;
+    }
+    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+    if (offset >= 0 && offset < inDim) {
+      THNN_(set1d)(offsets, cnt++, offset);
+    } else {
+      THError(
+          "index out of bound. updateParameters: %d not between 1 and %d",
+          offset + 1,
+          inDim);
+    }
+  }
+  // BUGFIX: the original early return leaked `offsets`; free it first.
+  // (As before, the bias update is also skipped when no column is active.)
+  if (cnt == 0) {
+    THTensor_(free)(offsets);
+    return;
+  }
+  THTensor_(resize1d)(offsets, cnt);
+
+  // sort, then compact consecutive duplicates in place
+  THTensor* uniqueOffsets = THTensor_(new)();
+  THLongTensor* ri = THLongTensor_new();
+  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+  THLongTensor_free(ri);
+  THTensor_(free)(offsets);
+
+  cnt = 1;
+  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+    }
+  }
+  THTensor_(resize1d)(uniqueOffsets, cnt);
+
+  // weight += -learningRate * gradWeight
+  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+  for (i = 0; i < cnt; i++) {
+    long offset = (long)uniqueOffsets_p[i];
+    THBlas_(axpy)(outDim,
+                  -learningRate,
+                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+                  COL_PTR2(weight, offset), weight->stride[0]);
+  }
+
+  THTensor_(free)(uniqueOffsets);
+}
+
+// Legacy-format variant of SparseLinear_updateParameters. `lastInput` is
+// (batchSize, nnz, 2); only weight columns referenced with a non-zero
+// value are updated, after sort-based deduplication.
+void THNN_(SparseLinear_legacyUpdateParameters)(
+          THNNState *state,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput,
+          real learningRate)
+{
+  long h, i;
+  long outDim = weight->size[0];
+  long inDim = weight->size[1];
+
+  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+             "gradWeight size wrong");
+  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
+             "input size must be batchsize x nnz x 2");
+
+  long batchSize = THTensor_(size)(lastInput, 0);
+  long nnz = THTensor_(size)(lastInput, 1);
+
+  // collect unique offsets of non-0 val in input
+  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
+  long cnt = 0;
+  for (h = 0; h < batchSize; h++) {
+    for (i = 0; i < nnz; i++) {
+      real val = THNN_(get3d)(lastInput, h, i, 1);
+      if (val == 0 ) {
+        continue;
+      }
+      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        THNN_(set1d)(offsets, cnt++, offset);
+      } else {
+        THError(
+          "index out of bound. updateParameters: %d not between 1 and %d",
+          offset + 1,
+          inDim);
+      }
+    }
+  }
+  // BUGFIX: the original had no empty-input guard. With cnt == 0 it set
+  // cnt = 1 after the dedup loop, grew uniqueOffsets from 0 to 1 and read
+  // an uninitialized offset, producing an out-of-bounds column update.
+  // Bail out (freeing `offsets`), matching the non-legacy variant.
+  if (cnt == 0) {
+    THTensor_(free)(offsets);
+    return;
+  }
+  THTensor_(resize1d)(offsets, cnt);
+
+  // sort, then compact consecutive duplicates in place
+  THTensor* uniqueOffsets = THTensor_(new)();
+  THLongTensor* ri = THLongTensor_new();
+  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+  THLongTensor_free(ri);
+  THTensor_(free)(offsets);
+
+  cnt = 1;
+  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+    }
+  }
+  THTensor_(resize1d)(uniqueOffsets, cnt);
+
+  // weight += -learningRate * gradWeight
+  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+  for (i = 0; i < cnt; i++) {
+    long offset = (long)uniqueOffsets_p[i];
+    THBlas_(axpy)(outDim,
+                  -learningRate,
+                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+                  COL_PTR2(weight, offset), weight->stride[0]);
+  }
+
+  THTensor_(free)(uniqueOffsets);
+}
+
+// Zero gradBias entirely, but zero only the gradWeight columns that were
+// touched by `lastInput` (COO, nnz x 3) — cheaper than clearing the whole
+// gradWeight matrix for sparse inputs.
+void THNN_(SparseLinear_zeroGradParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput)
+{
+  long h, i, j;
+
+  long outDim = gradWeight->size[0];
+  long inDim = gradWeight->size[1];
+
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+  THArgCheck(THNN_(checkInput)(lastInput), 4,
+             "input must be in coo format, nnz x 3");
+
+  THTensor_(zero)(gradBias);
+
+  long nnz = THTensor_(size)(lastInput, 0);
+
+#pragma omp parallel for private(i, j) schedule(static) if (   \
+  nnz * outDim > 10000)
+  for (i = 0; i < nnz; i++) {
+    if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
+      continue;
+    }
+
+    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+    if (offset >= 0 && offset < inDim) {
+      real* pGradWeight = COL_PTR2(gradWeight, offset);
+      // fast path: contiguous column, use the vectorized fill
+      if (gradWeight->stride[0] == 1) {
+        THVector_(fill)(pGradWeight, 0, outDim);
+      } else {
+        long stride = gradWeight->stride[0];
+        for (j = 0; j < outDim; ++j) {
+          pGradWeight[j * stride] = 0;
+        }
+      }
+    } else {
+      THError(
+          "index out of bound. zeroGradParameters: %d not between 1 and %d",
+          offset + 1,
+          inDim);
+    }
+  }
+}
+
+// Legacy-format variant: zero gradBias entirely and zero only the
+// gradWeight columns referenced by (batchSize, nnz, 2) `lastInput`.
+void THNN_(SparseLinear_legacyZeroGradParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput)
+{
+  long h, i, j;
+
+  long outDim = gradWeight->size[0];
+  long inDim = gradWeight->size[1];
+
+  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+  THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
+             "input size must be batchsize x nnz x 2");
+
+  THTensor_(zero)(gradBias);
+
+  long batchSize = THTensor_(size)(lastInput, 0);
+  long nnz = THTensor_(size)(lastInput, 1);
+
+#pragma omp parallel for private(h, i, j) schedule(static) if (   \
+  batchSize > 1 && batchSize * nnz * outDim > 10000)
+  for (h = 0; h < batchSize; h++) {
+    for (i = 0; i < nnz; i++) {
+      if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
+        continue;
+      }
+
+      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+      if (offset >= 0 && offset < inDim) {
+        real* pGradWeight = COL_PTR2(gradWeight, offset);
+        // fast path: contiguous column, use the vectorized fill
+        if (gradWeight->stride[0] == 1) {
+          THVector_(fill)(pGradWeight, 0, outDim);
+        } else {
+          long stride = gradWeight->stride[0];
+          for (j = 0; j < outDim; ++j) {
+            pGradWeight[j * stride] = 0;
+          }
+        }
+      } else {
+        THError(
+          "index out of bound. zeroGradParameters: %d not between 1 and %d",
+          offset + 1,
+          inDim);
+      }
+    }
+  }
+}
+
+#undef ROW_PTR2
+#undef COL_PTR2
+
+#endif
diff --git a/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
new file mode 100644
index 0000000..61afc40
--- /dev/null
+++ b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -0,0 +1,274 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
+#else
+
+/* Adaptive max pooling over one frame (nslices feature planes).
+ * For output cell (i,j), the input window is
+ *   rows [floor(i*iH/oH), ceil((i+1)*iH/oH)) x cols [floor(j*iW/oW), ceil((j+1)*iW/oW)),
+ * so windows cover the whole input and may overlap.  Writes the max of each
+ * window to output_p, and its 1-based (x,y) position *within the window* to
+ * indx_p / indy_p.  Input is addressed through explicit strides
+ * (stridew/strideh/strided), so it need not be contiguous; output and index
+ * buffers are assumed dense (k*owidth*oheight layout). */
+static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          real *indx_p,
+          real *indy_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight,
+          long stridew,
+          long strideh,
+          long strided)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j;
+    for(i = 0; i < oheight; i++)
+    {
+      int y_start = (int)floor((float)i / oheight * iheight);
+      int y_end   = (int)ceil((float)(i + 1) / oheight * iheight);
+      int kH = y_end-y_start;
+
+      for(j = 0; j < owidth; j++)
+      {
+        
+        int x_start = (int)floor((float)j / owidth * iwidth);
+        int x_end   = (int)ceil((float)(j + 1) / owidth * iwidth);
+        int kW = x_end-x_start;
+
+        /* local pointers */
+        real *ip = input_p   + k*strided + y_start*strideh + x_start*stridew;
+        real *op = output_p  + k*owidth*oheight + i*owidth + j;
+        real *indyp = indy_p + k*owidth*oheight + i*owidth + j;
+        real *indxp = indx_p + k*owidth*oheight + i*owidth + j;
+
+        /* compute local max: */
+        long maxindex = -1;
+        real maxval = -FLT_MAX;
+        long tcntr = 0;
+        int x,y;
+        for(y = 0; y < kH; y++)
+        {
+          for(x = 0; x < kW; x++)
+          {
+            real val = *(ip + y*strideh + x*stridew);
+            if (val > maxval)
+            {
+              maxval = val;
+              maxindex = tcntr;
+            }
+            tcntr++;
+          }
+        }
+
+        /* set output to local max */
+        *op = maxval;
+
+        /* store location of max (x,y), 1-based, relative to the window
+         * top-left; updateGradInput adds the window start back. */
+        *indyp = (int)(maxindex / kW)+1;
+        *indxp = (maxindex % kW) +1;
+      }
+    }
+  }
+}
+
+/* Forward pass of adaptive max pooling.
+ * input:   3D (nslices x iH x iW) or 4D (nbatch x nslices x iH x iW) tensor;
+ *          read through its strides, so it may be non-contiguous.
+ * output:  resized here to (... x oheight x owidth).
+ * indices: resized here to 2 x ... x oheight x owidth; plane 0 holds the
+ *          1-based x offsets of each max, plane 1 the 1-based y offsets
+ *          (see the _frame helper for the exact layout).
+ * owidth/oheight: requested output size. */
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int owidth,
+          int oheight)
+{
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+
+  long istride_d;
+  long istride_h;
+  long istride_w;
+  /* Batch stride is only meaningful (and only assigned) in the 4D case;
+   * initialize it so the 3D path never reads an indeterminate value. */
+  long istride_b = 0;
+
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
+
+  if (input->nDimension == 4) 
+  {
+    istride_b = input->stride[0];
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  /* strides */
+  istride_d = input->stride[dimh-1];
+  istride_h = input->stride[dimh];
+  istride_w = input->stride[dimw];
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+    /* indices will contain i,j locations for each output point */
+    THTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    /* indices plane 1 (y) lives nslices*owidth*oheight after plane 0 (x). */
+    THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
+                                                      indices_data+nslices*owidth*oheight, indices_data,
+                                                      nslices,
+                                                      iwidth, iheight,
+                                                      owidth, oheight,
+                                                      istride_w,istride_h,
+                                                      istride_d);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+    /* indices will contain i,j locations for each output point */
+    THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    /* Each batch element is independent: parallelize over the batch. */
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+                                                        indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+                                                        nslices,
+                                                        iwidth, iheight,
+                                                        owidth, oheight,
+                                                        istride_w,istride_h,
+                                                        istride_d);
+    }
+  }
+}
+
+/* Backward pass over one frame: for every output cell, route its gradient
+ * to the input element that produced the max in the forward pass.
+ * indx_p/indy_p hold the 1-based (x,y) position of the max relative to the
+ * window start, so the window's floor() start offset is added back here.
+ * gradInput_p must be zeroed by the caller; windows may overlap, hence +=.
+ * All buffers are assumed contiguous with dense k*w*h layout. */
+static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          real *indx_p,
+          real *indy_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+    real *indx_p_k = indx_p + k*owidth*oheight;
+    real *indy_p_k = indy_p + k*owidth*oheight;
+    
+    /* calculate max points */
+    long i, j;
+    for(i = 0; i < oheight; i++)
+    {
+      int y_start = (int)floor((float) i / oheight * iheight);
+      for(j = 0; j < owidth; j++)
+      {
+        int x_start = (int)floor((float) j / owidth * iwidth);
+        /* retrieve position of max (stored 1-based, window-relative) */
+        long maxi = indy_p_k[i*owidth + j] - 1 + y_start;
+        long maxj = indx_p_k[i*owidth + j] - 1 + x_start;
+        
+        /* update gradient */
+        gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
+      }
+    }
+  }
+}
+
+/* Backward pass of adaptive max pooling.  Resizes gradInput like input,
+ * zeroes it, then scatters gradOutput through the saved indices.
+ * gradOutput is made contiguous locally; indices is used as produced by
+ * updateOutput (x plane first, then y plane).
+ * NOTE(review): gradInput is indexed with dense raw-pointer arithmetic, so
+ * this assumes resizeAs left it contiguous (i.e. input is contiguous) —
+ * verify at call sites. */
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices)
+{
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  int nslices;
+  int iheight;
+  int iwidth;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 3)
+  {
+    /* y-index plane starts nslices*owidth*oheight after the x plane. */
+    THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+                                                         indices_data+nslices*owidth*oheight, indices_data,
+                                                         nslices,
+                                                         iwidth, iheight,
+                                                         owidth, oheight);
+  }
+  else
+  {
+    long p;
+    /* Batch elements write disjoint gradInput regions: safe to parallelize. */
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+                                                           indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+                                                           nslices,
+                                                           iwidth, iheight,
+                                                           owidth, oheight);
+    }
+  }
+
+  /* cleanup: release the contiguous copy (or extra reference) made above */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
+
diff --git a/lib/THNN/generic/SpatialAveragePooling.c b/lib/THNN/generic/SpatialAveragePooling.c
new file mode 100644
index 0000000..37ee274
--- /dev/null
+++ b/lib/THNN/generic/SpatialAveragePooling.c
@@ -0,0 +1,258 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
+#else
+
+/* Forward pass of spatial average pooling.
+ * input:  3D (planes x iH x iW) or 4D (batch x planes x iH x iW).
+ * kW/kH:  kernel size; dW/dH: stride; padW/padH: implicit zero padding.
+ * ceil_mode: use ceil instead of floor when computing the output size.
+ * count_include_pad: if true, divide each window sum by the full
+ * (padded) window area; otherwise by the clipped, in-image area.
+ * output is resized here to (... x outputHeight x outputWidth). */
+void THNN_(SpatialAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          bool ceil_mode,
+          bool count_include_pad)
+{
+  real *output_data;
+  real *input_data;
+
+  int dimw = 2;
+  int dimh = 1;
+  int dimc = 0;
+  long nbatch = 1;
+
+  long inputWidth;
+  long inputHeight;
+  long outputWidth;
+  long outputHeight;
+  long nInputPlane; // number of channels (or colors)
+
+  long k;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimc++;
+  }
+
+  inputWidth = input->size[dimw];
+  inputHeight = input->size[dimh];
+  nInputPlane = input->size[dimc];
+
+  /* Standard pooling output-size formula, with ceil or floor rounding. */
+  if(ceil_mode)
+  {
+    outputWidth  = (long)(ceil((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+    outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+  }
+  else
+  {
+    outputWidth  = (long)(floor((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+    outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+  }
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((outputHeight - 1)*dH >= inputHeight + padH)
+      --outputHeight;
+    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
+      --outputWidth;
+  }
+
+  THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size");
+
+  if (input->nDimension == 3)
+    THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+  else
+    THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+  
+  input = THTensor_(newContiguous)(input);
+  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+  input_data = THTensor_(data)(input);
+  output_data = THTensor_(data)(output);
+  
+  /* Parallelize over feature planes; each (k,p) pair writes a disjoint
+   * output region. */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane; k++)
+  {
+    long p;
+    for(p = 0; p < nbatch; p++)
+    {
+      long xx, yy;
+      /* For all output pixels... */
+      real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+      real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+      long i;
+      for(i = 0; i < outputWidth*outputHeight; i++)
+        ptr_output[i] = 0;
+      
+      for(yy = 0; yy < outputHeight; yy++)
+      {
+        for(xx = 0; xx < outputWidth; xx++)
+        {
+          /* Compute the mean of the input image... */
+          long hstart = yy * dH - padH;
+          long wstart = xx * dW - padW;
+          /* Window extent including padding (used for pool_size), ... */
+          long hend = fminf(hstart + kH, inputHeight + padH);
+          long wend = fminf(wstart + kW, inputWidth + padW);
+          int pool_size = (hend - hstart) * (wend - wstart);
+          /* ... then clip to the actual image for the summation. */
+          hstart = fmaxf(hstart, 0);
+          wstart = fmaxf(wstart, 0);
+          hend = fminf(hend, inputHeight);
+          wend = fminf(wend, inputWidth);
+
+          real sum = 0;
+
+          int divide_factor;
+          if(count_include_pad)
+            divide_factor = pool_size;
+          else
+            divide_factor = (hend - hstart) * (wend - wstart);
+
+          long kx, ky;
+
+          for(ky = hstart; ky < hend; ky++)
+          {
+            for(kx = wstart; kx < wend; kx++)
+              sum += ptr_input[ky*inputWidth + kx];
+          }
+          /* Update output */
+          *ptr_output++ += sum/divide_factor;
+        }
+      }
+    }
+  }
+  /* release the contiguous copy (or extra reference) of input */
+  THTensor_(free)(input);
+}
+
+/* Backward pass of spatial average pooling: spread each output gradient
+ * uniformly over its (clipped) input window, divided by the same factor
+ * used in the forward pass (full padded window area if count_include_pad,
+ * else the in-image area).  gradInput is resized like input and written
+ * densely; gradOutput is made contiguous locally.
+ * Fix vs. original: removed the dead local `input_data`, which was taken
+ * from the raw (possibly non-contiguous) input and never read. */
+void THNN_(SpatialAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          bool ceil_mode,
+          bool count_include_pad)
+{
+  int dimw = 2;
+  int dimh = 1;
+  int dimc = 0;
+  long nbatch = 1;
+
+  long inputWidth;
+  long inputHeight;
+  long outputWidth;
+  long outputHeight;
+  long nInputPlane; // number of channels (or colors)
+
+  real *gradOutput_data;
+  real *gradInput_data;
+
+  long k;
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimc++;
+  }
+
+  inputWidth = input->size[dimw];
+  inputHeight = input->size[dimh];
+  nInputPlane = input->size[dimc];
+
+  /* Recompute the output size exactly as in updateOutput. */
+  if(ceil_mode)
+  {
+    outputWidth  = (long)(ceil((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+    outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+  }
+  else
+  {
+    outputWidth  = (long)(floor((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+    outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+  }
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((outputHeight - 1)*dH >= inputHeight + padH)
+      --outputHeight;
+    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
+      --outputWidth;
+  }
+
+  THTensor_(resizeAs)(gradInput, input);
+
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
+
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  /* Each (plane, batch) pair owns a disjoint gradInput slab. */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane; k++)
+  {
+    long p;
+    for(p = 0; p < nbatch; p++)
+    {
+      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+      long xx, yy;
+
+      real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+
+      /* Windows overlap when dW < kW / dH < kH, so zero first and
+       * accumulate below. */
+      long i;
+      for(i=0; i<inputWidth*inputHeight; i++)
+        ptr_gradInput[i] = 0.0;
+
+      for(yy = 0; yy < outputHeight; yy++)
+      {
+        for(xx = 0; xx < outputWidth; xx++)
+        {
+          long hstart = yy * dH - padH;
+          long wstart = xx * dW - padW;
+          /* Padded extent (for pool_size), then clip to the image. */
+          long hend = fminf(hstart + kH, inputHeight + padH);
+          long wend = fminf(wstart + kW, inputWidth + padW);
+          int pool_size = (hend - hstart) * (wend - wstart);
+          hstart = fmaxf(hstart, 0);
+          wstart = fmaxf(wstart, 0);
+          hend = fminf(hend, inputHeight);
+          wend = fminf(wend, inputWidth);
+
+          real z = *ptr_gradOutput++;
+
+          /* Must match the forward pass's divisor exactly. */
+          int divide_factor;
+          if(count_include_pad)
+            divide_factor = pool_size;
+          else
+            divide_factor = (hend - hstart) * (wend - wstart);
+
+          long kx, ky;
+          for(ky = hstart ; ky < hend; ky++)
+          {
+            for(kx = wstart; kx < wend; kx++)
+              ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
+          }
+        }
+      }
+    }
+  }
+
+  /* release the local contiguous copies / references */
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialClassNLLCriterion.c b/lib/THNN/generic/SpatialClassNLLCriterion.c
new file mode 100644
index 0000000..3121c30
--- /dev/null
+++ b/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -0,0 +1,124 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
+#else
+
+/* Shared argument validation for both SpatialClassNLLCriterion entry points:
+ * requires a 4D input (batch x classes x H x W) and a 3D target
+ * (batch x H x W), and asserts that batch and spatial dims agree.
+ * (Comments cannot be placed inside the macro body without breaking the
+ * backslash continuations, hence this block comment above it.) */
+#define INITIAL_CHECK                                                            \
+  THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3,                         \
+              "only batches of spatial targets supported (3D tensors)");         \
+  THArgCheck(THTensor_(nDimension)(input) == 4, 2,                               \
+              "only batches of spatial inputs supported (4D tensors)");          \
+                                                                                 \
+  {                                                                              \
+    long input0 = THTensor_(size)(input, 0);                                     \
+    long input1 = THTensor_(size)(input, 1);                                     \
+    long input2 = THTensor_(size)(input, 2);                                     \
+    long input3 = THTensor_(size)(input, 3);                                     \
+    long target0 = THIndexTensor_(size)(target, 0);                              \
+    long target1 = THIndexTensor_(size)(target, 1);                              \
+    long target2 = THIndexTensor_(size)(target, 2);                              \
+    THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2,     \
+              "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
+              input0, input1, input2, input3, target0, target1, target2);        \
+  }
+
+/* Spatial negative-log-likelihood loss (forward).
+ * input:  batch x n_classes x H x W log-probabilities.
+ * target: batch x H x W of 1-based class indices.
+ * weights: optional per-class weights (may be NULL).
+ * Writes the (optionally size-averaged) loss into output[0] and the sum of
+ * the applied class weights into total_weight[0] — both are read as raw
+ * single-element buffers, so callers must size them accordingly. */
+void THNN_(SpatialClassNLLCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight)
+{
+  INITIAL_CHECK;
+
+  /* Work on contiguous copies so flat index arithmetic below is valid. */
+  input = THTensor_(newContiguous)(input);
+  target = THIndexTensor_(newContiguous)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+  real *input_data = THTensor_(data)(input);
+  THIndex_t *target_data = THIndexTensor_(data)(target);
+  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+  real *output_data = THTensor_(data)(output);
+  real *total_weight_data = THTensor_(data)(total_weight);
+
+  long batch_size = THTensor_(size)(input, 0);
+  long n_classes = THTensor_(size)(input, 1);
+  long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+  long sample_size = map_size * n_classes;
+
+  real total_weight_acc = 0;
+  real output_acc = 0;
+  for (int b = 0; b < batch_size; b++) {
+    for (int elem = 0; elem < map_size; elem++) {
+      /* Targets are stored 1-based; convert to a 0-based class index. */
+      int cur_target = target_data[b * map_size + elem] - 1;
+      THAssert(cur_target >= 0 && cur_target < n_classes);
+
+      real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+      total_weight_acc += cur_weight;
+      output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight;
+    }
+  }
+  *total_weight_data = total_weight_acc;
+  *output_data = output_acc;
+
+  /* Average only when there is a nonzero weight total to divide by. */
+  if (sizeAverage && *total_weight_data)
+    *output_data /= *total_weight_data;
+
+  THTensor_(free)(input);
+  THIndexTensor_(free)(target);
+  if (weights)
+    THTensor_(free)(weights);
+}
+
+/* Spatial NLL loss (backward).  For each pixel, sets the gradient of the
+ * target class to -weight (divided by total_weight when size-averaging);
+ * all other entries of gradInput are left untouched, so the caller is
+ * expected to pass a zeroed, correctly-sized, contiguous gradInput.
+ * Returns early (no-op) when the recorded total_weight is <= 0. */
+void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THIndexTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          THTensor *weights,
+          THTensor *total_weight)
+{
+  INITIAL_CHECK;
+  THArgCheck(THTensor_(isContiguous)(gradInput), 4,
+              "gradInput must be contiguous");
+
+  real *total_weight_data = THTensor_(data)(total_weight);
+  if (*total_weight_data <= 0)
+    return;
+
+  target = THIndexTensor_(newContiguous)(target);
+  weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+  THIndex_t *target_data = THIndexTensor_(data)(target);
+  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+  real *gradInput_data = THTensor_(data)(gradInput);
+
+  /* input is only consulted for its sizes here, never dereferenced. */
+  long batch_size = THTensor_(size)(input, 0);
+  long n_classes = THTensor_(size)(input, 1);
+  long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+  long sample_size = map_size * n_classes;
+
+  real normalize = sizeAverage ? *total_weight_data : 1.0f;
+
+  int b,elem;
+  /* Each (b, elem) writes a distinct gradInput slot: safe to parallelize. */
+#pragma omp parallel for
+  for (b = 0; b < batch_size; b++) {
+    for (elem = 0; elem < map_size; elem++) {
+      /* 1-based target -> 0-based class index. */
+      int cur_target = target_data[b * map_size + elem] - 1;
+      THAssert(cur_target >= 0 && cur_target < n_classes);
+
+      gradInput_data[b * sample_size + cur_target * map_size + elem] =
+        -(weights ? weights_data[cur_target] : 1.0f) / normalize;
+    }
+  }
+
+  THIndexTensor_(free)(target);
+  if (weights)
+    THTensor_(free)(weights);
+}
+
+#undef INITIAL_CHECK
+
+#endif
diff --git a/lib/THNN/generic/SpatialConvolutionLocal.c b/lib/THNN/generic/SpatialConvolutionLocal.c
new file mode 100644
index 0000000..091c6f0
--- /dev/null
+++ b/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -0,0 +1,241 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
+#else
+
+
+/* Forward pass for one sample of a locally-connected layer (a convolution
+ * whose weights are NOT shared across spatial positions).
+ * Unfolds input into finput (im2col), seeds output with bias, then performs
+ * one batched matmul per output location:
+ *   weight:   oH*oW x nOutputPlane x nInputPlane*kH*kW
+ *   finput3d: oH*oW x nInputPlane*kH*kW x 1
+ *   output3d: oH*oW x nOutputPlane x 1   (accumulated on top of bias)
+ * Fix vs. original: removed the unused local `long i;`. */
+static void THNN_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
+                                                         int kW, int kH, int dW, int dH, int padW, int padH,
+                                                         long nInputPlane, long inputWidth, long inputHeight,
+                                                         long nOutputPlane, long outputWidth, long outputHeight)
+{
+  THTensor *output3d, *finput3d;
+
+  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+
+  /* Seed the output with the per-position bias before accumulating. */
+  THTensor_(copy)(output, bias);
+
+  /* Zero-copy 3D views over the existing storages; strides are chosen so
+   * dim 0 walks the oH*oW output locations. */
+  output3d = THTensor_(newWithStorage3d)(output->storage, output->storageOffset,
+                                         outputHeight*outputWidth, 1,
+                                         nOutputPlane, outputHeight*outputWidth,
+                                         1, nOutputPlane*outputHeight*outputWidth);
+
+  finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
+                                         outputHeight*outputWidth, 1,
+                                         kW*kH*nInputPlane, outputHeight*outputWidth,
+                                         1, kW*kH*nInputPlane*outputHeight*outputWidth);
+  // weight:    oH*oW x nOutputPlane x nInputPlane*kH*kW
+  // finput3d:  oH*oW x nInputPlane*kH*kW x 1
+  THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
+  // output3d:  oH*oW x nOutputPlane x 1
+
+  /* Free only the view wrappers; the underlying storages live on. */
+  THTensor_(free)(output3d);
+  THTensor_(free)(finput3d);
+}
+
+/* Forward pass of SpatialConvolutionLocal for 3D (single sample) or 4D
+ * (batched) input.  weight is oH*oW x nOutputPlane x nInputPlane*kH*kW, so
+ * the plane counts are recovered from its sizes.  finput (the im2col
+ * buffer) and output are resized here; fgradInput is unused in this pass.
+ * Input/output spatial sizes are passed in rather than derived. */
+void THNN_(SpatialConvolutionLocal_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *weight,
+    THTensor *bias,
+    THTensor *finput,
+    THTensor *fgradInput,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    long inputWidth, long inputHeight,
+    long outputWidth, long outputHeight)
+{
+  /* weight dim 2 is nInputPlane*kH*kW; divide the kernel area back out. */
+  long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
+  long nOutputPlane = THTensor_(size)(weight,1);
+
+  if(input->nDimension == 3)
+  {
+    THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+    THNN_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput,
+                                                 kW, kH, dW, dH, padW, padH,
+                                                 nInputPlane, inputWidth, inputHeight,
+                                                 nOutputPlane, outputWidth, outputHeight);
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+    THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+    /* Samples are independent; each iteration selects its own slices. */
+#pragma omp parallel for private(t)
+    for(t = 0; t < T; t++)
+    {
+      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+      THNN_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
+                                                   kW, kH, dW, dH, padW, padH,
+                                                   nInputPlane, inputWidth, inputHeight,
+                                                   nOutputPlane, outputWidth, outputHeight);
+
+      /* Free the per-sample view wrappers. */
+      THTensor_(free)(input_t);
+      THTensor_(free)(output_t);
+      THTensor_(free)(finput_t);
+    }
+  }
+}
+
+
+/* Backward-input pass for one sample of the locally-connected layer.
+ * Expects weight already transposed by the caller to
+ * oH*oW x nInputPlane*kH*kW x nOutputPlane.  Computes the unfolded
+ * gradient (fgradInput = weight * gradOutput per location, beta=0 so no
+ * pre-zeroing of fgradInput is needed), then folds it back into gradInput
+ * via unfolded_acc after zeroing gradInput. */
+static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
+                                                            int kW, int kH, int dW, int dH, int padW, int padH, 
+                                                            long nInputPlane, long inputWidth, long inputHeight,
+                                                            long nOutputPlane, long outputWidth, long outputHeight)
+{
+  THTensor *gradOutput3d, *fgradInput3d;
+  /* Zero-copy 3D views; dim 0 walks the oH*oW output locations. */
+  gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+                                             outputHeight*outputWidth, 1,
+                                             nOutputPlane, outputHeight*outputWidth,
+                                             1, nOutputPlane*outputHeight*outputWidth);
+  fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
+                                             outputHeight*outputWidth, 1,
+                                             kW*kH*nInputPlane, outputHeight*outputWidth,
+                                             1, kW*kH*nInputPlane*outputHeight*outputWidth);
+  // weight:        oH*oW x nInputPlane*kH*kW x nOutputPlane
+  // gradOutput3d:  oH*oW x nOutputPlane x 1         
+  THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+  // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1  
+  
+  THTensor_(free)(gradOutput3d);
+  THTensor_(free)(fgradInput3d);
+  
+  THTensor_(zero)(gradInput);
+
+  /* col2im: accumulate overlapping unfolded columns back into gradInput. */
+  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, 
+                                            nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+}
+
+/* Backward-input pass of SpatialConvolutionLocal (3D or 4D input).
+ * Transposes weight dims 1<->2 IN PLACE for the matmuls and transposes it
+ * back before returning — note this temporarily mutates the shared weight
+ * tensor, so concurrent use of the same module would race.
+ * gradInput and fgradInput are resized here. */
+void THNN_(SpatialConvolutionLocal_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *finput,
+    THTensor *fgradInput,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    long inputWidth, long inputHeight,
+    long outputWidth, long outputHeight)
+{
+  /* weight dim 2 is nInputPlane*kH*kW; divide the kernel area back out. */
+  long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
+  long nOutputPlane = THTensor_(size)(weight,1);
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(resizeAs)(fgradInput, finput);
+  /* In-place transpose: frame helper expects
+   * oH*oW x nInputPlane*kH*kW x nOutputPlane. */
+  THTensor_(transpose)(weight, weight, 1, 2);
+
+  if(input->nDimension == 3)
+  {
+    THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH, 
+                                                       nInputPlane, inputWidth, inputHeight,
+                                                       nOutputPlane, outputWidth, outputHeight);
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    /* Samples are independent; weight is only read inside the loop. */
+#pragma omp parallel for private(t)
+    for(t = 0; t < T; t++)
+    {
+      THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+      THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+      THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH, 
+                                                         nInputPlane, inputWidth, inputHeight,
+                                                         nOutputPlane, outputWidth, outputHeight);
+
+      /* Free the per-sample view wrappers. */
+      THTensor_(free)(gradInput_t);
+      THTensor_(free)(gradOutput_t);
+      THTensor_(free)(fgradInput_t);
+    }
+  }
+
+  /* Restore weight to its original dim order. */
+  THTensor_(transpose)(weight, weight, 1, 2);
+}
+
+/* Parameter-gradient pass for one sample of the locally-connected layer:
+ *   gradWeight += scale * gradOutput3d x finput3d   (per output location)
+ *   gradBias   += scale * gradOutput
+ * finput is viewed with a transposed stride layout (oH*oW x 1 x kW*kH*nIn)
+ * so the batched matmul produces per-location outer products. */
+static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale, 
+                                                            int kW, int kH, int dW, int dH, int padW, int padH, 
+                                                            long nInputPlane, long inputWidth, long inputHeight,
+                                                            long nOutputPlane, long outputWidth, long outputHeight)
+{
+   
+  THTensor *gradOutput3d, *finput3d;
+  /* Zero-copy 3D views over the existing storages. */
+  gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+                                             outputHeight*outputWidth, 1,
+                                             nOutputPlane, outputHeight*outputWidth,
+                                             1, nOutputPlane*outputHeight*outputWidth);
+  finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
+                                         outputHeight*outputWidth, 1,
+                                         1, kW*kH*nInputPlane*outputHeight*outputWidth,
+                                         kW*kH*nInputPlane, outputHeight*outputWidth);
+  // gradOutput3d:  oH*oW x nOutputPlane x 1  
+  // finput3d:      oH*oW x 1 x kW*kH*nInputPlane
+  THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+  // gradWeight:    oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+  THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
+  /* Free only the view wrappers. */
+  THTensor_(free)(gradOutput3d);
+  THTensor_(free)(finput3d);
+}
+
+/* Parameter-gradient pass of SpatialConvolutionLocal (3D or 4D input).
+ * Accumulates scale-weighted gradients into gradWeight/gradBias, reusing
+ * the finput buffer produced by updateOutput.  The batch loop is serial —
+ * presumably because every sample accumulates into the same shared
+ * gradWeight/gradBias (unlike the parallel loops in the other passes). */
+void THNN_(SpatialConvolutionLocal_accGradParameters)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradWeight,
+    THTensor *gradBias,
+    THTensor *finput,
+    THTensor *fgradInput,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    long inputWidth, long inputHeight,
+    long outputWidth, long outputHeight,
+    real scale)
+{
+  /* gradWeight dim 2 is nInputPlane*kH*kW; divide the kernel area out. */
+  long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
+  long nOutputPlane = THTensor_(size)(gradWeight,1);
+
+  if(input->nDimension == 3)
+  {
+    THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH,
+                                                         nInputPlane, inputWidth, inputHeight,
+                                                         nOutputPlane, outputWidth, outputHeight);
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    for(t = 0; t < T; t++)
+    {
+      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+      THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH,
+                                                           nInputPlane, inputWidth, inputHeight,
+                                                           nOutputPlane, outputWidth, outputHeight);
+
+      /* Free the per-sample view wrappers. */
+      THTensor_(free)(gradOutput_t);
+      THTensor_(free)(finput_t);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
new file mode 100644
index 0000000..a549a37
--- /dev/null
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -0,0 +1,280 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
+#else
+
+/* Forward convolution for one frame (no batch dimension).
+   Steps: (1) unfold input patches into finput (im2col layout,
+   kW*kH*nInputPlane rows by outputHeight*outputWidth columns);
+   (2) seed output with the per-plane bias, or zero it when bias is NULL;
+   (3) compute output2d += weight * finput as a single addmm, where
+   output2d is a 2D view sharing output's storage, so the product lands
+   directly in output. */
+static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          long nInputPlane,
+          long inputWidth,
+          long inputHeight,
+          long nOutputPlane,
+          long outputWidth,
+          long outputHeight)
+{
+  long i;
+  THTensor *output2d;
+
+  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+
+  /* -1 strides let newWithStorage2d infer contiguous row-major strides. */
+  output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+                                         nOutputPlane, -1,
+                                         outputHeight*outputWidth, -1);
+  if (bias) {
+    /* Broadcast bias[i] across the i-th output plane.  The raw-pointer
+       fill assumes plane i begins at stride[0]*i and is contiguous --
+       holds for the freshly resized output the callers pass in;
+       NOTE(review): confirm for any other caller. */
+    for(i = 0; i < nOutputPlane; i++)
+        THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+  } else {
+    THTensor_(zero)(output);
+  }
+
+  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+  /* Frees only the 2D view; the underlying storage belongs to output. */
+  THTensor_(free)(output2d);
+}
+
+/* Forward pass for SpatialConvolutionMM (im2col + GEMM convolution).
+   Accepts a 3D (C,H,W) frame or a 4D (N,C,H,W) batch; resizes `finput`
+   (the unfolded-input scratch buffer) and `output` accordingly and
+   dispatches to the single-frame kernel.  The batch loop is OMP-parallel:
+   each frame writes its own disjoint output/finput slice.
+   weight is (nOutputPlane, nInputPlane*kH*kW); bias may be NULL;
+   fgradInput is unused here but kept for API symmetry with backward. */
+void THNN_(SpatialConvolutionMM_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  /* Dimension indices for the 3D case; bumped by one in batch mode. */
+  int dimf = 0;
+  int dimw = 2;
+  int dimh = 1;
+
+  long nInputPlane;
+  long inputWidth;
+  long inputHeight;
+  long nOutputPlane;
+  long outputWidth;
+  long outputHeight;
+
+  THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  if (input->nDimension == 4) {
+    dimf++;
+    dimw++;
+    dimh++;
+  }
+
+  nInputPlane = input->size[dimf];
+  inputWidth   = input->size[dimw];
+  inputHeight  = input->size[dimh];
+  nOutputPlane = weight->size[0];
+  outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+  /* BUGFIX: all of these THError arguments are `long`; the previous %d
+     specifiers with long arguments are undefined behavior on LP64. */
+  if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
+        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+  if (nInputPlane*kW*kH != weight->size[1])
+    THError("Wrong number of input channels! Input has %ld channels, expected %ld",nInputPlane,weight->size[1]/(kW*kH));
+
+  if(input->nDimension == 3)
+  {
+    THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+    THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput,
+                                                 kW, kH, dW, dH, padW, padH,
+                                                 nInputPlane, inputWidth, inputHeight,
+                                                 nOutputPlane, outputWidth, outputHeight);
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+    THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+    /* Safe to parallelize: frame t touches only slice t of output/finput;
+       input and weight are read-only here. */
+#pragma omp parallel for private(t)
+    for(t = 0; t < T; t++)
+    {
+      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+      THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
+                                                   kW, kH, dW, dH, padW, padH,
+                                                   nInputPlane, inputWidth, inputHeight,
+                                                   nOutputPlane, outputWidth, outputHeight);
+
+      THTensor_(free)(input_t);
+      THTensor_(free)(output_t);
+      THTensor_(free)(finput_t);
+    }
+  }
+}
+
+/* Backward-to-input for one frame.  Expects `weight` to already be
+   transposed on dims (0,1) by the caller, so the addmm computes
+   fgradInput = weight^T * gradOutput2d (the unfolded-input gradient),
+   which unfolded_acc then scatter-adds back into gradInput (col2im). */
+static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
+          THTensor *gradInput,
+          THTensor *gradOutput,
+          THTensor *weight,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  /* 2D view of gradOutput: nOutputPlane x (outH*outW), sharing storage. */
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
+                                                       gradOutput->size[0], -1,
+                                                       gradOutput->size[1]*gradOutput->size[2], -1);
+  /* beta = 0: overwrite fgradInput rather than accumulate into it. */
+  THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+  THTensor_(free)(gradOutput2d);
+
+  THTensor_(zero)(gradInput);
+
+  /* For a CHW frame, size[1] is height and size[2] is width, matching
+     unfolded_acc's (nInputPlane, inputWidth, inputHeight, ...) order. */
+  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]);
+}
+
+/* Backward-to-input pass for SpatialConvolutionMM.  Resizes gradInput /
+   fgradInput, transposes `weight` IN PLACE for the duration of the
+   computation (restored at the end), and dispatches per frame, in
+   parallel over the batch.
+   NOTE(review): the in-place weight transpose means concurrent calls
+   sharing the same weight tensor would race -- presumably callers never
+   do that; confirm before reusing this from multiple threads. */
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  long nOutputPlane = weight->size[0];
+
+  THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
+  THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(resizeAs)(fgradInput, finput);
+  /* Transpose once here so the frame kernel can multiply directly. */
+  THTensor_(transpose)(weight, weight, 0, 1);
+
+  if(input->nDimension == 3)
+  {
+    THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH);
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    /* Each frame writes only its own gradInput/fgradInput slice; the
+       transposed weight is read-only inside the loop. */
+#pragma omp parallel for private(t)
+    for(t = 0; t < T; t++)
+    {
+      THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+      THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+      THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH);
+
+      THTensor_(free)(gradInput_t);
+      THTensor_(free)(gradOutput_t);
+      THTensor_(free)(fgradInput_t);
+    }
+  }
+
+  /* Restore the caller's weight layout. */
+  THTensor_(transpose)(weight, weight, 0, 1);
+}
+
+/* Parameter-gradient accumulation for one frame:
+   gradWeight += scale * gradOutput2d * finput^T (finput is transposed in
+   place around the addmm and restored), and gradBias[i] += scale * sum of
+   gradOutput plane i, computed as a raw-pointer row reduction. */
+static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          real scale)
+{
+  long i;
+  /* 2D view over gradOutput's storage: nOutputPlane x (outH*outW). */
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
+                                                       gradOutput->size[0], -1,
+                                                       gradOutput->size[1]*gradOutput->size[2], -1);
+
+  THTensor_(transpose)(finput, finput, 0, 1);
+  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+  THTensor_(transpose)(finput, finput, 0, 1);
+
+  if (gradBias) {
+    for(i = 0; i < gradBias->size[0]; i++)
+    {
+      long k;
+      real sum = 0;
+      /* Walking data[k] with unit step assumes the view's inner stride is
+         1, which holds for the contiguous view created above. */
+      real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+      for(k = 0; k < gradOutput2d->size[1]; k++)
+        sum += data[k];
+      (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+    }
+  }
+
+  THTensor_(free)(gradOutput2d);
+}
+
+/* Accumulates scale * dLoss/dWeight and dLoss/dBias for
+   SpatialConvolutionMM.  A 4D input is handled by a serial loop over the
+   batch dimension -- serial on purpose, because every frame adds into the
+   shared gradWeight/gradBias tensors. */
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          real scale)
+{
+  long nOutputPlane = gradWeight->size[0];
+  THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  if(input->nDimension != 3)
+  {
+    long batchSize = input->size[0];
+    long frame;
+
+    for(frame = 0; frame < batchSize; frame++)
+    {
+      THTensor *gradOutputFrame = THTensor_(newSelect)(gradOutput, 0, frame);
+      THTensor *finputFrame = THTensor_(newSelect)(finput, 0, frame);
+
+      THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutputFrame, gradWeight, gradBias, finputFrame, scale);
+
+      THTensor_(free)(finputFrame);
+      THTensor_(free)(gradOutputFrame);
+    }
+  }
+  else
+  {
+    THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialConvolutionMap.c b/lib/THNN/generic/SpatialConvolutionMap.c
new file mode 100644
index 0000000..aef0b1e
--- /dev/null
+++ b/lib/THNN/generic/SpatialConvolutionMap.c
@@ -0,0 +1,259 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
+#else
+
+/* Forward pass for a map-connected convolution: connTable is an
+   nkernel x 2 table of 1-based {inputPlane, outputPlane} pairs, and
+   kernel k cross-correlates its input plane into its output plane.
+   No padding is supported: output = (input - kernel) / stride + 1. */
+void THNN_(SpatialConvolutionMap_updateOutput)(
+  THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH)
+{
+  THArgCheck(
+    weight != NULL && weight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  real *weight_data = THTensor_(data)(weight);
+  real *bias_data = THTensor_(data)(bias);
+  real *connTable_data = THTensor_(data)(connTable);
+
+  int dimw = 2;
+  int dimh = 1;
+  int dimc = 0;
+  long nbatch = 1;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
+
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimc++;
+    dimw++;
+    dimh++;
+  }
+
+  const long kH       = weight->size[1];
+  const long kW       = weight->size[2];
+
+  THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
+  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");
+
+  const long input_w  = input->size[dimw];
+  const long input_h  = input->size[dimh];
+  const long output_w = (input_w - kW) / dW + 1;
+  const long output_h = (input_h - kH) / dH + 1;
+
+  if (input->nDimension == 3)
+    THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
+  else
+    THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);
+
+  /* contiguous */
+  /* NOTE(review): if `output` were non-contiguous, the results below are
+     written into the contiguous copy and then freed without being copied
+     back; presumably the freshly resized output is always contiguous
+     here -- confirm against upstream (later versions use freeCopyTo). */
+  input = THTensor_(newContiguous)(input);
+  output = THTensor_(newContiguous)(output);
+
+  /* get raw pointers */
+  real *input_data = THTensor_(data)(input);
+  real *output_data = THTensor_(data)(output);
+
+  /* Parallel over output planes: thread p writes only plane p (in every
+     batch element), so there are no write races; each thread rescans the
+     whole connection table looking for entries targeting its plane. */
+  long p;
+#pragma omp parallel for private(p)
+  for (p = 0; p < nOutputPlane; p++)
+  {
+    long m;
+    for (m = 0; m < nbatch; m++)
+    {
+      /* add bias */
+      real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
+      long j, k;
+      real z= bias_data[p];
+      for (j = 0; j < output_h*output_w; j++)
+        ptr_output[j] = z;
+
+      /* convolve all maps */
+      int nweight = connTable->size[0];
+      for (k = 0; k < nweight; k++)
+      {
+        /* get offsets for input/output */
+        /* connTable entries are 1-based (Lua convention); convert here. */
+        int o = (int)connTable_data[k*2+1]-1;
+        int i = (int)connTable_data[k*2+0]-1;
+
+        if (o == p)
+        {
+          THTensor_(validXCorr2Dptr)(
+            output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
+            1.0,
+            input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+            weight_data + k*kW*kH,
+            kH, kW,
+            dH, dW
+          );
+        }
+      }
+    }
+  }
+
+  /* clean up */
+  THTensor_(free)(input);
+  THTensor_(free)(output);
+}
+
+/* Backward-to-input pass for the map-connected convolution: for every
+   connection {i -> o} in connTable, full-convolves gradOutput plane o
+   with kernel k and accumulates into gradInput plane i. */
+void THNN_(SpatialConvolutionMap_updateGradInput)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH)
+{
+  THArgCheck(
+    weight != NULL && weight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  real *weight_data = THTensor_(data)(weight);
+  real *connTable_data = THTensor_(data)(connTable);
+
+  /* and dims */
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  const long input_h  = input->size[dimh];
+  const long input_w  = input->size[dimw];
+  const long output_h = gradOutput->size[dimh];
+  const long output_w = gradOutput->size[dimw];
+  const long kH       = weight->size[1];
+  const long kW       = weight->size[2];
+
+  /* contiguous */
+  /* NOTE(review): if the caller's gradInput were non-contiguous, the
+     resize/zero/accumulate below all happen on the contiguous copy,
+     which is freed at the end -- the caller's tensor would never receive
+     the gradients.  Presumably gradInput is always contiguous by the
+     time it gets here; verify against upstream. */
+  gradInput = THTensor_(newContiguous)(gradInput);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* Resize/Zero */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* get raw pointers */
+  real *gradInput_data = THTensor_(data)(gradInput);
+  real *gradOutput_data = THTensor_(data)(gradOutput);
+
+  /* Parallel over input planes: thread p writes only gradInput plane p,
+     so no write races across threads. */
+  long p;
+#pragma omp parallel for private(p)
+  for (p = 0; p < nInputPlane; p++)
+  {
+    long m;
+    for (m = 0; m < nbatch; m++)
+    {
+      long k;
+      /* backward all */
+      int nkernel = connTable->size[0];
+      for (k = 0; k < nkernel; k++)
+      {
+        /* 1-based table entries converted to 0-based plane indices. */
+        int o = (int)connTable_data[k*2+1]-1;
+        int i = (int)connTable_data[k*2+0]-1;
+        if (i == p)
+        {
+          /* gradient to input */
+          THTensor_(fullConv2Dptr)(
+            gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0,
+            gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,  output_h,  output_w,
+            weight_data + k*kW*kH, kH, kW, dH, dW
+          );
+        }
+      }
+    }
+  }
+
+  /* clean up */
+  THTensor_(free)(gradInput);
+  THTensor_(free)(gradOutput);
+}
+
+/* Parameter-gradient accumulation for the map-connected convolution.
+   Bias: gradBias[k] += scale * sum of gradOutput plane k.
+   Weights: each connection {i -> o} accumulates the reverse valid
+   cross-correlation of input plane i with gradOutput plane o into its
+   own kW*kH kernel slice.
+   NOTE(review): unlike the MM variant, gradBias is dereferenced
+   unconditionally -- bias appears to be mandatory for Map modules. */
+void THNN_(SpatialConvolutionMap_accGradParameters)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH, real scale)
+{
+  THArgCheck(
+    gradWeight != NULL && gradWeight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+    "3D gradWeight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  real *gradWeight_data = THTensor_(data)(gradWeight);
+  real *gradBias_data = THTensor_(data)(gradBias);
+
+  /* and dims */
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  const long input_h  = input->size[dimh];
+  const long input_w  = input->size[dimw];
+  const long output_h = gradOutput->size[dimh];
+  const long output_w = gradOutput->size[dimw];
+  const long kH       = gradWeight->size[1];
+  const long kW       = gradWeight->size[2];
+
+  /* contiguous */
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* get raw pointers */
+  real *input_data = THTensor_(data)(input);
+  real *gradOutput_data = THTensor_(data)(gradOutput);
+
+  long k;
+  /* gradients wrt bias */
+  /* Each thread owns gradBias[k], so the parallel loop is race-free. */
+#pragma omp parallel for private(k)
+  for (k = 0; k < nOutputPlane; k++)
+  {
+    long m;
+    for (m = 0; m < nbatch; m++)
+    {
+      real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
+      long l;
+      for (l = 0; l < output_h*output_w; l++)
+        gradBias_data[k] += scale*ptr_gradOutput[l];
+    }
+  }
+
+  /* gradients wrt weight */
+  /* Each connection k writes only its own kW*kH slice of gradWeight;
+     input/gradOutput are read-only, so this is race-free as well. */
+  const int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+  for (k = 0; k < nkernel; k++)
+  {
+    long m;
+    for (m = 0; m < nbatch; m++)
+    {
+      /* 1-based {input, output} plane pair for connection k. */
+      int o = (int)THTensor_(get2d)(connTable,k,1)-1;
+      int i = (int)THTensor_(get2d)(connTable,k,0)-1;
+
+      /* gradient to kernel */
+      THTensor_(validXCorr2DRevptr)(
+        gradWeight_data + k*kW*kH,
+        scale,
+        input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+        gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
+        dH, dW
+      );
+    }
+  }
+
+  /* clean up */
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
new file mode 100644
index 0000000..3f75016
--- /dev/null
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -0,0 +1,337 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
+#else
+
+/* Forward pass for dilated spatial convolution via im2col + GEMM.
+   A 3D input is force-reshaped IN PLACE to a batch of one and restored
+   at the end.  `columns` is the im2col scratch buffer; `ones` is a
+   shared all-ones buffer used to broadcast the bias with a GEMM.
+   NOTE(review): the THError below prints long values with %d; this
+   matches the MM variant's original bug -- should be %ld on LP64. */
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *weight,
+    THTensor *bias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH)
+{
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params:
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  // Effective kernel extent is dilation*(k-1)+1.
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix mulitply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // Rank-1 product ones * bias^T writes bias[p] across output plane p.
+    if (bias) {
+      THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        0,
+        THTensor_(data)(output_n), n_
+      );
+    } else {
+      THTensor_(zero)(output_n);
+    }
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kH*kW;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // output_n += weight * columns (beta = 1 keeps the bias seeded above).
+    THBlas_(gemm)(
+      'n', 'n',
+      n, m, k,
+      1,
+      THTensor_(data)(columns), n,
+      THTensor_(data)(weight), k,
+      1,
+      THTensor_(data)(output_n), n
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  // Undo the forced batch dimension for the 3D-input case.
+  if (batch == 0) {
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+/* Backward-to-input pass for dilated convolution:
+   gradColumns = weight^T * gradOutput_n (GEMM with transposed weight),
+   then col2im scatters the columns back into gradInput.
+   Both input and gradOutput are force-reshaped IN PLACE to 4D for a 3D
+   input and restored at the end. */
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *gradColumns,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH)
+{
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+
+  // Params
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix mulitply per sample:
+    THTensor_(select)(gradInput_n, gradInput, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    long m = nInputPlane*kW*kH;
+    long n = gradColumns->size[1];
+    long k = nOutputPlane;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // beta = 0: gradColumns is overwritten each iteration.
+    THBlas_(gemm)(
+        'n', 't',
+        n, m, k,
+        1,
+        THTensor_(data)(gradOutput_n), n,
+        THTensor_(data)(weight), m,
+        0,
+        THTensor_(data)(gradColumns), n
+    );
+
+    // Unpack columns back into input:
+    THNN_(col2im)(
+      THTensor_(data)(gradColumns),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THTensor_(data)(gradInput_n)
+    );
+  }
+
+  // Free
+  THTensor_(free)(gradInput_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize output
+  // Undo the forced batch dimension for the 3D-input case.
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+    THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+
+/* Parameter-gradient accumulation for dilated convolution.  Per batch
+   element: columns = im2col(input_n);
+   gradWeight += scale * gradOutput_n * columns^T (GEMM);
+   gradBias += scale * gradOutput_n * ones (GEMV row sums).
+   The batch loop is serial because all elements accumulate into the
+   shared gradWeight/gradBias.  input/gradOutput are force-reshaped IN
+   PLACE to 4D for 3D input and restored at the end. */
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradWeight,
+    THTensor *gradBias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH,
+    real scale)
+{
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params
+  int nInputPlane = gradWeight->size[1];
+  int nOutputPlane = gradWeight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix mulitply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = nInputPlane*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // beta = 1: accumulate into gradWeight across batch elements.
+    THBlas_(gemm)(
+        't', 'n',
+        n, m, k,
+        scale,
+        THTensor_(data)(columns), k,
+        THTensor_(data)(gradOutput_n), k,
+        1,
+        THTensor_(data)(gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    // gradBias[p] += scale * sum over gradOutput plane p (via the ones vector).
+    if (gradBias) {
+      THBlas_(gemv)(
+          't',
+          k_, m_,
+          scale,
+          THTensor_(data)(gradOutput_n), k_,
+          THTensor_(data)(ones), 1,
+          1,
+          THTensor_(data)(gradBias), 1
+      );
+    }
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  // Undo the forced batch dimension for the 3D-input case.
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+}
diff --git a/lib/THNN/generic/SpatialFractionalMaxPooling.c b/lib/THNN/generic/SpatialFractionalMaxPooling.c
new file mode 100644
index 0000000..1c2b6ab
--- /dev/null
+++ b/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -0,0 +1,251 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
+#else
+
+/* Generate the sequence of pooling-window start offsets for one spatial
+   dimension of fractional max pooling.  `sample` is a random value used to
+   randomize interval placement.  Returns a THAlloc'd array of `outputSize`
+   start indices; the caller owns it and must THFree it.
+   NOTE(review): when outputSize == 1 the divisor (outputSize - 1) is zero,
+   so alpha is inf/NaN; it is harmless here because the loop body never runs
+   in that case and only the pinned last element is written -- confirm. */
+static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+  real sample,
+  long inputSize,
+  long outputSize,
+  int poolSize) {
+  real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+  long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
+
+  long i;
+  for (i = 0; i < outputSize - 1; ++i) {
+    sequence[i] =
+      (long) ((i + sample) * alpha) - (long) (sample * alpha);
+  }
+  /* pin the last window so it ends exactly at the input boundary */
+  sequence[outputSize - 1] = inputSize - poolSize;
+
+  return sequence;
+}
+
+/* Forward pass over one sample (all planes) of fractional max pooling.
+   Each plane draws its own W/H interval sequences from two random samples,
+   then takes the max over each poolSizeW x poolSizeH window.  `indices`
+   receives the 1-based (Lua convention) flat index of each max within its
+   plane.  Planes are processed in parallel; all writes are plane-local. */
+static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+  real* input,
+  real* output,
+  real* indices,
+  real* randomSamples,
+  long numPlanes,
+  long inputW, long inputH,
+  long outputW, long outputH,
+  int poolSizeW, int poolSizeH) {
+  long plane;
+#pragma omp parallel for private(plane)
+  for (plane = 0; plane < numPlanes; ++plane) {
+    /* each plane contains 2 random samples, one for W and one for H */
+    real* randomSamplesForPlane = randomSamples + plane * 2;
+
+    /* Generate interval sequence */
+    long* sequenceW =
+      THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+        randomSamplesForPlane[0], inputW, outputW, poolSizeW);
+    long* sequenceH =
+      THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+        randomSamplesForPlane[1], inputH, outputH, poolSizeH);
+
+    /* loop over output */
+    long h, w;
+
+    real* inputForPlane = input + plane * inputW * inputH;
+    real* outputForPlane = output + plane * outputW * outputH;
+    real* indicesForPlane = indices + plane * outputW * outputH;
+
+    for (h = 0; h < outputH; ++h) {
+      long inputHStart = sequenceH[h];
+
+      for (w = 0; w < outputW; ++w) {
+        long inputWStart = sequenceW[w];
+
+        real maxVal = -THInf;
+        long maxIndex = -1;
+
+        /* scan the pooling window for the maximum */
+        long h2, w2;
+        for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+          for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+            THAssert(h2 >= 0 && h2 < inputH);
+            THAssert(w2 >= 0 && w2 < inputW);
+
+            long planeIndex = h2 * inputW + w2;
+            real val = inputForPlane[planeIndex];
+            if (val > maxVal) {
+              maxVal = val;
+              maxIndex = planeIndex;
+            }
+          }
+        }
+
+        THAssert(maxVal != -THInf);
+        THAssert(maxIndex != -1);
+
+        outputForPlane[h * outputW + w] = maxVal;
+        /* +1 to lua index */
+        indicesForPlane[h * outputW + w] = (real) maxIndex + 1;
+      }
+    }
+
+    /* interval sequences are per-plane scratch; release them here */
+    THFree(sequenceW);
+    THFree(sequenceH);
+  }
+}
+
+/* Public forward entry point for fractional max pooling.  Accepts 3D
+   (C x H x W) or 4D (N x C x H x W) input; resizes `output` and `indices`
+   to match and dispatches per-sample frames (batch samples run under
+   OpenMP).  `randomSamples` supplies 2 values per plane (per batch element
+   in 4D mode); it is read via THTensor_(data) without newContiguous, so it
+   is presumably guaranteed contiguous by the Lua caller -- TODO confirm. */
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    int outputW, int outputH,
+    int poolSizeW, int poolSizeH,
+    THTensor *indices,
+    THTensor *randomSamples) {
+  
+  long numBatch = 1;
+  int planeDim = 0;
+  int heightDim = 1;
+  int widthDim = 2;
+
+  long numInputDims = THTensor_(nDimension)(input);
+  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+             "3D or 4D (batch mode) tensor expected");
+
+  /* 4D input: shift the spatial dims past the leading batch dim */
+  if (numInputDims == 4) {
+    numBatch = THTensor_(size)(input, 0);
+    planeDim++;
+    heightDim++;
+    widthDim++;
+  }
+
+  /* sizes */
+  long numPlanes = THTensor_(size)(input, planeDim);
+  long inputH = THTensor_(size)(input, heightDim);
+  long inputW = THTensor_(size)(input, widthDim);
+
+  /* every pooling window (even the pinned last one) must fit in the input */
+  THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
+             "poolSizeH too large relative to input height");
+  THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
+             "poolSizeW too large relative to input width");
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  if (numInputDims == 3) {
+    /* resize output */
+    THTensor_(resize3d)(output, numPlanes, outputH, outputW);
+    /* indices will contain the locations for each output point */
+    THTensor_(resize3d)(indices, numPlanes, outputH, outputW);
+
+    THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+      THTensor_(data)(input),
+      THTensor_(data)(output),
+      THTensor_(data)(indices),
+      THTensor_(data)(randomSamples),
+      numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+  } else {
+    THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
+    /* indices will contain the locations for each output point */
+    THTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
+
+    /* one frame per batch element, parallelized over the batch */
+    long batch;
+#pragma omp parallel for private(batch)
+    for (batch = 0; batch < numBatch; ++batch) {
+      THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+        THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
+        THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
+        THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+        THTensor_(data)(randomSamples) + batch * numPlanes * 2,
+        numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+/* Backward pass over one sample: scatter each gradOutput value back to the
+   input position recorded in `indices` (stored 1-based, hence the -1).
+   Assumes gradInput was zeroed by the caller; accumulation (+=) handles
+   the theoretical case of repeated indices within a plane. */
+static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+  real* gradInput,
+  real* gradOutput,
+  real* indices,
+  long numPlanes,
+  long inputW, long inputH,
+  long outputW, long outputH) {
+  long plane;
+#pragma omp parallel for private(plane)
+  for (plane = 0; plane < numPlanes; plane++) {
+    real* gradInputForPlane = gradInput + plane * inputW * inputH;
+    real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
+    real* indicesForPlane = indices + plane * outputW * outputH;
+
+    long h, w;
+    for (h = 0; h < outputH; ++h) {
+      for (w = 0; w < outputW; ++w) {
+        long outputIndex = h * outputW + w;
+        /* convert stored 1-based (Lua) index back to 0-based */
+        long index = indicesForPlane[outputIndex] - 1;
+        THAssert(index >= 0 && index < inputW * inputH);
+
+        gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+      }
+    }
+  }
+}
+
+/* Public backward entry point for fractional max pooling.  Resizes and
+   zeroes gradInput to match input, then routes each gradOutput value to
+   the max location saved by the forward pass.
+   NOTE(review): unlike updateOutput there is no 3D/4D THArgCheck here --
+   presumably the forward pass already validated; confirm.  `indices` is
+   also read via THTensor_(data) without newContiguous -- assumed
+   contiguous from the forward pass; verify. */
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    int outputW, int outputH,
+    int poolSizeW, int poolSizeH,
+    THTensor *indices) {
+
+  long numBatch = 1;
+  int planeDim = 0;
+  int heightDim = 1;
+  int widthDim = 2;
+
+  long numInputDims = THTensor_(nDimension)(input);
+  if (numInputDims == 4) {
+    numBatch = THTensor_(size)(input, 0);
+    planeDim = 1;
+    heightDim++;
+    widthDim++;
+  }
+
+  /* sizes */
+  long numPlanes = THTensor_(size)(input, planeDim);
+  long inputH = THTensor_(size)(input, heightDim);
+  long inputW = THTensor_(size)(input, widthDim);
+
+  /* gradOutput spatial dims must match the forward output size */
+  THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+             "gradOutput width unexpected");
+  THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+             "gradOutput height unexpected");
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* backprop */
+  if (numInputDims == 3) {
+    THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+      THTensor_(data)(gradInput),
+      THTensor_(data)(gradOutput),
+      THTensor_(data)(indices),
+      numPlanes, inputW, inputH, outputW, outputH);
+  } else {
+    long batch;
+#pragma omp parallel for private(batch)
+    for (batch = 0; batch < numBatch; ++batch) {
+      THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+        THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
+        THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
+        THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+        numPlanes, inputW, inputH, outputW, outputH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
new file mode 100644
index 0000000..20dd126
--- /dev/null
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -0,0 +1,380 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
+#else
+
+/* im2col: unfold an image (channels x height x width) into a column
+   matrix data_col of shape (channels*kernel_h*kernel_w) x
+   (height_col*width_col), so convolution becomes a single GEMM.
+   Supports padding, stride, and dilation; positions that fall in the
+   zero-padding are written as 0. */
+static void THNN_(im2col)(const real* data_im, const int channels,
+      const int height, const int width, const int kernel_h, const int kernel_w,
+      const int pad_h, const int pad_w,
+      const int stride_h, const int stride_w,
+      const int dilation_h, const int dilation_w,
+      real* data_col) {
+  const int height_col = (height + 2 * pad_h -
+                          (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_col = (width + 2 * pad_w -
+                         (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+  const int channels_col = channels * kernel_h * kernel_w;
+  for (int c_col = 0; c_col < channels_col; ++c_col) {
+    /* decompose the column-channel index into (channel, kh, kw) offsets */
+    int w_offset = c_col % kernel_w;
+    int h_offset = (c_col / kernel_w) % kernel_h;
+    int c_im = c_col / kernel_h / kernel_w;
+    for (int h_col = 0; h_col < height_col; ++h_col) {
+      for (int w_col = 0; w_col < width_col; ++w_col) {
+        int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+        int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+        data_col[(c_col * height_col + h_col) * width_col + w_col] =
+          (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+          data_im[(c_im * height + h_im) * width + w_im] : 0;
+      }
+    }
+  }
+}
+
+/* col2im: inverse of im2col.  Scatters a column matrix back into an image
+   buffer.  data_im is zeroed first and values are accumulated (+=), so
+   overlapping receptive fields sum -- which is exactly the gradient/
+   transposed-convolution semantics.  Out-of-bounds (padding) positions
+   are skipped. */
+static void THNN_(col2im)(const real* data_col, const int channels,
+      const int height, const int width, const int kernel_h, const int kernel_w,
+      const int pad_h, const int pad_w,
+      const int stride_h, const int stride_w,
+      const int dilation_h, const int dilation_w,
+      real* data_im) {
+  memset(data_im, 0, sizeof(real) * height * width * channels);
+  const int height_col = (height + 2 * pad_h -
+                          (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_col = (width + 2 * pad_w -
+                         (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+  const int channels_col = channels * kernel_h * kernel_w;
+  for (int c_col = 0; c_col < channels_col; ++c_col) {
+    /* decompose the column-channel index into (channel, kh, kw) offsets */
+    int w_offset = c_col % kernel_w;
+    int h_offset = (c_col / kernel_w) % kernel_h;
+    int c_im = c_col / kernel_h / kernel_w;
+    for (int h_col = 0; h_col < height_col; ++h_col) {
+      for (int w_col = 0; w_col < width_col; ++w_col) {
+        int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+        int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+        if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
+          data_im[(c_im * height + h_im) * width + w_im] +=
+            data_col[(c_col * height_col + h_col) * width_col + w_col];
+      }
+    }
+  }
+}
+
+/* Forward pass of spatial full (transposed) convolution:
+   output = col2im(weight^T * input) + bias, per batch element.
+   Output spatial size: (in - 1)*stride - 2*pad + kernel + adj.
+   3D input is temporarily promoted to a 1-element batch and both input
+   and output are squeezed back at the end.
+   NOTE(review): `bias` is dereferenced unconditionally in the bias GEMM --
+   callers must always pass a bias tensor here; confirm the Lua wrapper
+   guarantees this. */
+void THNN_(SpatialFullConvolution_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *weight,
+    THTensor *bias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int adjW, int adjH)
+{
+  /* weight layout: nInputPlane x nOutputPlane x kH x kW (transposed conv) */
+  int nInputPlane = THTensor_(size)(weight,0);
+  int nOutputPlane = THTensor_(size)(weight,1);
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[1] * weight->size[2] * weight->size[3];
+    long n = columns->size[1];
+    long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+        'n', 't',
+        n, m, k,
+        1,
+        THTensor_(data)(input_n), n,
+        THTensor_(data)(weight), m,
+        0,
+        THTensor_(data)(columns), n
+    );
+
+    // Unpack columns back into input:
+    THNN_(col2im)(
+      THTensor_(data)(columns),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1,
+      THTensor_(data)(output_n)
+    );
+
+    // Do Bias after:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // Rank-1 update: output_n += bias * ones^T (adds bias[c] to every pixel)
+    THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        1,
+        THTensor_(data)(output_n), n_
+    );
+
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+/* Backward-input pass of spatial full (transposed) convolution: the
+   gradient w.r.t. the input of a transposed convolution is an ordinary
+   convolution of gradOutput with the same weights, implemented as
+   im2col(gradOutput) followed by a GEMM with the weight matrix.
+   3D inputs are promoted to a 1-element batch and squeezed back at the
+   end. */
+void THNN_(SpatialFullConvolution_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *gradColumns,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int adjW, int adjH)
+{
+  /* weight layout: nInputPlane x nOutputPlane x kH x kW */
+  int nInputPlane = THTensor_(size)(weight,0);
+  int nOutputPlane = THTensor_(size)(weight,1);
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THTensor_(select)(gradInput_n, gradInput, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(gradOutput_n),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1,
+      THTensor_(data)(gradColumns)
+    );
+
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[0];
+    long n = gradColumns->size[1];
+    long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // gradInput_n = weight * gradColumns (beta = 0 overwrites)
+    THBlas_(gemm)(
+        'n', 'n',
+        n, m, k,
+        1,
+        THTensor_(data)(gradColumns), n,
+        THTensor_(data)(weight), k,
+        0,
+        THTensor_(data)(gradInput_n), n
+    );
+  }
+
+
+  // Free
+  THTensor_(free)(gradInput_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+    THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+
+/* Parameter-gradient pass of spatial full (transposed) convolution.
+   Accumulates (scaled by `scale`):
+     gradWeight += input * im2col(gradOutput)^T   (per batch element)
+     gradBias   += row-sums of gradOutput          (via GEMV with ones)
+   NOTE(review): gradBias is used unconditionally here, whereas the dilated
+   convolution variant earlier in this file guards its GEMV with
+   `if (gradBias)` -- confirm callers always pass gradBias to this one. */
+void THNN_(SpatialFullConvolution_accGradParameters)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradWeight,
+    THTensor *gradBias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int adjW, int adjH,
+    real scale)
+{
+  /* gradWeight layout: nInputPlane x nOutputPlane x kH x kW */
+  int nInputPlane = THTensor_(size)(gradWeight,0);
+  int nOutputPlane = THTensor_(size)(gradWeight,1);
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(gradOutput_n),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long n = columns->size[0];   // nOutputPlane * kh * kw
+    long m = input_n->size[0];   // nInputPlane
+    long k = columns->size[1];   // inputHeight * inputWidth
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // beta = 1: accumulate into gradWeight across batch elements
+    THBlas_(gemm)(
+        't', 'n',
+        n, m, k,
+        scale,
+        THTensor_(data)(columns), k,
+        THTensor_(data)(input_n), k,
+        1,
+        THTensor_(data)(gradWeight), n
+    );
+
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    // gradBias += scale * gradOutput_n . ones (per-plane spatial sum)
+    THBlas_(gemv)(
+        't',
+        k_, m_,
+        scale,
+        THTensor_(data)(gradOutput_n), k_,
+        THTensor_(data)(ones), 1,
+        1,
+        THTensor_(data)(gradBias), 1
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialFullConvolutionMap.c b/lib/THNN/generic/SpatialFullConvolutionMap.c
new file mode 100644
index 0000000..bbb0282
--- /dev/null
+++ b/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -0,0 +1,212 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
+#else
+
+/* Forward pass of full convolution with a sparse connection table.
+   connTable row k = (inputPlane, outputPlane), 1-based; weight[k] is the
+   kH x kW kernel for that connection.  Output spatial size:
+   (in - 1)*stride + kernel (no padding/adj in the Map variant).
+   The OpenMP loop is over output planes; each plane scans the whole table
+   and only applies connections with o == p, so writes stay plane-local. */
+void THNN_(SpatialFullConvolutionMap_updateOutput)(
+  THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH)
+{
+  THArgCheck(
+    weight != NULL && weight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  const int kH = (int)weight->size[1];
+  const int kW = (int)weight->size[2];
+
+  THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
+  THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
+
+  THTensor_(resize3d)(
+    output_, nOutputPlane,
+    (input->size[1] - 1) * dH + kH,
+    (input->size[2] - 1) * dW + kW
+  );
+
+  /* contiguous (output is copied back into output_ via freeCopyTo below) */
+  input = THTensor_(newContiguous)(input);
+  THTensor* output = THTensor_(newContiguous)(output_);
+
+  /* get raw pointers */
+  real *input_data = THTensor_(data)(input);
+  real *output_data = THTensor_(data)(output);
+  real *weight_data = THTensor_(data)(weight);
+  real *bias_data = THTensor_(data)(bias);
+  real *connTable_data = THTensor_(data)(connTable);
+
+  /* and dims */
+  const long input_h = input->size[1];
+  const long input_w = input->size[2];
+  const long output_h = output->size[1];
+  const long output_w = output->size[2];
+  const long weight_h = weight->size[1];
+  const long weight_w = weight->size[2];
+
+  long p;
+#pragma omp parallel for private(p)
+  for (p = 0; p < nOutputPlane; p++)
+  {
+    /* add bias */
+    real *ptr_output = output_data + p*output_w*output_h;
+    long j;
+    int nweight;
+    long k;
+
+    /* initialize the whole plane to its bias value */
+    for (j = 0; j < output_h*output_w; j++)
+      ptr_output[j] = bias_data[p];
+
+    /* convolve all maps */
+    nweight = connTable->size[0];
+    for (k = 0; k < nweight; k++)
+    {
+      /* get offsets for input/output (connTable entries are 1-based) */
+      int o = (int)connTable_data[k*2+1]-1;
+      int i = (int)connTable_data[k*2+0]-1;
+
+      if (o == p)
+      {
+        /* accumulate full 2D convolution of input plane i into plane o */
+        THTensor_(fullConv2Dptr)(
+          output_data + o*output_w*output_h,
+          1.0,
+          input_data + i*input_w*input_h, input_h, input_w,
+          weight_data + k*weight_w*weight_h, weight_h, weight_w,
+          dH, dW
+        );
+      }
+    }
+  }
+
+  /* clean up */
+  THTensor_(free)(input);
+  THTensor_(freeCopyTo)(output, output_);
+}
+
+/* Backward-input pass of the connection-table full convolution: for each
+   input plane p, valid cross-correlate gradOutput[o] with kernel k for
+   every table entry (i == p, o), accumulating into gradInput[p].
+   Parallel over input planes, so gradInput writes stay plane-local. */
+void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH)
+{
+  THArgCheck(
+    weight != NULL && weight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  /* contiguous (gradInput copied back into gradInput_ via freeCopyTo) */
+  THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* Resize/Zero */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* get raw pointers */
+  real *gradInput_data = THTensor_(data)(gradInput);
+  real *gradOutput_data = THTensor_(data)(gradOutput);
+  real *weight_data = THTensor_(data)(weight);
+  real *connTable_data = THTensor_(data)(connTable);
+
+  /* and dims */
+  const long input_h = input->size[1];
+  const long input_w = input->size[2];
+  const long output_h = gradOutput->size[1];
+  const long output_w = gradOutput->size[2];
+  const long kH = weight->size[1];
+  const long kW = weight->size[2];
+
+  long p;
+#pragma omp parallel for private(p)
+  for (p = 0; p < nInputPlane; p++)
+  {
+    long k;
+    /* backward all */
+    int nkernel = connTable->size[0];
+    for (k = 0; k < nkernel; k++)
+    {
+      /* connTable entries are 1-based (Lua convention) */
+      int o = (int)connTable_data[k*2+1]-1;
+      int i = (int)connTable_data[k*2+0]-1;
+      if (i == p)
+      {
+        /* gradient to input */
+        THTensor_(validXCorr2Dptr)(
+          gradInput_data + i*input_w*input_h,
+          1.0,
+          gradOutput_data + o*output_w*output_h,  output_h,  output_w,
+          weight_data + k*kW*kH, kH, kW,
+          dH, dW
+        );
+      }
+    }
+  }
+
+  /* clean up */
+  THTensor_(freeCopyTo)(gradInput, gradInput_);
+  THTensor_(free)(gradOutput);
+}
+
+/* Parameter-gradient pass of the connection-table full convolution.
+   gradBias[k] accumulates the scaled spatial sum of gradOutput plane k;
+   gradWeight[k] accumulates a reversed valid cross-correlation of
+   gradOutput[o] with input[i] for table entry k.  Both OpenMP loops are
+   safe: the first writes one gradBias slot per iteration, the second one
+   gradWeight kernel per iteration (k is unique per entry). */
+void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH, real scale)
+{
+  THArgCheck(
+    gradWeight != NULL && gradWeight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+    "3D gradWeight tensor expected (connTable:size(1) x kH x kW)"
+  );
+
+  /* contiguous */
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* get raw pointers */
+  real *input_data = THTensor_(data)(input);
+  real *gradOutput_data = THTensor_(data)(gradOutput);
+  real *gradWeight_data = THTensor_(data)(gradWeight);
+  real *gradBias_data = THTensor_(data)(gradBias);
+
+  /* and dims */
+  const long input_h  = input->size[1];
+  const long input_w  = input->size[2];
+  const long output_h = gradOutput->size[1];
+  const long output_w = gradOutput->size[2];
+  const long weight_h = gradWeight->size[1];
+  const long weight_w = gradWeight->size[2];
+
+  /* gradients wrt bias */
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nOutputPlane; k++)
+  {
+    real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
+    long l;
+    for (l = 0; l < output_h*output_w; l++)
+      gradBias_data[k] += scale*ptr_gradOutput[l];
+  }
+
+  /* gradients wrt weight */
+  int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+  for (k = 0; k < nkernel; k++)
+  {
+    /* connTable entries are 1-based (Lua convention) */
+    int o = (int)THTensor_(get2d)(connTable,k,1)-1;
+    int i = (int)THTensor_(get2d)(connTable,k,0)-1;
+
+    /* gradient to kernel */
+    THTensor_(validXCorr2DRevptr)(
+      gradWeight_data + k*weight_w*weight_h,
+      scale,
+      gradOutput_data + o*output_w*output_h, output_h, output_w,
+      input_data + i*input_w*input_h, input_h, input_w,
+      dH, dW
+    );
+  }
+
+  /* clean up */
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialMaxPooling.c b/lib/THNN/generic/SpatialMaxPooling.c
new file mode 100644
index 0000000..d28fe85
--- /dev/null
+++ b/lib/THNN/generic/SpatialMaxPooling.c
@@ -0,0 +1,300 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
+#else
+
+/* Forward pass of regular max pooling over one sample: for every slice
+   (plane), slide a kW x kH window with stride dW/dH and padding padW/padH,
+   writing the window max to output_p and its 1-based (Lua) flat in-plane
+   index to ind_p.  Windows are clipped to the image, so padding regions
+   never contribute values.
+   NOTE(review): window bounds are clamped with the single-precision
+   fminf/fmaxf on long operands -- exact only while sizes fit in a float's
+   24-bit mantissa; presumably fine for realistic image sizes, but worth
+   confirming. */
+static void THNN_(SpatialMaxPooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          real *ind_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j;
+    real *ip = input_p   + k*iwidth*iheight;
+    for(i = 0; i < oheight; i++)
+    {
+      for(j = 0; j < owidth; j++)
+      {
+        /* window bounds in input coordinates, clipped to the image */
+        long hstart = i * dH - padH;
+        long wstart = j * dW - padW;
+        long hend = fminf(hstart + kH, iheight);
+        long wend = fminf(wstart + kW, iwidth);
+        hstart = fmaxf(hstart, 0);
+        wstart = fmaxf(wstart, 0);
+
+        /* local pointers */
+        real *op = output_p  + k*owidth*oheight + i*owidth + j;
+        real *indp = ind_p   + k*owidth*oheight + i*owidth + j;
+
+        /* compute local max: */
+        long maxindex = -1;
+        real maxval = -THInf;
+        long tcntr = 0;
+        long x,y;
+        for(y = hstart; y < hend; y++)
+        {
+          for(x = wstart; x < wend; x++)
+          {
+            tcntr = y*iwidth + x;
+            real val = *(ip + tcntr);
+            if (val > maxval)
+            {
+              maxval = val;
+              maxindex = tcntr;
+            }
+          }
+        }
+
+        /* set output to local max */
+        *op = maxval;
+
+        /* store location of max (+1 for the Lua 1-based convention) */
+        *indp = maxindex + 1;
+      }
+    }
+  }
+}
+
+/* Public forward entry point for regular spatial max pooling.  Accepts 3D
+   (C x H x W) or 4D (N x C x H x W) input, computes the output size with
+   floor or ceil rounding per `ceil_mode`, trims any last window that would
+   start entirely in the padding, resizes `output`/`indices`, and runs one
+   frame per sample (batch samples in parallel under OpenMP). */
+void THNN_(SpatialMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          bool ceil_mode)
+{
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
+
+  /* 4D input: spatial dims shift past the leading batch dim */
+  if (input->nDimension == 4) 
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+  THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
+
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  /* output size: floor((in - k + 2*pad) / stride) + 1, or ceil variant */
+  if (ceil_mode)
+  {
+    oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
+    owidth  = (long)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
+  }
+  else
+  {
+    oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
+    owidth  = (long)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
+  }
+
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    if ((oheight - 1)*dH >= iheight + padH)
+      --oheight;
+    if ((owidth  - 1)*dW >= iwidth  + padW)
+      --owidth;
+  }
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+    /* indices will contain the locations for each output point */
+    THTensor_(resize3d)(indices,  nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data,
+                                              indices_data,
+                                              nslices,
+                                              iwidth, iheight,
+                                              owidth, oheight,
+                                              kW, kH, dW, dH,
+                                              padW, padH);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+    /* indices will contain the locations for each output point */
+    THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
+                                                indices_data+p*nslices*owidth*oheight,
+                                                nslices,
+                                                iwidth, iheight,
+                                                owidth, oheight,
+                                                kW, kH, dW, dH,
+                                                padW, padH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+/* Backward pass of max-pooling over one non-batched frame.
+   For each output location, the gradient is routed back to the single
+   input position recorded in ind_p during the forward pass.  Indices are
+   stored 1-based (Lua convention), hence the "- 1" below.  gradInput_p
+   must be pre-zeroed by the caller; "+=" accumulates when several output
+   windows selected the same input element.  Slices are independent, so
+   the outer loop is parallelized over k. */
+static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          real *ind_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight,
+          int dW,
+          int dH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* per-slice base pointers: input-sized plane for gradInput,
+       output-sized planes for gradOutput and the indices */
+    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+    real *ind_p_k = ind_p + k*owidth*oheight;
+
+    /* calculate max points */
+    long i, j;
+    for(i = 0; i < oheight; i++)
+    {
+      for(j = 0; j < owidth; j++)
+      {
+        /* retrieve position of max */
+        long maxp = ind_p_k[i*owidth + j] - 1;
+        /* update gradient */
+        gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j];
+      }
+    }
+  }
+}
+
+/* Backward entry point for SpatialMaxPooling.
+   Resizes gradInput to match input, zeroes it, then scatters gradOutput
+   through the forward-pass indices (one frame for 3D input, a parallel
+   batch loop for 4D).  Output spatial sizes are taken directly from
+   gradOutput, so kW/kH/dW/dH/padW/padH/ceil_mode are accepted for
+   interface symmetry with updateOutput but not used here.
+   NOTE(review): indices is read raw without newContiguous — presumably
+   it is always produced contiguous by updateOutput; verify if indices
+   can ever arrive non-contiguous. */
+void THNN_(SpatialMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          bool ceil_mode)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted by one in
+     batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  int nslices;
+  int iheight;
+  int iwidth;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 3)
+  {
+    THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+                                                 indices_data,
+                                                 nslices,
+                                                 iwidth, iheight,
+                                                 owidth, oheight,
+                                                 dW, dH);
+  }
+  else
+  {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      /* each batch element p owns disjoint planes of size
+         nslices*{i,o}width*{i,o}height */
+      THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+                                                   indices_data+p*nslices*owidth*oheight,
+                                                   nslices,
+                                                   iwidth, iheight,
+                                                   owidth, oheight,
+                                                   dW, dH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialMaxUnpooling.c b/lib/THNN/generic/SpatialMaxUnpooling.c
new file mode 100644
index 0000000..6e7a76e
--- /dev/null
+++ b/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -0,0 +1,223 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
+#else
+
+/* Forward pass of max-unpooling over one non-batched frame: each input
+   value is scattered to the output position recorded (1-based) in the
+   indices tensor produced by the paired max-pooling forward pass.  The
+   output buffer must be pre-zeroed by the caller; positions not selected
+   by any index stay zero.  Slices are independent and parallelized. */
+static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
+                                                      real *ind_p,
+                                                      long nslices,
+                                                      long iwidth, long iheight,
+                                                      long owidth, long oheight)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *output_p_k = output_p + k*owidth*oheight;
+    real *input_p_k = input_p + k*iwidth*iheight;
+    real *ind_p_k = ind_p + k*iwidth*iheight;
+
+    long i, j, maxp;
+    for(i = 0; i < iheight; i++)
+    {
+      for(j = 0; j < iwidth; j++)
+      {
+        maxp = ind_p_k[i*iwidth + j] - 1;  /* retrieve position of max (1-based -> 0-based) */
+        if(maxp<0 || maxp>=owidth*oheight){
+            /* BUGFIX: maxp/owidth/oheight are long, so the format must be
+               %ld — passing long to %d is undefined behaviour on LP64 */
+            THError("invalid max index %ld, owidth= %ld, oheight= %ld",maxp,owidth,oheight);
+        }
+        output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
+      }
+    }
+  }
+}
+
+/* Forward entry point for SpatialMaxUnpooling.
+   Validates that input and indices have identical sizes, resizes output
+   to the requested (owidth, oheight), zeroes it, and scatters input
+   values through the indices — one frame for 3D input, a parallel batch
+   loop for 4D.  Note the per-batch indices offset is input-sized
+   (iwidth*iheight), since indices mirror the input layout. */
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *indices,
+    int owidth, int oheight)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  int nbatch = 1;
+  int nslices;
+  int iheight;
+  int iwidth;
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
+  if (!THTensor_(isSameSizeAs)(input, indices)){
+    THError("Invalid input size w.r.t current indices size");
+  }  
+
+  if (input->nDimension == 4) 
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+
+  /* get contiguous input and indices */
+  input = THTensor_(newContiguous)(input);
+  indices = THTensor_(newContiguous)(indices);
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+    THTensor_(zero)(output);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
+                                              indices_data,
+                                              nslices,
+                                              iwidth, iheight,
+                                              owidth, oheight);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+    THTensor_(zero)(output);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      /* indices are laid out like the input, hence the iwidth*iheight stride */
+      THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
+                                                indices_data+p*nslices*iwidth*iheight,
+                                                nslices,
+                                                iwidth, iheight,
+                                                owidth, oheight);
+    }
+  }
+
+  /* cleanup: release the newContiguous references taken above */
+  THTensor_(free)(input);
+  THTensor_(free)(indices);
+}
+
+/* Backward pass of max-unpooling over one non-batched frame: each
+   gradInput element is a gather of the gradOutput value at the position
+   recorded (1-based) in the indices tensor.  Every gradInput element is
+   written exactly once, so no pre-zeroing or accumulation is needed. */
+static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
+                                                         real *ind_p,
+                                                         long nslices,
+                                                         long iwidth, long iheight,
+                                                         long owidth, long oheight)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+    real *ind_p_k = ind_p + k*iwidth*iheight;
+
+    long i, j, maxp;
+    for(i = 0; i < iheight; i++)
+    {
+      for(j = 0; j < iwidth; j++)
+      {
+        maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max (1-based -> 0-based) */
+        if(maxp<0 || maxp>=owidth*oheight){
+            /* BUGFIX: maxp/owidth/oheight are long, so the format must be
+               %ld — passing long to %d is undefined behaviour on LP64 */
+            THError("invalid max index %ld, owidth= %ld, oheight= %ld",maxp,owidth,oheight);
+        }
+        gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
+      }
+    }
+  }
+}
+
+/* Backward entry point for SpatialMaxUnpooling.
+   Validates sizes (input vs indices, and gradOutput vs the expected
+   owidth x oheight), resizes and zeroes gradInput, then gathers
+   gradients through the indices — one frame for 3D input, a parallel
+   batch loop for 4D.  The per-batch indices offset is input-sized,
+   matching the layout used in updateOutput. */
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *indices,
+    int owidth, int oheight)
+{
+  int dimw = 2;
+  int dimh = 1;
+  int nbatch = 1;
+  int nslices;
+  int iheight;
+  int iwidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  if (!THTensor_(isSameSizeAs)(input, indices)){
+    THError("Invalid input size w.r.t current indices size");
+  }
+
+  /* get contiguous gradOutput and indices */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  indices = THTensor_(newContiguous)(indices);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+
+  if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
+    /* BUGFIX: tensor sizes are long, so the last two specifiers must be
+       %ld — passing long to %d is undefined behaviour on LP64 */
+    THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %ldx%ld", oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
+  }
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 3)
+  {
+    THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+                                                 indices_data,
+                                                 nslices,
+                                                 iwidth, iheight,
+                                                 owidth, oheight);
+  }
+  else
+  {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      /* indices are laid out like the input, hence the iwidth*iheight stride */
+      THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+                                                   indices_data+p*nslices*iwidth*iheight,
+                                                   nslices,
+                                                   iwidth, iheight,
+                                                   owidth, oheight);
+    }
+  }
+
+  /* cleanup: release the newContiguous references taken above */
+  THTensor_(free)(gradOutput);
+  THTensor_(free)(indices);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialReflectionPadding.c b/lib/THNN/generic/SpatialReflectionPadding.c
new file mode 100644
index 0000000..08e0ba0
--- /dev/null
+++ b/lib/THNN/generic/SpatialReflectionPadding.c
@@ -0,0 +1,255 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
+#else
+
+/* Forward pass of reflection padding over one non-batched frame.
+   For each output coordinate (i, j) the source input coordinate
+   (ip_y, ip_x) is computed by mirroring around the image border
+   (excluding the border pixel itself): positions inside the pad bands
+   map to 2*pad - pos, positions in the interior map to themselves.
+   Negative pads crop, handled via the iStart/oStart offsets.
+   NOTE(review): fmax() is the double-precision libm function applied to
+   int arguments — correct for these small values, but an int max would
+   avoid the float round-trip. */
+static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
+  real *input_p, real *output_p,
+  long nslices,
+  long iwidth, long iheight,
+  long owidth, long oheight,
+  int pad_l, int pad_r,
+  int pad_t, int pad_b)
+{
+  /* first valid input / output column and row (non-zero when a pad is
+     negative, i.e. cropping) */
+  int iStartX = fmax(0, -pad_l);
+  int iStartY = fmax(0, -pad_t);
+  int oStartX = fmax(0, pad_l);
+  int oStartY = fmax(0, pad_t);
+
+  long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+
+  for (k = 0; k < nslices; k++)
+  {
+    long i, j;
+    for (i = 0; i < oheight; i++) {
+      for (j = 0; j < owidth; j++) {
+        /* horizontal reflection: left band, interior, right band */
+        if (j < pad_l) {
+          ip_x = pad_l * 2 - j;
+        } else if (j >= pad_l && j < iwidth + pad_l) {
+          ip_x = j;
+        } else {
+          ip_x = (iwidth + pad_l - 1) * 2 - j;
+        }
+        ip_x = ip_x - oStartX + iStartX;
+
+        /* vertical reflection: top band, interior, bottom band */
+        if (i < pad_t) {
+          ip_y = pad_t * 2 - i;
+        } else if (i >= pad_t && i < iheight + pad_t) {
+          ip_y = i;
+        } else {
+          ip_y = (iheight + pad_t - 1) * 2 - i;
+        }
+        ip_y = ip_y - oStartY + iStartY;
+
+        real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+        real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+        *dest_p = *src_p;
+      }
+    }
+  }
+}
+
+/* Forward entry point for SpatialReflectionPadding.
+   Output spatial size is input size plus the four pads (negative pads
+   crop).  Dispatches to the frame kernel directly for 3D input, or over
+   a parallel batch loop for 4D. */
+void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
+                                                  THTensor *input,
+                                                  THTensor *output,
+                                                  int pad_l, int pad_r,
+                                                  int pad_t, int pad_b)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+
+  THArgCheck(input->nDimension == 3 ||
+    input->nDimension == 4 , 2, "input must be 3 or 4-dimensional");
+
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = iheight + pad_t + pad_b;
+  owidth  = iwidth + pad_l + pad_r;
+
+  /* BUGFIX: both output dimensions must be at least 1; the original
+     "||" accepted inputs where one dimension was degenerate */
+  THArgCheck(owidth >= 1 && oheight >= 1 , 2, "input is too small");
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
+                                                    nslices,
+                                                    iwidth, iheight,
+                                                    owidth, oheight,
+                                                    pad_l, pad_r,
+                                                    pad_t, pad_b);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialReflectionPadding_updateOutput_frame)(
+        input_data+p*nslices*iwidth*iheight,
+        output_data+p*nslices*owidth*oheight,
+        nslices,
+        iwidth, iheight,
+        owidth, oheight,
+        pad_l, pad_r,
+        pad_t, pad_b);
+    }
+  }
+
+  /* cleanup: release the newContiguous reference taken above */
+  THTensor_(free)(input);
+}
+
+/* Backward pass of reflection padding over one non-batched frame.
+   Uses the identical index mapping as the forward kernel, but in
+   reverse: each gradOutput element is accumulated ("+=") into the input
+   position it was mirrored from.  ginput_p must be pre-zeroed by the
+   caller; accumulation is required because several output pixels can
+   reflect onto the same input pixel. */
+static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+  real *ginput_p, real *goutput_p,
+  long nslices,
+  long iwidth, long iheight,
+  long owidth, long oheight,
+  int pad_l, int pad_r,
+  int pad_t, int pad_b)
+{
+  /* first valid input / output column and row (handles negative pads) */
+  int iStartX = fmax(0, -pad_l);
+  int iStartY = fmax(0, -pad_t);
+  int oStartX = fmax(0, pad_l);
+  int oStartY = fmax(0, pad_t);
+
+  long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+
+  for (k = 0; k < nslices; k++)
+  {
+    long i, j;
+    for (i = 0; i < oheight; i++) {
+      for (j = 0; j < owidth; j++) {
+        /* horizontal reflection, mirroring the forward mapping */
+        if (j < pad_l) {
+          ip_x = pad_l * 2 - j;
+        } else if (j >= pad_l && j < iwidth + pad_l) {
+          ip_x = j;
+        } else {
+          ip_x = (iwidth + pad_l - 1) * 2 - j;
+        }
+        ip_x = ip_x - oStartX + iStartX;
+
+        /* vertical reflection, mirroring the forward mapping */
+        if (i < pad_t) {
+          ip_y = pad_t * 2 - i;
+        } else if (i >= pad_t && i < iheight + pad_t) {
+          ip_y = i;
+        } else {
+          ip_y = (iheight + pad_t - 1) * 2 - i;
+        }
+        ip_y = ip_y - oStartY + iStartY;
+
+        real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+        real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+        *dest_p += *src_p;
+      }
+    }
+  }
+}
+
+/* Backward entry point for SpatialReflectionPadding.
+   Recomputes the expected padded size from input and the pads, checks
+   gradOutput against it, zeroes gradInput, and accumulates gradients via
+   the frame kernel (per batch element in parallel for 4D input).
+   NOTE(review): unlike updateOutput there is no 3D/4D nDimension
+   THArgCheck here — presumably callers always validate via the forward
+   pass first; confirm before relying on this with raw inputs. */
+void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
+                                                      THTensor *input,
+                                                      THTensor *gradOutput,
+                                                      THTensor *gradInput,
+                                                      int pad_l, int pad_r,
+                                                      int pad_t, int pad_b)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+  long oheight;
+  long owidth;
+
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = iheight + pad_t + pad_b;
+  owidth  = iwidth + pad_l + pad_r;
+
+  /* gradOutput must match the forward pass's padded output size */
+  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+                "gradOutput width unexpected");
+  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+                "gradOutput height unexpected");
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* backprop */
+  if (input->nDimension == 3) {
+    THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+      THTensor_(data)(gradInput),
+      THTensor_(data)(gradOutput),
+      nslices,
+      iwidth, iheight,
+      owidth, oheight,
+      pad_l, pad_r,
+      pad_t, pad_b);
+  } else {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++) {
+      THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+        THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+        THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+        nslices,
+        iwidth, iheight,
+        owidth, oheight,
+        pad_l, pad_r,
+        pad_t, pad_b);
+    }
+  }
+
+  /* cleanup: release the newContiguous reference taken above */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialReplicationPadding.c b/lib/THNN/generic/SpatialReplicationPadding.c
new file mode 100644
index 0000000..cdd6fc5
--- /dev/null
+++ b/lib/THNN/generic/SpatialReplicationPadding.c
@@ -0,0 +1,254 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
+#else
+
+/* Forward pass of replication (edge) padding over one non-batched frame.
+   Output pixels in the pad bands copy the nearest border pixel of the
+   input; interior pixels copy themselves.  Negative pads crop, handled
+   via the iStart/oStart offsets.
+   NOTE(review): fmax() is the double-precision libm function applied to
+   int arguments — correct for these small values, but an int max would
+   avoid the float round-trip. */
+static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
+  real *input_p, real *output_p,
+  long nslices,
+  long iwidth, long iheight,
+  long owidth, long oheight,
+  int pad_l, int pad_r,
+  int pad_t, int pad_b)
+{
+  /* first valid input / output column and row (handles negative pads) */
+  int iStartX = fmax(0, -pad_l);
+  int iStartY = fmax(0, -pad_t);
+  int oStartX = fmax(0, pad_l);
+  int oStartY = fmax(0, pad_t);
+
+  long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+  for (k = 0; k < nslices; k++)
+  {
+    long i, j;
+    for (i = 0; i < oheight; i++) {
+      for (j = 0; j < owidth; j++) {
+        /* horizontal clamp: left edge, interior, right edge */
+        if (j < pad_l) {
+          ip_x = pad_l;
+        } else if (j >= pad_l && j < iwidth + pad_l) {
+          ip_x = j;
+        } else {
+          ip_x = iwidth + pad_l - 1;
+        }
+        ip_x = ip_x - oStartX + iStartX;
+
+        /* vertical clamp: top edge, interior, bottom edge */
+        if (i < pad_t) {
+          ip_y = pad_t;
+        } else if (i >= pad_t && i < iheight + pad_t) {
+          ip_y = i;
+        } else {
+          ip_y = iheight + pad_t - 1;
+        }
+        ip_y = ip_y - oStartY + iStartY;
+
+        real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+        real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+        *dest_p = *src_p;
+      }
+    }
+  }
+}
+
+/* Forward entry point for SpatialReplicationPadding.
+   Output spatial size is input size plus the four pads (negative pads
+   crop).  Dispatches to the frame kernel directly for 3D input, or over
+   a parallel batch loop for 4D. */
+void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
+                                                         THTensor *input,
+                                                         THTensor *output,
+                                                         int pad_l, int pad_r,
+                                                         int pad_t, int pad_b)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4,
+             2, "input must be 3 or 4-dimensional");
+
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = iheight + pad_t + pad_b;
+  owidth  = iwidth + pad_l + pad_r;
+
+  /* BUGFIX: both output dimensions must be at least 1; the original
+     "||" accepted inputs where one dimension was degenerate */
+  THArgCheck(owidth >= 1 && oheight >= 1 , 2, "input is too small");
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
+                                                    nslices,
+                                                    iwidth, iheight,
+                                                    owidth, oheight,
+                                                    pad_l, pad_r,
+                                                    pad_t, pad_b);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialReplicationPadding_updateOutput_frame)(
+        input_data+p*nslices*iwidth*iheight,
+        output_data+p*nslices*owidth*oheight,
+        nslices,
+        iwidth, iheight,
+        owidth, oheight,
+        pad_l, pad_r,
+        pad_t, pad_b);
+    }
+  }
+
+  /* cleanup: release the newContiguous reference taken above */
+  THTensor_(free)(input);
+}
+
+/* Backward pass of replication padding over one non-batched frame.
+   Uses the identical clamped index mapping as the forward kernel, in
+   reverse: each gradOutput element is accumulated ("+=") into the input
+   position it was copied from.  ginput_p must be pre-zeroed by the
+   caller; accumulation is required because all pad-band pixels map onto
+   border pixels of the input. */
+static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+  real *ginput_p, real *goutput_p,
+  long nslices,
+  long iwidth, long iheight,
+  long owidth, long oheight,
+  int pad_l, int pad_r,
+  int pad_t, int pad_b)
+{
+  /* first valid input / output column and row (handles negative pads) */
+  int iStartX = fmax(0, -pad_l);
+  int iStartY = fmax(0, -pad_t);
+  int oStartX = fmax(0, pad_l);
+  int oStartY = fmax(0, pad_t);
+
+  long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+  for (k = 0; k < nslices; k++)
+  {
+    long i, j;
+    for (i = 0; i < oheight; i++) {
+      for (j = 0; j < owidth; j++) {
+        /* horizontal clamp, mirroring the forward mapping */
+        if (j < pad_l) {
+          ip_x = pad_l;
+        } else if (j >= pad_l && j < iwidth + pad_l) {
+          ip_x = j;
+        } else {
+          ip_x = iwidth + pad_l - 1;
+        }
+        ip_x = ip_x - oStartX + iStartX;
+
+        /* vertical clamp, mirroring the forward mapping */
+        if (i < pad_t) {
+          ip_y = pad_t;
+        } else if (i >= pad_t && i < iheight + pad_t) {
+          ip_y = i;
+        } else {
+          ip_y = iheight + pad_t - 1;
+        }
+        ip_y = ip_y - oStartY + iStartY;
+
+        real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+        real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+        *dest_p += *src_p;
+      }
+    }
+  }
+}
+
+/* Backward entry point for SpatialReplicationPadding.
+   Recomputes the expected padded size from input and the pads, checks
+   gradOutput against it, zeroes gradInput, and accumulates gradients via
+   the frame kernel (per batch element in parallel for 4D input).
+   NOTE(review): unlike updateOutput there is no 3D/4D nDimension
+   THArgCheck here — presumably callers always validate via the forward
+   pass first; confirm before relying on this with raw inputs. */
+void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
+                                                      THTensor *input,
+                                                      THTensor *gradOutput,
+                                                      THTensor *gradInput,
+                                                      int pad_l, int pad_r,
+                                                      int pad_t, int pad_b)
+{
+  /* dim layout for 3D input: (slice, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+  long oheight;
+  long owidth;
+
+  if (input->nDimension == 4)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = iheight + pad_t + pad_b;
+  owidth  = iwidth + pad_l + pad_r;
+
+  /* gradOutput must match the forward pass's padded output size */
+  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+                "gradOutput width unexpected");
+  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+                "gradOutput height unexpected");
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* backprop */
+  if (input->nDimension == 3) {
+    THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+      THTensor_(data)(gradInput),
+      THTensor_(data)(gradOutput),
+      nslices,
+      iwidth, iheight,
+      owidth, oheight,
+      pad_l, pad_r,
+      pad_t, pad_b);
+  } else {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++) {
+      THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+        THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+        THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+        nslices,
+        iwidth, iheight,
+        owidth, oheight,
+        pad_l, pad_r,
+        pad_t, pad_b);
+    }
+  }
+
+  /* cleanup: release the newContiguous reference taken above */
+  THTensor_(free)(gradOutput);
+}
+
+
+#endif
diff --git a/lib/THNN/generic/SpatialSubSampling.c b/lib/THNN/generic/SpatialSubSampling.c
new file mode 100644
index 0000000..abfbfce
--- /dev/null
+++ b/lib/THNN/generic/SpatialSubSampling.c
@@ -0,0 +1,267 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
+#else
+
+/* Forward pass of SpatialSubSampling: for each plane k, the output is
+   bias[k] plus weight[k] times the sum of each kW x kH input window,
+   strided by (dW, dH).  One scalar weight and bias per input plane.
+   Handles 3D (plane, h, w) and 4D (batch, plane, h, w) input; the
+   parallel loop is over planes, with the batch loop nested inside. */
+void THNN_(SpatialSubSampling_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *weight,
+    THTensor *bias,
+    int kW, int kH,
+    int dW, int dH)
+{
+  
+  real *weight_data = THTensor_(data)(weight);
+  real *bias_data = THTensor_(data)(bias);
+  real *output_data;
+  real *input_data;
+
+  /* dim layout for 3D input: (plane, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+
+  long inputWidth;
+  long inputHeight;
+  long outputWidth;
+  long outputHeight;
+
+  int nInputPlane = THTensor_(size)(weight,0);
+
+  long k;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* floor-mode output size; no padding supported */
+  inputWidth = input->size[dimw];
+  inputHeight = input->size[dimh];
+  outputWidth = (inputWidth - kW) / dW + 1;
+  outputHeight = (inputHeight - kH) / dH + 1;
+
+  THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
+  THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
+
+  if (input->nDimension == 3)
+    THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+  else
+    THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+  
+  input = THTensor_(newContiguous)(input);
+  input_data = THTensor_(data)(input);
+  output_data = THTensor_(data)(output);
+  
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane; k++)
+  {
+    long p;
+    for(p = 0; p < nbatch; p++)
+    {
+      long xx, yy;
+      /* For all output pixels... */
+      real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+      /* Get the good mask for (k,i) (k out, i in) */
+      real the_weight = weight_data[k];
+      /* Initialize to the bias */
+      real z = bias_data[k];
+      long i;
+      for(i = 0; i < outputWidth*outputHeight; i++)
+        ptr_output[i] = z;
+      
+      for(yy = 0; yy < outputHeight; yy++)
+      {
+        for(xx = 0; xx < outputWidth; xx++)
+        {
+          /* Compute the mean of the input image... */
+          real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+          real sum = 0;
+          long kx, ky;
+
+          for(ky = 0; ky < kH; ky++)
+          {
+            for(kx = 0; kx < kW; kx++)
+              sum += ptr_input[kx];
+            ptr_input += inputWidth; /* next input line */
+          }
+          /* Update output */
+          *ptr_output++ += the_weight*sum;
+        }
+      }
+    }
+  }
+  /* release the newContiguous reference taken above */
+  THTensor_(free)(input);
+}
+
+/* Backward (input gradient) pass of SpatialSubSampling: every element of
+   each kW x kH window receives gradOutput * weight[k] for its plane.
+   gradInput is zeroed plane-by-plane inside the loop before windows are
+   accumulated, so overlapping windows (dW < kW or dH < kH) sum properly.
+   NOTE(review): gradOutput is read raw without newContiguous here —
+   presumably callers always pass it contiguous; verify.
+   Cleanup performed: removed a dead store to input_data (never read) and
+   a duplicated assignment of gradOutput_data. */
+void THNN_(SpatialSubSampling_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    int kW, int kH,
+    int dW, int dH)
+{
+  /* dim layout for 3D input: (plane, height, width); shifted in batch mode */
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+
+  long inputWidth;
+  long inputHeight;
+  long outputWidth;
+  long outputHeight;
+
+  int nInputPlane = THTensor_(size)(weight,0);
+
+  real *weight_data;
+  real *gradOutput_data;
+  real *gradInput_data;
+
+  long k;
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* floor-mode output size, matching updateOutput */
+  inputWidth = input->size[dimw];
+  inputHeight = input->size[dimh];
+  outputWidth = (inputWidth - kW) / dW + 1;
+  outputHeight = (inputHeight - kH) / dH + 1;
+
+  weight_data = THTensor_(data)(weight);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  THTensor_(resizeAs)(gradInput, input);
+  gradInput_data = THTensor_(data)(gradInput);
+
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane; k++)
+  {
+    long p;
+    for(p = 0; p < nbatch; p++)
+    {
+      real the_weight = weight_data[k];
+      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+      long xx, yy;
+
+      /* zero this plane of gradInput before accumulating windows */
+      real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+      long i;
+      for(i=0; i<inputWidth*inputHeight; i++)
+        ptr_gi[i] = 0.0;
+
+      for(yy = 0; yy < outputHeight; yy++)
+      {
+        for(xx = 0; xx < outputWidth; xx++)
+        {
+          /* spread gradOutput(yy,xx) * weight over the kW x kH window */
+          real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+          real z = *ptr_gradOutput++ * the_weight;
+          long kx, ky;
+
+          for(ky = 0; ky < kH; ky++)
+          {
+            for(kx = 0; kx < kW; kx++)
+              ptr_gradInput[kx] += z;
+            ptr_gradInput += inputWidth;
+          }
+        }
+      }
+    }
+  }
+}
+
+/* Parameter-gradient pass of SpatialSubSampling.
+   For each plane k: gradBias[k] += scale * sum(gradOutput plane), and
+   gradWeight[k] += scale * sum(gradOutput(yy,xx) * window-sum of input).
+   The parallel loop is over planes; each plane touches only its own
+   gradWeight/gradBias entry, so there is no write contention.
+   NOTE(review): gradOutput is read raw without newContiguous —
+   presumably callers always pass it contiguous; verify. */
+void THNN_(SpatialSubSampling_accGradParameters)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradWeight,
+    THTensor *gradBias,
+    int kW, int kH,
+    int dW, int dH,
+    real scale)
+{
+  /* dim layout for 3D input: (plane, height, width); shifted in batch mode */
+  long nbatch = 1;
+  long dimw = 2;
+  long dimh = 1;
+
+  long inputWidth;
+  long inputHeight;
+  long outputWidth;
+  long outputHeight;
+
+  int nInputPlane = THTensor_(size)(gradWeight,0);
+
+  real *gradWeight_data;
+  real *gradBias_data;
+  real *gradOutput_data;
+  real *input_data;
+
+  long k;
+
+  if (input->nDimension == 4) {
+    dimw++;
+    dimh++;
+    nbatch = input->size[0];
+  }
+
+  /* floor-mode output size, matching updateOutput */
+  inputWidth = input->size[dimw];
+  inputHeight = input->size[dimh];
+  outputWidth = (inputWidth - kW) / dW + 1;
+  outputHeight = (inputHeight - kH) / dH + 1;
+
+  gradWeight_data = THTensor_(data)(gradWeight);
+  gradBias_data = THTensor_(data)(gradBias);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  input = THTensor_(newContiguous)(input);
+  input_data = THTensor_(data)(input);
+
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane; k++)
+  {
+    long p;
+    for(p = 0; p < nbatch; p++)
+    {
+      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+      real sum;
+      long xx, yy;
+      long i;
+
+      /* bias gradient: plain sum of this gradOutput plane */
+      sum = 0;
+      for(i = 0; i < outputWidth*outputHeight; i++)
+        sum += ptr_gradOutput[i];
+      gradBias_data[k] += scale*sum;
+
+      /* weight gradient: gradOutput-weighted sum of input windows */
+      sum = 0;
+      for(yy = 0; yy < outputHeight; yy++)
+      {
+        for(xx = 0; xx < outputWidth; xx++)
+        {
+          real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+          real z = *ptr_gradOutput++;
+          long kx, ky;
+
+          for(ky = 0; ky < kH; ky++)
+          {
+            for(kx = 0; kx < kW; kx++)
+              sum += z * ptr_input[kx];
+            ptr_input += inputWidth;
+          }
+        }
+      }
+      gradWeight_data[k] += scale*sum;
+    }
+  }
+
+  /* release the newContiguous reference taken above */
+  THTensor_(free)(input);
+}
+
+#endif
diff --git a/lib/THNN/generic/SpatialUpSamplingNearest.c b/lib/THNN/generic/SpatialUpSamplingNearest.c
new file mode 100644
index 0000000..7ef093c
--- /dev/null
+++ b/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -0,0 +1,143 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
+#else
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(  /* nearest-neighbour upsampling over the two innermost dims */
+    THNNState *state,            /* library's state (unused here) */
+    THTensor *input,             /* 3D or 4D tensor; see idim comment below */
+    THTensor *output,            /* [OUT] caller-resized to the upsampled shape */
+    int scale_factor)            /* integer ratio applied to both spatial dims */
+{
+  int dW = scale_factor;
+  int dH = scale_factor;
+  int xDim = input->nDimension-2;  /* second-to-last dim: upsampled */
+  int yDim = input->nDimension-1;  /* last dim: upsampled */
+
+  // dims
+  int idim = input->nDimension;  // Guaranteed to be between 3 and 5
+  int osz0 = output->size[0];
+  int osz1 = output->size[1];
+  int osz2 = output->size[2];
+  int osz3 = 1;                  /* collapses the 4th loop to one pass for 3D input */
+  if (idim > 3) {
+    osz3 = output->size[3];
+  }
+
+  // get strides
+  long *is = input->stride;
+  long *os = output->stride;
+
+  // get raw pointers
+  real *pin = THTensor_(data)(input);
+  real *pout = THTensor_(data)(output);
+
+  // perform the upsampling
+  int i0, i1, i2, i3, isrc, idst;
+  int iout[4];  // Output indices
+  int iin[4];  // Input indices
+
+  for (i0 = 0; i0 < osz0; i0++) {
+    iout[0] = i0;
+    iin[0] = i0;
+    for (i1 = 0; i1 < osz1; i1++) {
+      iout[1] = i1;
+      iin[1] = i1;
+      for (i2 = 0; i2 < osz2; i2++) {
+        iout[2] = i2;
+        iin[2] = i2;
+        for (i3 = 0; i3 < osz3; i3++) {
+          iout[3] = i3;
+          iin[3] = i3;
+
+          // set the indices for the upsampled dimensions
+          iin[xDim] = iout[xDim] / dW;  /* integer division maps scale_factor output cells to one input cell */
+          iin[yDim] = iout[yDim] / dH;
+
+          idst = i0*os[0] + i1*os[1] + i2*os[2];  /* explicit stride arithmetic, so non-contiguous tensors work */
+          isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
+          if (idim > 3) {
+            idst += i3*os[3];
+            isrc += iin[3]*is[3];
+          }
+
+          pout[idst] = pin[isrc];
+        }
+      }
+    }
+  }
+}
+
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(  /* backward pass: each input cell sums the dW*dH output cells it produced */
+    THNNState *state,            /* library's state (unused here) */
+    THTensor *input,             /* forward-pass input (only its shape matters via gradInput) */
+    THTensor *gradOutput,        /* gradient w.r.t. the upsampled output */
+    THTensor *gradInput,         /* [OUT] gradient w.r.t. input; zeroed then accumulated into */
+    int scale_factor)            /* integer ratio applied to both spatial dims */
+{
+  int dW = scale_factor;
+  int dH = scale_factor;
+  int xDim = gradInput->nDimension-2;  /* second-to-last dim: upsampled */
+  int yDim = gradInput->nDimension-1;  /* last dim: upsampled */
+
+  // dims
+  int idim = gradInput->nDimension;  // Guaranteed to be between 3 and 5
+  int isz0 = gradInput->size[0];
+  int isz1 = gradInput->size[1];
+  int isz2 = gradInput->size[2];
+  int isz3 = 1;                  /* collapses the 4th loop to one pass for 3D input */
+  if (idim > 3) {
+    isz3 = gradInput->size[3];
+  }
+
+  // get strides
+  long *is = gradInput->stride;
+  long *os = gradOutput->stride;
+
+  // get raw pointers
+  real *pin = THTensor_(data)(gradInput);
+  real *pout = THTensor_(data)(gradOutput);
+
+  // perform the upsampling
+  int i0, i1, i2, i3, isrc, idst, x, y;
+  int iin[4];  // Input indices
+  int iout[4];  // Output indices
+
+  THTensor_(zero)(gradInput);  /* must clear before the += accumulation below */
+
+  for (i0 = 0; i0 < isz0; i0++) {
+    iin[0] = i0;
+    iout[0] = i0;
+    for (i1 = 0; i1 < isz1; i1++) {
+      iin[1] = i1;
+      iout[1] = i1;
+      for (i2 = 0; i2 < isz2; i2++) {
+        iin[2] = i2;
+        iout[2] = i2;
+        for (i3 = 0; i3 < isz3; i3++) {
+          iin[3] = i3;
+          iout[3] = i3;
+
+          idst = i0*is[0] + i1*is[1] + i2*is[2];  /* explicit stride arithmetic, so non-contiguous tensors work */
+          if (idim > 3) {
+            idst += i3*is[3];
+          }
+
+          // Now accumulate the gradients from gradOutput
+          for (y = 0; y < dH; y++) {
+            for (x = 0; x < dW; x++) {
+              iout[xDim] = dW * iin[xDim] + x;  /* visit the scale_factor x scale_factor block mapped from this input cell */
+              iout[yDim] = dH * iin[yDim] + y;
+              isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
+              if (idim > 3) {
+                isrc += iout[3]*os[3];
+              }
+              pin[idst] += pout[isrc];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/Sqrt.c b/lib/THNN/generic/Sqrt.c
new file mode 100644
index 0000000..826ed1d
--- /dev/null
+++ b/lib/THNN/generic/Sqrt.c
@@ -0,0 +1,50 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sqrt.c"
+#else
+
+void THNN_(Sqrt_updateOutput)(  /* element-wise output = sqrt(input) */
+          THNNState *state,     /* library's state (unused here) */
+          THTensor *input,
+          THTensor *output,     /* [OUT] resized to match input */
+          real eps)             /* NOTE(review): accepted but unused in this implementation -- confirm intent against callers */
+{
+  THTensor_(resizeAs)(output, input);
+  THTensor_(sqrt)(output, input);
+}
+
+void THNN_(Sqrt_updateGradInput)(  /* gradInput = gradOutput * d(sqrt(x))/dx = 0.5 * gradOutput / output */
+          THNNState *state,        /* library's state (unused here) */
+          THTensor *input,         /* forward-pass input (only its shape is used) */
+          THTensor *gradOutput,    /* gradient w.r.t. module's output */
+          THTensor *gradInput,     /* [OUT] resized to match input */
+          THTensor *output)        /* forward-pass result, i.e. sqrt(input) */
+{
+  THTensor_(resizeAs)(gradInput, input);
+
+  if (output->nDimension == 1 || 
+      !THTensor_(isContiguous)(output) || 
+      !THTensor_(isContiguous)(gradOutput) ||
+      !THTensor_(isContiguous)(gradInput))
+  {  /* generic (possibly strided) path */
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+      *gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data));  /* gradient defined as 0 at output == 0 to avoid division by zero */
+    );
+  }
+  else
+  {  /* contiguous fast path, parallelized over elements */
+    real *gradOutput_data = THTensor_(data)(gradOutput);
+    real *gradInput_data  = THTensor_(data)(gradInput);
+    real *output_data     = THTensor_(data)(output);
+    long i;
+#pragma omp parallel for private(i)
+    for(i = 0; i < THTensor_(nElement)(output); i++)
+    {
+      if (output_data[i] == 0.0)
+        gradInput_data[i] = 0.0;  /* same zero-guard as the strided path */
+      else
+        gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/Square.c b/lib/THNN/generic/Square.c
new file mode 100644
index 0000000..a26c001
--- /dev/null
+++ b/lib/THNN/generic/Square.c
@@ -0,0 +1,58 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Square.c"
+#else
+
+void THNN_(Square_updateOutput)(  /* element-wise output = input * input */
+          THNNState *state,       /* library's state (unused here) */
+          THTensor *input,
+          THTensor *output)       /* [OUT] resized to match input */
+{
+  THTensor_(resizeAs)(output, input);
+  
+  if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+  {  /* generic (possibly strided) path */
+    TH_TENSOR_APPLY2(real, output, real, input,
+      *output_data = (*input_data) * (*input_data);
+    );
+  }
+  else
+  {  /* contiguous fast path, parallelized over elements */
+    real *output_data = THTensor_(data)(output);
+    real *input_data  = THTensor_(data)(input);
+    long i;
+#pragma omp parallel for private(i)
+    for (i = 0; i < THTensor_(nElement)(input); i++)
+      output_data[i] = input_data[i]*input_data[i];
+  }
+}
+
+void THNN_(Square_updateGradInput)(  /* gradInput = gradOutput * d(x^2)/dx = 2 * gradOutput * input */
+          THNNState *state,          /* library's state (unused here) */
+          THTensor *input,           /* forward-pass input */
+          THTensor *gradOutput,      /* gradient w.r.t. module's output */
+          THTensor *gradInput)       /* [OUT] resized to match input */
+{
+  THTensor_(resizeAs)(gradInput, input);
+
+  if (input->nDimension == 1 || 
+      !THTensor_(isContiguous)(input) || 
+      !THTensor_(isContiguous)(gradOutput) ||
+      !THTensor_(isContiguous)(gradInput))
+  {  /* generic (possibly strided) path */
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+      *gradInput_data  = 2.0 * (*gradOutput_data) * (*input_data);
+    );
+  }
+  else
+  {  /* contiguous fast path, parallelized over elements */
+    real *gradOutput_data = THTensor_(data)(gradOutput);
+    real *gradInput_data  = THTensor_(data)(gradInput);
+    real *input_data  = THTensor_(data)(input);
+    long i;
+#pragma omp parallel for private(i)
+    for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+      gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i];
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
new file mode 100644
index 0000000..1600fb1
--- /dev/null
+++ b/lib/THNN/generic/THNN.h
@@ -0,0 +1,1096 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output);           // [OUT] Abs output
+TH_API void THNN_(Abs_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t. output
+          THTensor *gradInput);        // [OUT] gradient w.r.t. input
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // tensor with target values
+          THTensor *output,            // [OUT] a one-element tensor with loss
+          bool sizeAverage);           // if true, the loss will be divided by batch size
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // tensor with target values
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          bool sizeAverage);           // if true, the gradient will be normalized by batch size
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor (1D/2D)
+          THIndexTensor *target,       // tensor containing indexes of target classes
+          THTensor *output,            // [OUT] a one-element tensor with loss
+          bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
+          THTensor *weights,           // [OPTIONAL] class weights
+          THTensor *total_weight);     // [BUFFER]
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor (1D/2D)
+          THIndexTensor *target,       // tensor containing indexes of target classes
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
+          THTensor *weights,           // [OPTIONAL] class weights
+          THTensor *total_weight);     // [BUFFER]
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor (4D)
+          THIndexTensor *target,       // tensor containing indexes of target classes (3D)
+          THTensor *output,            // [OUT] a one-element tensor with loss
+          bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
+          THTensor *weights,           // [OPTIONAL] class weights
+          THTensor *total_weight);     // [BUFFER]
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor (4D)
+          THIndexTensor *target,       // tensor containing indexes of target classes (3D)
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
+          THTensor *weights,           // [OPTIONAL] class weights
+          THTensor *total_weight);     // [BUFFER]
+
+TH_API void THNN_(ELU_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output,            // [OUT] ELU output
+          real alpha,                  // an ELU parameter (as in paper)
+          bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+TH_API void THNN_(ELU_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t. output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          THTensor *output,            // output from a forward pass
+          real alpha,                  // an ELU parameter (as in paper)
+          bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // target tensor
+          THTensor *output,            // [OUT] a one-element tensor containing the loss
+          bool sizeAverage);           // if true, the loss will be normalized **by total number of elements**
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // target tensor
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          bool sizeAverage);           // if true, the loss will be normalized **by total number of elements**
+
+// HardShrink outputs 0 on the interval (-lambda, lambda), or the original value otherwise.
+TH_API void THNN_(HardShrink_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output,            // [OUT] output tensor
+          real lambda);                // HardShrink parameter
+TH_API void THNN_(HardShrink_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t. module's output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          real lambda);                // HardShrink parameter
+
+// HardTanh clamps the values to the interval [min_val; max_val].
+TH_API void THNN_(HardTanh_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output,            // [OUT] output tensor
+          real min_val,                // lower threshold
+          real max_val);               // upper threshold
+TH_API void THNN_(HardTanh_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t. module's output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. the input
+          real min_val,                // lower threshold
+          real max_val);               // upper threshold
+
+TH_API void THNN_(L1Cost_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output);           // [OUT] output tensor
+TH_API void THNN_(L1Cost_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t module's output
+          THTensor *gradInput);        // [OUT] gradient w.r.t the input
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // [MODIFIED] input tensor
+          THTensor *output,            // [OUT] output tensor
+          real negval,                 // negative part slope
+          bool inplace);               // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // [MODIFIED] gradient w.r.t. module's output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. the input
+          real negval,                 // negative part slope
+          bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output,            // output tensor
+          THTensor *buffer);           // [BUFFER]
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input
+          THTensor *gradOutput,        // gradient w.r.t. module's output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          THTensor *buffer);           // [BUFFER]
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *output);           // [OUT] output tensor
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *gradOutput,        // gradient w.r.t. module's output
+          THTensor *gradInput,         // [OUT] gradient w.r.t. input
+          THTensor *output);           // module's output
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+          THNNState *state,
+          THIndexTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THIntegerTensor *count,
+          THTensor *sorted,
+          THTensor *indices,
+          bool scaleGradByFreq,
+          int paddingValue,
+          real scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+          THNNState *state,            // library's state
+          THIndexTensor *idx,          // vector that contains row indices (modified in function)
+          THTensor *weight,            // 2D tensor whose rows will be renormalized
+          real maxNorm,                // maximum norm
+          real normType);              // the norm type (e.g., normType=2, then it's 2-norm)
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // target tensor (should contain only 1s and -1s)
+          THTensor *output,            // [OUT] a one-element tensor containing the loss
+          bool sizeAverage,            // if true, the loss is normalized by **total number of elements**
+          real margin);                // a margin that is required for the loss to be 0
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+          THNNState *state,            // library's state
+          THTensor *input,             // input tensor
+          THTensor *target,            // target tensor (should contain only 1s and -1s)
+          THTensor *gradInput,         // [OUT] gradient w.r.t. module's input
+          bool sizeAverage,            // if true, the gradient is normalized by **total number of elements**
+          real margin);                // a margin that is required for the loss to be 0
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+TH_API void THNN_(MSECriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          THTensor *isTarget,
+          bool sizeAverage);
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          THTensor *isTarget,
+          bool sizeAverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          int p,
+          THTensor* weights,
+          real margin);
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          int p,
+          THTensor *weights,
+          real margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradWeight,
+          THTensor *gradWeightBuf,
+          THTensor *gradWeightBuf2,
+          THIndex_t nOutputPlane,
+          real scale);
+
+TH_API void THNN_(RReLU_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace,
+          THGenerator *generator);
+TH_API void THNN_(RReLU_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *noise,
+          real lower,
+          real upper,
+          bool train,
+          bool inplace);
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+TH_API void THNN_(Sigmoid_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage);
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage);
+
+TH_API void THNN_(SoftMax_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+TH_API void THNN_(SoftMax_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real beta,
+          real threshold);
+TH_API void THNN_(SoftPlus_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output,
+          real beta,
+          real threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real lambda);
+TH_API void THNN_(SoftShrink_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real lambda);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale);
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput);
+TH_API void THNN_(SparseLinear_updateParameters)(
+          THNNState *state,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput,
+          real learningRate);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias);
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          real weightDecay,
+          real scale);
+TH_API void THNN_(SparseLinear_legacyZeroGradParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput);
+TH_API void THNN_(SparseLinear_legacyUpdateParameters)(
+          THNNState *state,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *lastInput,
+          real learningRate);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real eps);
+TH_API void THNN_(Sqrt_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+
+TH_API void THNN_(Square_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+TH_API void THNN_(Square_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+
+TH_API void THNN_(Tanh_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+TH_API void THNN_(Tanh_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *output);
+
+TH_API void THNN_(Threshold_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real threshold,
+          real val,
+          bool inplace);
+TH_API void THNN_(Threshold_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real threshold,
+          bool inplace);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          int kW, int dW,
+          int inputFrameSize,
+          int outputFrameSize);
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+          THNNState* state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          int kW, int dW);
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          int kW, int dW,
+          real scale);
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW, int dW);
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          int kW, int dW,
+          int inputFrameSize);
+TH_API void THNN_(TemporalSubSampling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          int kW, int dW,
+          real scale);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *running_mean,
+          THTensor *running_var,
+          THTensor *save_mean,
+          THTensor *save_std,
+          bool train,
+          double momentum,
+          double eps);
+TH_API void THNN_(BatchNormalization_backward)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *running_mean,
+          THTensor *running_var,
+          THTensor *save_mean,
+          THTensor *save_std,
+          bool train,
+          double scale,
+          double eps);
+
+TH_API void THNN_(SpatialConvolutionMap_updateOutput)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *output,       // [OUT] convolution output
+          THTensor *weight,       // 3D weight tensor (connTable:size(1) x kH x kW)
+          THTensor *bias,         // 1D bias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH);        // stride
+TH_API void THNN_(SpatialConvolutionMap_updateGradInput)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *gradOutput,   // gradient w.r.t. output
+          THTensor *gradInput,    // [OUT] gradient w.r.t. input
+          THTensor *weight,       // 3D weight tensor (connTable:size(1) x kH x kW)
+          THTensor *bias,         // 1D bias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH);        // stride
+TH_API void THNN_(SpatialConvolutionMap_accGradParameters)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *gradOutput,   // gradient w.r.t. output
+          THTensor *gradWeight,   // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+          THTensor *gradBias,     // 1D gradBias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH,         // stride
+          real scale);            // scaling factor
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          real scale);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          long inputWidth, long inputHeight,
+          long outputWidth, long outputHeight,
+          real scale);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode,
+          bool count_include_pad);
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode,
+          bool count_include_pad);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int outputW, int outputH,
+          int poolSizeW, int poolSizeH,
+          THTensor *indices,
+          THTensor *randomSamples);
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int outputW, int outputH,
+          int poolSizeW, int poolSizeH,
+          THTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int adjW, int adjH,
+          real scale);
+
+TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *output,       // [OUT] convolution output
+          THTensor *weight,       // 3D weight tensor (connTable:size(1) x kH x kW)
+          THTensor *bias,         // 1D bias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH);        // stride
+TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *gradOutput,   // gradient w.r.t. output
+          THTensor *gradInput,    // [OUT] gradient w.r.t. input
+          THTensor *weight,       // 3D weight tensor (connTable:size(1) x kH x kW)
+          THTensor *bias,         // 1D bias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH);        // stride
+TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+          THNNState *state,       // library state
+          THTensor *input,        // input tensor
+          THTensor *gradOutput,   // gradient w.r.t. output
+          THTensor *gradWeight,   // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+          THTensor *gradBias,     // 1D gradBias tensor (nOutputPlane)
+          THTensor *connTable,    // connection table
+          int nInputPlane,        // number of input planes
+          int nOutputPlane,       // number of output planes
+          int dW, int dH,         // stride
+          real scale);            // scaling factor
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    THTensor *weight,
+    THTensor *bias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *gradColumns,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradWeight,
+    THTensor *gradBias,
+    THTensor *columns,
+    THTensor *ones,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH,
+    real scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode);
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int owidth, int oheight);
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int owidth, int oheight);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          int kW, int kH,
+          int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          int kW, int kH,
+          int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          int kW, int kH,
+          int dW, int dH,
+          real scale);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int scale_factor);
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int scale_factor);
+
+TH_API void THNN_(unfolded_acc)(
+          THTensor *finput,
+          THTensor *input,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int nInputPlane,
+          int inputWidth, int inputHeight,
+          int outputWidth, int outputHeight);
+TH_API void THNN_(unfolded_copy)(
+          THTensor *finput,
+          THTensor *input,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int nInputPlane,
+          int inputWidth, int inputHeight,
+          int outputWidth, int outputHeight);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH);
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          real scale);
+
+TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          real scale);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+          THNNState *state,         // library state
+          THTensor *input,          // 4D or 5D (batch) tensor
+          THTensor *output,         // [OUT] volumetric convolution output
+          THTensor *weight,         // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *bias,           // gradBias tensor (nOutputPlane)
+          THTensor *finput,         // [OUT] internal columns buffer
+          THTensor *fgradInput,     // [OUT] internal ones buffer
+          int dT, int dW, int dH,   // stride of the convolution
+          int pT, int pW, int pH,   // padding
+          int aT, int aW, int aH);  // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+          THNNState *state,         // library state
+          THTensor *input,          // 4D or 5D (batch) tensor
+          THTensor *gradOutput,     // gradient w.r.t. output
+          THTensor *gradInput,      // [OUT] gradient w.r.t. input
+          THTensor *weight,         // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *finput,         // internal columns buffer
+          THTensor *fgradInput,     // internal ones buffer
+          int dT, int dW, int dH,   // stride
+          int pT, int pW, int pH,   // padding
+          int aT, int aW, int aH);  // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+          THNNState *state,         // library state
+          THTensor *input,          // 4D or 5D (batch) tensor
+          THTensor *gradOutput,     // gradient w.r.t. output
+          THTensor *gradWeight,     // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *gradBias,       // gradBias tensor (nOutputPlane)
+          THTensor *finput,         // internal columns buffer
+          THTensor *fgradInput,     // internal ones buffer
+          int dT, int dW, int dH,   // stride
+          int pT, int pW, int pH,   // padding
+          int aT, int aW, int aH,   // extra output adjustment
+          real scale);              // scaling factor
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          bool ceilMode);
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int oT, int oW, int oH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int oT, int oW, int oH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
+                                                         THTensor *input,
+                                                         THTensor *output,
+                                                         int pad_l, int pad_r,
+                                                         int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
+                                                            THTensor *input,
+                                                            THTensor *gradOutput,
+                                                            THTensor *gradInput,
+                                                            int pad_l, int pad_r,
+                                                            int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
+                                                         THTensor *input,
+                                                         THTensor *output,
+                                                         int pad_l, int pad_r,
+                                                         int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
+                                                            THTensor *input,
+                                                            THTensor *gradOutput,
+                                                            THTensor *gradInput,
+                                                            int pad_l, int pad_r,
+                                                            int pad_t, int pad_b);
+
+#endif
diff --git a/lib/THNN/generic/Tanh.c b/lib/THNN/generic/Tanh.c
new file mode 100644
index 0000000..d6da1e4
--- /dev/null
+++ b/lib/THNN/generic/Tanh.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tanh.c"
+#else
+
+void THNN_(Tanh_updateOutput)(          /* forward pass: output = tanh(input), element-wise */
+          THNNState *state,             /* library state (unused in this function) */
+          THTensor *input,              /* input tensor, any shape */
+          THTensor *output)             /* [OUT] resized to input's geometry */
+{
+  THTensor_(resizeAs)(output, input);
+  THTensor_(tanh)(output, input);       /* element-wise hyperbolic tangent */
+}
+
+void THNN_(Tanh_updateGradInput)(       /* backward pass: gradInput = gradOutput * (1 - output^2) */
+          THNNState *state,             /* library state (unused in this function) */
+          THTensor *input,              /* forward input (unused: 'output' suffices for tanh's gradient) */
+          THTensor *gradOutput,         /* gradient w.r.t. the module output */
+          THTensor *gradInput,          /* [OUT] gradient w.r.t. the module input */
+          THTensor *output)             /* saved forward result, i.e. tanh(input) */
+{
+  THTensor_(resizeAs)(gradInput, output);
+
+  if (output->nDimension == 1 || 
+      !THTensor_(isContiguous)(output) || 
+      !THTensor_(isContiguous)(gradOutput) ||
+      !THTensor_(isContiguous)(gradInput))  /* generic strided path */
+  {
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+      real z = *output_data;            \
+      *gradInput_data = *gradOutput_data * (1. - z*z);
+    );
+  }
+  else                                  /* contiguous fast path, parallelized with OpenMP */
+  {
+    real* ptr_gradOutput = THTensor_(data)(gradOutput);
+    real* ptr_gradInput  = THTensor_(data)(gradInput);
+    real* ptr_output     = THTensor_(data)(output);
+    long i;
+
+#pragma omp parallel for private(i)
+    for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+    {
+      real z = ptr_output[i];
+      ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);   /* tanh'(x) = 1 - tanh(x)^2 */
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/TemporalConvolution.c b/lib/THNN/generic/TemporalConvolution.c
new file mode 100644
index 0000000..a29a353
--- /dev/null
+++ b/lib/THNN/generic/TemporalConvolution.c
@@ -0,0 +1,349 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalConvolution.c"
+#else
+
+void THNN_(TemporalConvolution_updateOutput)(  /* forward: 1D convolution over a frames-x-features sequence */
+          THNNState *state,       /* library state (unused in this function) */
+          THTensor *input,        /* 2D (nInputFrame x inputFrameSize) or 3D batch tensor */
+          THTensor *output,       /* [OUT] nOutputFrame x outputFrameSize (plus leading batch dim) */
+          THTensor *weight,       /* outputFrameSize x (kW*inputFrameSize); transposed around each addmm */
+          THTensor *bias,         /* copied into every output frame before accumulation */
+          int kW,                 /* kernel width, in frames */
+          int dW,                 /* frame stride */
+          int inputFrameSize,
+          int outputFrameSize)
+{
+  THTensor *outputWindow, *inputWindow;
+  int nInputFrame, nOutputFrame;
+  long k, i;
+  
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+  
+  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+  
+  if (input->nDimension == 3) 
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  THArgCheck(input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
+  THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
+
+  input = THTensor_(newContiguous)(input);  /* may allocate a copy; freed at the end */
+  outputWindow = THTensor_(new)();
+  inputWindow = THTensor_(new)();
+
+  nInputFrame = input->size[dimS];
+  nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+  if (input->nDimension == 2)
+  {
+    THTensor_(resize2d)(output,
+                        nOutputFrame,
+                        outputFrameSize);
+
+    /* bias first */
+    for(k = 0; k < nOutputFrame; k++)
+    {
+      THTensor_(select)(outputWindow, output, 0, k);
+      THTensor_(copy)(outputWindow, bias);
+    }
+
+    /* group the non-overlapping windows at each offset k into one strided 2D view, so each group is a single matrix product */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;  /* windows still computable at this offset */
+      nOutputFrame -= nFrame;              /* loop ends when every output frame has been produced */
+
+      THTensor_(setStorage2d)(inputWindow, input->storage,
+                              input->storageOffset+k*dW*input->size[1],
+                              nFrame, inputFrameStride*input->size[1],
+                              kW*input->size[1], 1);
+
+      THTensor_(setStorage2d)(outputWindow, output->storage, 
+                              output->storageOffset + k*output->size[1],
+                              nFrame, outputFrameStride*output->size[1],
+                              output->size[1], 1);
+
+      THTensor_(transpose)(weight, NULL, 0, 1);
+      THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);  /* out += in * W^T */
+      THTensor_(transpose)(weight, NULL, 0, 1);  /* restore caller's weight layout */
+    }
+  }
+  else
+  {
+    /* batch mode: apply the same scheme independently to each sample */
+    THTensor *outputSample = THTensor_(new)();
+    THTensor *inputSample = THTensor_(new)();
+    int nBatchFrame = input->size[0];
+    
+    THTensor_(resize3d)(output,
+                        nBatchFrame,
+                        nOutputFrame,
+                        outputFrameSize);
+    
+    for(i = 0; i < nBatchFrame; i++)
+    {
+      THTensor_(select)(outputSample, output, 0, i);
+      THTensor_(select)(inputSample, input, 0, i);
+      long nOutputSampleFrame = nOutputFrame;
+      
+      /* bias first */
+      for(k = 0; k < nOutputFrame; k++)
+      {
+        THTensor_(select)(outputWindow, outputSample, 0, k);
+        THTensor_(copy)(outputWindow, bias);
+      }
+
+      /* grouped-window matrix products, as in the 2D branch above */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+                                inputSample->storageOffset+k*dW*inputSample->size[1],
+                                nFrame, inputFrameStride*inputSample->size[1],
+                                kW*inputSample->size[1], 1);
+
+        THTensor_(setStorage2d)(outputWindow, outputSample->storage, 
+                                outputSample->storageOffset + k*outputSample->size[1],
+                                nFrame, outputFrameStride*outputSample->size[1],
+                                outputSample->size[1], 1);
+
+        THTensor_(transpose)(weight, NULL, 0, 1);
+        THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
+        THTensor_(transpose)(weight, NULL, 0, 1);
+      }
+    }
+    THTensor_(free)(outputSample);
+    THTensor_(free)(inputSample);
+  }
+
+  THTensor_(free)(outputWindow);
+  THTensor_(free)(inputWindow);
+  THTensor_(free)(input);  /* balances newContiguous above */
+
+}
+
+void THNN_(TemporalConvolution_updateGradInput)(  /* backward w.r.t. input: gradInput += gradOutput * weight */
+          THNNState *state,       /* library state (unused in this function) */
+          THTensor *input,        /* forward input; used only for its sizes */
+          THTensor *gradOutput,   /* gradient w.r.t. the module output (2D or 3D batch) */
+          THTensor *gradInput,    /* [OUT] zeroed, then resized to input's geometry and accumulated into */
+          THTensor *weight,       /* same weight used in the forward pass */
+          int kW,                 /* kernel width, in frames */
+          int dW)                 /* frame stride */
+{
+  long nInputFrame;
+  long nOutputFrame;
+
+  THTensor *gradOutputWindow;
+  THTensor *gradInputWindow;
+  long k, i;
+  
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+  
+  if (gradOutput->nDimension == 3) 
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  
+  nInputFrame = input->size[dimS];
+  nOutputFrame = gradOutput->size[dimS];
+
+  gradOutputWindow = THTensor_(new)();
+  gradInputWindow = THTensor_(new)();
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);        /* overlapping windows accumulate, so start from zero */
+
+  if (gradOutput->nDimension == 2)
+  {
+    /* scatter gradient back through the grouped-window views used in the forward pass */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+      nOutputFrame -= nFrame;
+
+      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+                              gradOutput->storageOffset + k*gradOutput->size[1],
+                              nFrame, outputFrameStride*gradOutput->size[1],
+                              gradOutput->size[1], 1);
+
+      THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
+                              gradInput->storageOffset+k*dW*gradInput->size[1],
+                              nFrame, inputFrameStride*gradInput->size[1],
+                              kW*gradInput->size[1], 1);
+
+      THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);  /* gradIn += gradOut * W */
+    }
+  }
+  else
+  {
+    /* batch mode: same scheme, one sample at a time */
+    THTensor *gradOutputSample = THTensor_(new)();
+    THTensor *gradInputSample = THTensor_(new)();
+    int nBatchFrame = input->size[0];
+    
+    for(i = 0; i < nBatchFrame; i++)
+    {
+      THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+      THTensor_(select)(gradInputSample, gradInput, 0, i);
+      int nOutputSampleFrame = nOutputFrame;
+      
+      /* scatter gradient per window group, as in the 2D branch above */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+                                nFrame, outputFrameStride*gradOutputSample->size[1],
+                                gradOutputSample->size[1], 1);
+
+        THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
+                                gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+                                nFrame, inputFrameStride*gradInputSample->size[1],
+                                kW*gradInputSample->size[1], 1);
+
+        THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+      }
+    }
+    THTensor_(free)(gradOutputSample);
+    THTensor_(free)(gradInputSample);
+  }
+
+  THTensor_(free)(gradOutputWindow);
+  THTensor_(free)(gradInputWindow);
+
+}
+
+void THNN_(TemporalConvolution_accGradParameters)(  /* accumulate gradWeight/gradBias, scaled by 'scale' */
+          THNNState *state,       /* library state (unused in this function) */
+          THTensor *input,        /* forward input (2D or 3D batch) */
+          THTensor *gradOutput,   /* gradient w.r.t. the module output */
+          THTensor *gradWeight,   /* [IN/OUT] accumulated: gradWeight += scale * gradOut^T * in */
+          THTensor *gradBias,     /* [IN/OUT] accumulated: gradBias += scale * sum of gradOutput frames */
+          int kW,                 /* kernel width, in frames */
+          int dW,                 /* frame stride */
+          real scale)             /* scaling factor applied to every accumulation */
+{
+  long nInputFrame;
+  long nOutputFrame;
+
+  THTensor *gradOutputWindow;
+  THTensor *inputWindow;
+  long k, i;
+  
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+  
+  if (gradOutput->nDimension == 3) 
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  
+  nInputFrame = input->size[dimS];
+  nOutputFrame = gradOutput->size[dimS];
+
+  input = THTensor_(newContiguous)(input);  /* may allocate a copy; freed at the end */
+  gradOutputWindow = THTensor_(new)();
+  inputWindow = THTensor_(new)();
+  
+  if (input->nDimension == 2)
+  {
+    /* bias first */
+    for(k = 0; k < nOutputFrame; k++)
+    {
+      THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
+      THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+    }
+
+    /* accumulate weight gradient per grouped-window view (mirrors the forward grouping) */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+      nOutputFrame -= nFrame;
+
+      THTensor_(setStorage2d)(inputWindow, input->storage,
+                              input->storageOffset+k*dW*input->size[1],
+                              nFrame, inputFrameStride*input->size[1],
+                              kW*input->size[1], 1);
+
+      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, 
+                              gradOutput->storageOffset + k*gradOutput->size[1],
+                              nFrame, outputFrameStride*gradOutput->size[1],
+                              gradOutput->size[1], 1);
+
+      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
+      THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);  /* gradW += scale * gradOut^T * in */
+      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);  /* restore the view's layout */
+    }
+  }
+  else
+  {
+    /* batch mode: accumulate contributions from every sample into the same gradWeight/gradBias */
+    THTensor *gradOutputSample = THTensor_(new)();
+    THTensor *inputSample = THTensor_(new)();
+    int nBatchFrame = input->size[0];
+    
+    for(i = 0; i < nBatchFrame; i++)
+    {
+      THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+      THTensor_(select)(inputSample, input, 0, i);
+      int nOutputSampleFrame = nOutputFrame;
+      
+      /* bias first */
+      for(k = 0; k < nOutputFrame; k++)
+      {
+        THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
+        THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+      }
+
+      /* weight gradient per window group, as in the 2D branch above */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+                                inputSample->storageOffset+k*dW*inputSample->size[1],
+                                nFrame, inputFrameStride*inputSample->size[1],
+                                kW*inputSample->size[1], 1);
+
+        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, 
+                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+                                nFrame, outputFrameStride*gradOutputSample->size[1],
+                                gradOutputSample->size[1], 1);
+
+        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
+        THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
+        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
+      }
+    }
+    THTensor_(free)(gradOutputSample);
+    THTensor_(free)(inputSample);
+  }
+
+  THTensor_(free)(gradOutputWindow);
+  THTensor_(free)(inputWindow);
+  THTensor_(free)(input);  /* balances newContiguous above */
+
+}
+
+#endif
diff --git a/lib/THNN/generic/TemporalMaxPooling.c b/lib/THNN/generic/TemporalMaxPooling.c
new file mode 100644
index 0000000..48cbcab
--- /dev/null
+++ b/lib/THNN/generic/TemporalMaxPooling.c
@@ -0,0 +1,235 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
+#else
+
+/* TemporalMaxPooling forward pass (CPU).
+ *
+ * input:   2D (nInputFrame x frameSize) or 3D batch-mode tensor
+ *          (nBatchFrame x nInputFrame x frameSize).
+ * output:  per-feature max over each temporal window of kW frames,
+ *          stepped by dW frames.
+ * indices: window-relative offset (0..kW-1) of the winning frame for
+ *          every output element, stored as real for use in backprop.
+ */
+void THNN_(TemporalMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW,
+          int dW)
+{
+  long niframe;
+  long framesize;
+  long noframe;
+
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+  long t, y;
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
+
+  /* sizes: floor-mode count of complete kW-wide windows at stride dW */
+  niframe = input->size[dimS];
+  framesize = input->size[dimF];
+  noframe = (niframe - kW) / dW + 1;
+
+  /* get contiguous input (raw pointer arithmetic below assumes it) */
+  input = THTensor_(newContiguous)(input);
+
+  if (input->nDimension == 2)
+  {
+    /* resize output */
+    THTensor_(resize2d)(output, noframe, framesize);
+
+    /* indices will contain index locations for each output point */
+    THTensor_(resize2d)(indices, noframe, framesize);
+
+    /* get raw pointers */
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    for(t = 0; t < noframe; t++)
+    {
+      real *ip = input_data + t*framesize*dW;
+      real *op = output_data + t*framesize;
+      real *xp = indices_data + t*framesize;
+      /* features are independent of each other, so parallelize across them */
+#pragma omp parallel for private(y)
+      for(y = 0; y < framesize; y++)
+      {
+        /* compute local max: */
+        long maxindex = -1;
+        real maxval = -THInf;
+        long x;
+        for(x = 0; x < kW; x++)
+        {
+          real val = ip[x*framesize+y];
+          if (val > maxval)
+          {
+            maxval = val;
+            maxindex = x;
+          }
+        }
+
+        /* set output to local max */
+        op[y] = maxval;
+        xp[y] = (real)maxindex;
+      }
+    }
+  }
+  else
+  {
+    /* number of batch frames */
+    long nbframe = input->size[0];
+    long i;
+
+    /* resize output */
+    THTensor_(resize3d)(output, nbframe, noframe, framesize);
+
+    /* indices will contain index locations for each output point */
+    THTensor_(resize3d)(indices, nbframe, noframe, framesize);
+
+    /* get raw pointers */
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    /* same computation as the 2D branch, repeated per batch sample */
+    for(i = 0; i < nbframe; i++)
+    {
+      real *inputSample_data = input_data + i*niframe*framesize;
+      real *outputSample_data = output_data + i*noframe*framesize;
+      real *indicesSample_data = indices_data + i*noframe*framesize;
+
+      for(t = 0; t < noframe; t++)
+      {
+        real *ip = inputSample_data + t*framesize*dW;
+        real *op = outputSample_data + t*framesize;
+        real *xp = indicesSample_data + t*framesize;
+
+#pragma omp parallel for private(y)
+        for(y = 0; y < framesize; y++)
+        {
+          /* compute local max: */
+          long maxindex = -1;
+          real maxval = -THInf;
+          long x;
+          for(x = 0; x < kW; x++)
+          {
+            real val = ip[x*framesize+y];
+            if (val > maxval)
+            {
+              maxval = val;
+              maxindex = x;
+            }
+          }
+
+          /* set output to local max */
+          op[y] = maxval;
+          xp[y] = (real)maxindex;
+        }
+      }
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+
+}
+
+/* TemporalMaxPooling backward pass (CPU).
+ *
+ * Routes each gradOutput element back to the single input frame that won
+ * the forward max, using the window-relative offsets stored in indices.
+ * gradInput is zeroed first; overlapping windows (dW < kW) accumulate
+ * through the += below.
+ *
+ * NOTE(review): indices is read through a raw data pointer without a
+ * newContiguous call -- presumably callers always pass the contiguous
+ * tensor produced by updateOutput; confirm before reusing elsewhere.
+ */
+void THNN_(TemporalMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW,
+          int dW)
+{
+  long niframe;
+  int noframe;
+  long framesize;
+
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  long t, y;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize and zero */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  /* sizes */
+  niframe = input->size[dimS];
+  noframe = gradOutput->size[dimS];
+  framesize = gradOutput->size[dimF];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  if (input->nDimension == 2)
+  {
+    for(t = 0; t < noframe; t++)
+    {
+      real *gip = gradInput_data + t*framesize*dW;
+      real *gop = gradOutput_data + t*framesize;
+      real *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+      for(y = 0; y < framesize; y++)
+      {
+        /* route the gradient to the recorded argmax position */
+        long maxindex = (long)xp[y];
+        gip[maxindex*framesize+y] += gop[y];
+      }
+    }
+  }
+  else
+  {
+    /* number of batch frames */
+    long nbframe = input->size[0];
+    long i;
+
+    /* same scatter as the 2D branch, repeated per batch sample */
+    for(i = 0; i < nbframe; i++)
+    {
+      real *gradInputSample_data = gradInput_data + i*niframe*framesize;
+      real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
+      real *indicesSample_data = indices_data + i*noframe*framesize;
+
+      for(t = 0; t < noframe; t++)
+      {
+        real *gip = gradInputSample_data + t*framesize*dW;
+        real *gop = gradOutputSample_data + t*framesize;
+        real *xp = indicesSample_data + t*framesize;
+#pragma omp parallel for private(y)
+        for(y = 0; y < framesize; y++)
+        {
+          /* route the gradient to the recorded argmax position */
+          long maxindex = (long)xp[y];
+          gip[maxindex*framesize+y] += gop[y];
+        }
+      }
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/TemporalSubSampling.c b/lib/THNN/generic/TemporalSubSampling.c
new file mode 100644
index 0000000..7fa323d
--- /dev/null
+++ b/lib/THNN/generic/TemporalSubSampling.c
@@ -0,0 +1,116 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
+#else
+
+/* Temporal sub-sampling forward pass (CPU).
+ *
+ * For every output frame t:
+ *   output[t] = weight .* sum(input[t*dW .. t*dW+kW-1]) + bias
+ * where weight and bias each hold inputFrameSize elements.
+ */
+void THNN_(TemporalSubSampling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          int kW,
+          int dW,
+          int inputFrameSize)
+{
+  THTensor *window, *frame;
+  int nInputFrame, nOutputFrame;
+  long t;
+
+  THArgCheck( input->nDimension == 2, 2, "2D tensor expected");
+  THArgCheck( input->size[1] == inputFrameSize, 2, "invalid input frame size");
+  THArgCheck( input->size[0] >= kW, 2, "input sequence smaller than kernel size");
+
+  window = THTensor_(new)();
+  frame = THTensor_(new)();
+
+  nInputFrame = input->size[0];
+  nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+  THTensor_(resize2d)(output, nOutputFrame, inputFrameSize);
+
+  for(t = 0; t < nOutputFrame; t++)
+  {
+    /* view onto the kW input frames covered by output frame t */
+    THTensor_(narrow)(window, input, 0, t*dW, kW);
+    THTensor_(select)(frame, output, 0, t);
+    /* frame = (column sum of window) .* weight + bias */
+    THTensor_(sum)(frame, window, 0);
+    THTensor_(cmul)(frame, frame, weight);
+    THTensor_(cadd)(frame, frame, 1, bias);
+  }
+
+  THTensor_(free)(frame);
+  THTensor_(free)(window);
+}
+
+/* Temporal sub-sampling backward pass wrt input (CPU).
+ *
+ * Each output frame t saw weight .* sum(input[t*dW .. t*dW+kW-1]), so its
+ * gradient, scaled by weight, is broadcast back (outer product with a
+ * kW-long vector of ones) onto the kW input frames it covered.
+ * Overlapping windows accumulate into gradInput.
+ */
+void THNN_(TemporalSubSampling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          int kW,
+          int dW)
+{
+  long t;
+  THTensor *goFrame = THTensor_(new)();
+  THTensor *giWindow = THTensor_(new)();
+  THTensor *tmp = THTensor_(new)();
+  THTensor *ones = THTensor_(newWithSize1d)(kW);
+
+  THTensor_(fill)(ones, 1);
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  for(t = 0; t < gradOutput->size[0]; t++)
+  {
+    THTensor_(narrow)(giWindow, gradInput, 0, t*dW, kW);
+    THTensor_(select)(goFrame, gradOutput, 0, t);
+    /* tmp = weight .* dL/dOutput[t]; giWindow += ones (outer) tmp */
+    THTensor_(cmul)(tmp, weight, goFrame);
+    THTensor_(addr)(giWindow, 1, giWindow, 1, ones, tmp);
+  }
+
+  THTensor_(free)(ones);
+  THTensor_(free)(tmp);
+  THTensor_(free)(giWindow);
+  THTensor_(free)(goFrame);
+}
+
+/* Temporal sub-sampling backward pass wrt parameters (CPU).
+ *
+ * For every output frame t, accumulates scaled gradients:
+ *   gradWeight += scale * sum(input window t) .* dL/dOutput[t]
+ *   gradBias   += scale * dL/dOutput[t]
+ */
+void THNN_(TemporalSubSampling_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          int kW,
+          int dW,
+          real scale)
+{
+  long t;
+  THTensor *goFrame = THTensor_(new)();
+  THTensor *window = THTensor_(new)();
+  THTensor *windowSum = THTensor_(new)();
+
+  for(t = 0; t < gradOutput->size[0]; t++)
+  {
+    /* view onto the kW input frames covered by output frame t */
+    THTensor_(narrow)(window, input, 0, t*dW, kW);
+    THTensor_(select)(goFrame, gradOutput, 0, t);
+    THTensor_(sum)(windowSum, window, 0);
+    THTensor_(addcmul)(gradWeight, gradWeight, scale, windowSum, goFrame);
+    THTensor_(cadd)(gradBias, gradBias, scale, goFrame);
+  }
+
+  THTensor_(free)(windowSum);
+  THTensor_(free)(window);
+  THTensor_(free)(goFrame);
+}
+
+#endif
diff --git a/lib/THNN/generic/Threshold.c b/lib/THNN/generic/Threshold.c
new file mode 100644
index 0000000..ac00360
--- /dev/null
+++ b/lib/THNN/generic/Threshold.c
@@ -0,0 +1,58 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Threshold.c"
+#else
+
+/* Threshold forward pass: y = x when x > threshold, otherwise y = val.
+ * In-place mode overwrites input and aliases output onto it. */
+void THNN_(Threshold_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          real threshold,
+          real val,
+          bool inplace)
+{
+  if (inplace)
+  {
+    /* clamp sub-threshold entries to val directly in the input buffer */
+    TH_TENSOR_APPLY(real, input,
+      *input_data = (*input_data > threshold) ? *input_data : val;
+    );
+    THTensor_(set)(output, input);
+  }
+  else
+  {
+    THTensor_(resizeAs)(output, input);
+    TH_TENSOR_APPLY2(real, output, real, input,
+      if (*input_data > threshold)
+        *output_data = *input_data;
+      else
+        *output_data = val;
+    );
+  }
+}
+
+/* Threshold backward pass: dL/dx = dL/dy where x > threshold, 0 elsewhere.
+ * In-place mode masks gradOutput directly and aliases gradInput onto it. */
+void THNN_(Threshold_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          real threshold,
+          bool inplace)
+{
+  if (inplace)
+  {
+    /* zero the gradient wherever the forward pass replaced the input */
+    TH_TENSOR_APPLY2(real, gradOutput, real, input,
+      if ((*input_data) <= threshold)
+        *gradOutput_data = 0;
+    );
+    THTensor_(set)(gradInput, gradOutput);
+  }
+  else
+  {
+    THTensor_(resizeAs)(gradInput, input);
+    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+      *gradInput_data = ((*input_data) > threshold) ? *gradOutput_data : 0;
+    );
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricAveragePooling.c b/lib/THNN/generic/VolumetricAveragePooling.c
new file mode 100644
index 0000000..49b311e
--- /dev/null
+++ b/lib/THNN/generic/VolumetricAveragePooling.c
@@ -0,0 +1,309 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
+#else
+
+/* Single-sample kernel for VolumetricAveragePooling forward.
+ *
+ * input_p / output_p are raw contiguous buffers laid out as
+ * (nslices x itime x iheight x iwidth) and
+ * (nslices x otime x oheight x owidth). Each output element is the mean
+ * of its kT x kH x kW input window (no padding, strides dT/dH/dW).
+ * Slices are independent, hence the OpenMP loop over k.
+ */
+static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j, ti;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* local pointers */
+          real *ip = input_p + k * itime * iwidth * iheight
+            + ti * iwidth * iheight * dT +  i * iwidth * dH + j * dW;
+          real *op = output_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+
+          /* compute local sum: */
+          real sum = 0.0;
+          int x, y, z;
+
+          for (z=0; z < kT; z++)
+          {
+            for (y = 0; y < kH; y++)
+            {
+              for (x = 0; x < kW; x++)
+              {
+                sum +=  *(ip + z * iwidth * iheight + y * iwidth + x);
+              }
+            }
+          }
+
+          /* set output to the window average */
+          *op = sum / (kT * kW * kH);
+        }
+      }
+    }
+  }
+}
+
+/* VolumetricAveragePooling forward pass (CPU).
+ *
+ * input is 4D (nslices x time x height x width) or 5D batch-mode.
+ * Output sizes use floor mode with no padding:
+ * o = (i - k) / d + 1 per dimension. The actual pooling is delegated to
+ * the per-sample frame kernel above; in batch mode the samples are
+ * processed in parallel.
+ */
+void THNN_(VolumetricAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH)
+{
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  /* dimension indices for the 4D layout; shifted by one in batch mode */
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
+    "input image smaller than kernel size"
+  );
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime   = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth  = input->size[dimw];
+  otime   = (itime   - kT) / dT + 1;
+  oheight = (iheight - kH) / dH + 1;
+  owidth  = (iwidth  - kW) / dW + 1;
+
+  /* get contiguous input (the frame kernel uses raw pointers) */
+  input = THTensor_(newContiguous)(input);
+
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    /* resize output */
+    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    THNN_(VolumetricAveragePooling_updateOutput_frame)(
+      input_data, output_data, nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      kT, kW, kH,
+      dT, dW, dH
+    );
+  }
+  else  /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    /* flat per-sample strides into the contiguous buffers */
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+    /* resize output */
+    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    /* samples are independent */
+#pragma omp parallel for private(p)
+    for (p=0; p < nBatch; p++)
+    {
+      THNN_(VolumetricAveragePooling_updateOutput_frame)(
+        input_data + p * istride, output_data + p * ostride, nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        kT, kW, kH,
+        dT, dW, dH
+      );
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+/* Single-sample kernel for VolumetricAveragePooling backward.
+ *
+ * Distributes every gradOutput element uniformly (divided by kT*kW*kH)
+ * over the kT x kH x kW input window that produced it. gradInput_p must
+ * be pre-zeroed by the caller; overlapping windows accumulate via +=.
+ * Slices are independent, hence the OpenMP loop over k.
+ */
+static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j, ti;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* local pointers */
+          real *ip = gradInput_p + k * itime * iwidth * iheight
+            + ti * iwidth * iheight * dT +  i * iwidth * dH + j * dW;
+          real *op = gradOutput_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+
+          /* scatter gradients out to footprint: */
+          real val  = *op / (kT * kW * kH);
+          int x,y,z;
+          for (z=0; z < kT; z++)
+          {
+            for (y = 0; y < kH; y++)
+            {
+              for (x = 0; x < kW; x++)
+              {
+                *(ip + z * iwidth * iheight + y * iwidth + x) += val;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/* VolumetricAveragePooling backward pass (CPU).
+ *
+ * Spreads every gradOutput element uniformly (divided by kT*kW*kH in the
+ * frame helper above) over the input window that produced it. gradInput
+ * is zeroed first so overlapping windows accumulate correctly.
+ *
+ * Changes from the original: adds the same 4D/5D THArgCheck as
+ * updateOutput (previously malformed inputs were indexed unchecked), and
+ * widens the size locals from int to long for consistency with
+ * updateOutput and with the long parameters of the frame kernel.
+ */
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH)
+{
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  /* dimension indices for the 4D layout; shifted by one in batch mode */
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  /* same shape contract as updateOutput */
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  /* get contiguous gradOutput (the frame kernel uses raw pointers) */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize and zero */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  otime = gradOutput->size[dimt];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  /* backprop */
+  if (input->nDimension == 4) /* non-batch mode*/
+  {
+    THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+      gradInput_data, gradOutput_data, nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      kT, kW, kH,
+      dT, dW, dH
+    );
+  }
+  else /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    /* flat per-sample strides into the contiguous buffers */
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+    /* samples are independent */
+#pragma omp parallel for private(p)
+    for (p = 0; p < nBatch; p++)
+    {
+      THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+        gradInput_data  + p * istride, gradOutput_data + p * ostride, nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        kT, kW, kH,
+        dT, dW, dH
+      );
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricConvolution.c b/lib/THNN/generic/VolumetricConvolution.c
new file mode 100644
index 0000000..852dd54
--- /dev/null
+++ b/lib/THNN/generic/VolumetricConvolution.c
@@ -0,0 +1,247 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
+#else
+
+/* VolumetricConvolution forward pass (CPU).
+ *
+ * weight is 5D (nOutputPlane x nInputPlane x kT x kH x kW); input is 4D
+ * (nInputPlane x depth x height x width) or 5D batch-mode. Output is
+ * first filled with the per-plane bias, then the "valid" 3D convolution
+ * is accumulated on top via conv3Dmv. finput/fgradInput are unused here
+ * (kept for signature parity with the CUDA backend).
+ *
+ * BUGFIX: the padding guard was inverted. It previously rejected the
+ * supported pT==pW==pH==0 case and silently accepted non-zero padding,
+ * which the CPU backend does not implement. The assertion now requires
+ * all paddings to be zero.
+ */
+void THNN_(VolumetricConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,     // only used by cuda impl
+          THTensor *fgradInput, // only used by cuda impl
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  /* dimension indices for the 4D layout; shifted by one in batch mode */
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  long nOutputPlane = weight->size[0];
+  long kT           = weight->size[2];
+  long kH           = weight->size[3];
+  long kW           = weight->size[4];
+  long inputDepth   = input->size[dimt];
+  long inputHeight  = input->size[dimh];
+  long inputWidth   = input->size[dimw];
+  /* "valid" convolution sizes (no padding) */
+  long outputDepth  = (inputDepth - kT) / dT + 1;
+  long outputWidth  = (inputWidth - kW) / dW + 1;
+  long outputHeight = (inputHeight - kH) / dH + 1;
+  THTensor *outn = THTensor_(new)();
+  long i, j;
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+    /* add bias */
+    for (i = 0; i < bias->size[0]; i++)
+    {
+      THTensor_(select)(outn, output, 0, i);
+      THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+    }
+
+    /* do convolutions */
+    THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
+  }
+  else /* batch mode */
+  {
+    long nBatch = input->size[0];
+    THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor *inb = THTensor_(new)();
+    THTensor *outb = THTensor_(new)();
+
+    /* loop over batches */
+    for (j = 0; j < nBatch; j++)
+    {
+      THTensor_(select)(inb, input, 0, j);
+      THTensor_(select)(outb, output, 0, j);
+
+      /* add bias */
+      for (i = 0; i < bias->size[0]; i++)
+      {
+        THTensor_(select)(outn, outb, 0, i);
+        THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+      }
+
+      /* do convolutions */
+      THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
+    }
+
+    THTensor_(free)(inb);
+    THTensor_(free)(outb);
+  }
+  THTensor_(free)(outn);
+}
+
+/* VolumetricConvolution backward pass wrt input (CPU).
+ *
+ * Computes gradInput as the full ("F") convolution of gradOutput with
+ * the weight tensor transposed in its first two dimensions. finput is
+ * unused here (kept for signature parity with the CUDA backend).
+ *
+ * BUGFIX: the padding guard was inverted (see updateOutput); it now
+ * requires pT == pW == pH == 0, the only case the CPU backend supports.
+ */
+void THNN_(VolumetricConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput, // only used by cuda impl
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
+
+  THArgCheck(weight->nDimension == 5, 4,
+    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+  );
+
+  int nOutputPlane = (int)weight->size[0];
+
+  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  /* plane dimension index; shifted by one in batch mode */
+  int dimPlane = 0;
+  if (gradOutput->nDimension == 5)
+  {
+    dimPlane++;
+  }
+
+  THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+    "Number of output features is not equal to nOutputPlane"
+  );
+
+  /* gradient to input: full convolution with the plane-transposed weight */
+  THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
+  if (gradOutput->nDimension == 4) /* non-batch mode */
+  {
+    THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
+  }
+  else /* batch mode */
+  {
+    long nBatch = gradOutput->size[0];
+    THTensor *ginpb = THTensor_(new)();
+    THTensor *goutb = THTensor_(new)();
+    long j;
+
+    THTensor_(resize5d)(gradInput,
+      input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
+    );
+
+    /* loop over batches */
+    for (j = 0; j < nBatch; j++)
+    {
+      THTensor_(select)(ginpb, gradInput, 0, j);
+      THTensor_(select)(goutb, gradOutput, 0, j);
+      THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
+    }
+    THTensor_(free)(ginpb);
+    THTensor_(free)(goutb);
+  }
+
+  THTensor_(free)(tweight);
+}
+
+/* VolumetricConvolution backward pass wrt parameters (CPU).
+ *
+ * Accumulates scale * dL/dBias (per-plane sum of gradOutput) into
+ * gradBias and scale * dL/dWeight (conv3DRevger of input with
+ * gradOutput) into gradWeight. finput/fgradInput are unused here (kept
+ * for signature parity with the CUDA backend).
+ *
+ * Changes from the original: the inverted padding guard is fixed (see
+ * updateOutput) to require pT == pW == pH == 0, and in batch mode the
+ * gradOutSlice tensor is allocated once instead of once per sample.
+ */
+void THNN_(VolumetricConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,     // only used by cuda impl
+          THTensor *fgradInput, // only used by cuda impl
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          real scale)
+{
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
+
+  THArgCheck(gradWeight->nDimension == 5, 4,
+    "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+  );
+
+  int nOutputPlane = (int)gradWeight->size[0];
+
+  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+    "gradBias tensor has wrong size"
+  );
+
+  long k;
+  real *gradBias_data;
+  THTensor *gradOutSlice;
+  /* plane dimension index; shifted by one in batch mode */
+  int dimPlane = 0;
+  if (gradOutput->nDimension == 5)
+  {
+    dimPlane++;
+  }
+
+  THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+    "Number of output features is not equal to nOutputPlane"
+  );
+
+  if (gradOutput->nDimension == 4) /* non-batch mode */
+  {
+    /* gradient to bias */
+    gradBias_data = THTensor_(data)(gradBias);
+    gradOutSlice = THTensor_(new)();
+    for (k = 0; k < nOutputPlane; k++)
+    {
+      THTensor_(select)(gradOutSlice, gradOutput, 0, k);
+      gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+    }
+    THTensor_(free)(gradOutSlice);
+
+    /* gradient to kernels */
+    THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
+  }
+  else /* batch mode */
+  {
+    long nBatch = gradOutput->size[0];
+    THTensor *inpb = THTensor_(new)();
+    THTensor *goutb = THTensor_(new)();
+    long j;
+
+    /* hoisted out of the loop: the slice tensor and the raw bias pointer
+     * are the same for every batch sample */
+    gradBias_data = THTensor_(data)(gradBias);
+    gradOutSlice = THTensor_(new)();
+
+    /* loop over batches */
+    for (j = 0; j < nBatch; j++)
+    {
+      THTensor_(select)(inpb, input, 0, j);
+      THTensor_(select)(goutb, gradOutput, 0, j);
+
+      /* gradient to bias */
+      for (k = 0; k < nOutputPlane; k++)
+      {
+        THTensor_(select)(gradOutSlice, goutb, 0, k);
+        gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+      }
+
+      /* gradient to kernels */
+      THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
+    }
+    THTensor_(free)(gradOutSlice);
+    THTensor_(free)(inpb);
+    THTensor_(free)(goutb);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
new file mode 100644
index 0000000..a226350
--- /dev/null
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -0,0 +1,514 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
+#else
+
+/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
+/* Scatter-add the unfolded column buffer `finput` back into the spatial
+ * tensor `input`: each column element is added to the input location it was
+ * originally copied from, so overlapping kernel windows accumulate.  Used to
+ * fold fgradInput columns into gradInput.
+ * NOTE(review): the omp pragma below is left disabled — presumably because
+ * different (kt,kh,kw) iterations add into overlapping input positions;
+ * confirm before re-enabling. */
+static void THNN_(unfolded_acc_vol)(
+          THTensor *finput,
+          THTensor *input,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int nInputPlane,
+          int inputDepth,
+          int inputWidth,
+          int inputHeight,
+          int outputDepth,
+          int outputWidth,
+          int outputHeight)
+{
+  int nip;
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+
+//#pragma omp parallel for private(nip)
+  for (nip = 0; nip < nInputPlane; nip++)
+  {
+    int kt, kw, kh, t, y, x, it, ix, iy;
+    for (kt = 0; kt < kT; kt++)
+    {
+      for (kh = 0; kh < kH; kh++)
+      {
+        for (kw = 0; kw < kW; kw++)
+        {
+          /* src points at the (nip,kt,kh,kw) slab of the column buffer */
+          real *src = finput_data
+            + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+            + kt  * (kH*kW*outputDepth*outputHeight*outputWidth)
+            + kh  * (kW*outputDepth*outputHeight*outputWidth)
+            + kw  * (outputDepth*outputHeight*outputWidth);
+
+          real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
+          if (pT > 0 || pH > 0 || pW > 0)
+          {
+            /* padded case: every destination index must be bounds-checked */
+            for (t = 0; t < outputDepth; t++)
+            {
+              it = t*dT - pT + kt;
+              for (y = 0; y < outputHeight; y++)
+              {
+                iy = y*dH - pH + kh;
+                for (x = 0; x < outputWidth; x++)
+                {
+                  ix = x*dW - pW + kw;
+                  if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+                  {
+                    /* (it,iy,ix) falls in the zero padding: nothing to add */
+                  }
+                  else
+                  {
+                    THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+                  }
+                }
+              }
+            }
+          }
+          else
+          {
+            /* unpadded case: all indices are in range by construction */
+            for (t = 0; t < outputDepth; t++)
+            {
+              it = t*dT + kt;
+              for (y = 0; y < outputHeight; y++)
+              {
+                iy = y*dH + kh;
+                for(x = 0; x < outputWidth; x++)
+                {
+                  ix = x*dW + kw;
+                  THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/* vol2col-style unfold: copy every kT x kH x kW window of `input` (stride
+ * d*, zero padding p*) into the column buffer `finput`, laid out as
+ * (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth), so the
+ * convolution reduces to a single matrix multiplication.  The flat loop
+ * index k enumerates one (plane, kt, kh, kw) combination per iteration. */
+static void THNN_(unfolded_copy_vol)(
+          THTensor *finput,
+          THTensor *input,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          int nInputPlane,
+          int inputDepth,
+          int inputWidth,
+          int inputHeight,
+          int outputDepth,
+          int outputWidth,
+          int outputHeight)
+{
+  long k;
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+// #pragma omp parallel for private(k)
+  for (k = 0; k < nInputPlane*kT*kH*kW; k++)
+  {
+    /* decode k into (plane, kt, kh, kw) */
+    int nip = k / (kT*kH*kW);
+    int rest = k % (kT*kH*kW);
+    int kt = rest / (kH*kW);
+    rest = rest % (kH*kW);
+    int kh = rest / kW;
+    int kw = rest % kW;
+    int t,x,y,it,ix,iy;
+    real *dst = finput_data
+      + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+      + kt  * (kH*kW*outputDepth*outputHeight*outputWidth)
+      + kh  * (kW*outputDepth*outputHeight*outputWidth)
+      + kw  * (outputDepth*outputHeight*outputWidth);
+    real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);
+
+    if (pT > 0 || pH > 0 || pW > 0)
+    {
+      /* padded case: out-of-range source positions are written as zeros */
+      for (t = 0; t < outputDepth; t++)
+      {
+        it = t*dT - pT + kt;
+        for (y = 0; y < outputHeight; y++)
+        {
+          iy = y*dH - pH + kh;
+          for (x = 0; x < outputWidth; x++)
+          {
+            ix = x*dW - pW + kw;
+            if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+              memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
+            else
+              memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+          }
+        }
+      }
+    }
+    else
+    {
+      /* unpadded case: straight copy, no bounds checks needed */
+      for (t = 0; t < outputDepth; t++)
+      {
+        it = t*dT + kt;
+        for (y = 0; y < outputHeight; y++)
+        {
+          iy = y*dH + kh;
+          for(x = 0; x < outputWidth; x++)
+          {
+            ix = x*dW + kw;
+            memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+          }
+        }
+      }
+    }
+  }
+}
+
+/* One sample of the MM-based forward pass: unfold `input` into `finput`,
+ * view `output` as a 2D matrix (nOutputPlane x oT*oH*oW), fill each output
+ * plane with its bias value, then compute output2d += weight * finput. */
+static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          long nInputPlane,
+          long inputDepth,
+          long inputWidth,
+          long inputHeight,
+          long nOutputPlane,
+          long outputDepth,
+          long outputWidth,
+          long outputHeight)
+{
+  long i;
+  THTensor *output2d;
+
+  THNN_(unfolded_copy_vol)(
+    finput, input,
+    kT, kW, kH,
+    dT, dW, dH,
+    pT, pW, pH,
+    nInputPlane,
+    inputDepth, inputWidth, inputHeight,
+    outputDepth, outputWidth, outputHeight
+  );
+
+  /* 2D view sharing output's storage; -1 strides mean "compute contiguous" */
+  output2d = THTensor_(newWithStorage2d)(
+    output->storage, output->storageOffset, nOutputPlane, -1,
+    outputDepth*outputHeight*outputWidth, -1
+  );
+
+  for (i = 0; i < nOutputPlane; i++)
+  {
+    /* broadcast bias[i] over the whole i-th output plane */
+    THVector_(fill)(
+      output->storage->data+output->storageOffset+output->stride[0]*i,
+      THTensor_(get1d)(bias, i),
+      outputDepth*outputHeight*outputWidth
+    );
+  }
+
+  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+  THTensor_(free)(output2d);
+}
+
+/* Forward pass of 3-D convolution implemented as unfold + GEMM.
+ * input:  4D (nInputPlane,T,H,W) or 5D batch (N,nInputPlane,T,H,W)
+ * weight: 2D, nOutputPlane x (nInputPlane*kT*kH*kW)
+ * `output` and the column buffer `finput` are resized here.
+ * Review fixes: the degenerate-size check now also rejects outputDepth < 1
+ * (previously only H/W were checked, so a non-positive depth slipped
+ * through to resize4d/resize5d), and the THError format uses %ld to match
+ * the long arguments (passing long through %d is undefined on LP64). */
+void THNN_(VolumetricConvolutionMM_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  int dimf = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  long nInputPlane;
+  long inputDepth;
+  long inputHeight;
+  long inputWidth;
+  long nOutputPlane;
+  long outputDepth;
+  long outputHeight;
+  long outputWidth;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D(batch mode) tensor expected"
+  );
+
+  if (input->nDimension == 5)
+  {
+    /* batch mode: shift plane/depth/height/width dims right by one */
+    dimf++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  nInputPlane = input->size[dimf];
+  inputDepth = input->size[dimt];
+  inputHeight  = input->size[dimh];
+  inputWidth   = input->size[dimw];
+  nOutputPlane = weight->size[0];
+  outputDepth  = (inputDepth + 2*pT - kT) / dT + 1;
+  outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+  outputWidth  = (inputWidth + 2*pW - kW) / dW + 1;
+
+  if (outputDepth < 1 || outputHeight < 1 || outputWidth < 1)
+  {
+    THError(
+      "Given input size: (%ldx%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ldx%ld). Output size is too small",
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      nOutputPlane, outputDepth, outputHeight, outputWidth
+    );
+  }
+
+  if (input->nDimension == 4)
+  {
+    THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+    THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+      input, output, weight, bias, finput,
+      kT, kW, kH,
+      dT, dW, dH,
+      pT, pW, pH,
+      nInputPlane, inputDepth, inputWidth, inputHeight,
+      nOutputPlane, outputDepth, outputWidth, outputHeight
+    );
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+    THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+    THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+// #pragma omp parallel for private(t)
+    for (t = 0; t < T; t++)
+    {
+      /* process each sample of the batch independently */
+      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+      THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+        input_t, output_t, weight, bias, finput_t,
+        kT, kW, kH,
+        dT, dW, dH,
+        pT, pW, pH,
+        nInputPlane, inputDepth, inputWidth, inputHeight,
+        nOutputPlane, outputDepth, outputWidth, outputHeight
+      );
+
+      THTensor_(free)(input_t);
+      THTensor_(free)(output_t);
+      THTensor_(free)(finput_t);
+    }
+  }
+}
+
+/* One sample of the MM-based backward-to-input pass:
+ * fgradInput = weight * gradOutput2d (the caller passes `weight` already
+ * transposed in place — see updateGradInput), then scatter-add the columns
+ * back into the zeroed gradInput via unfolded_acc_vol.  Note that the
+ * helper takes (depth, width, height), hence sizes [1], [3], [2]. */
+static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+          THTensor *gradInput,
+          THTensor *gradOutput,
+          THTensor *weight,
+          THTensor *fgradInput,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  /* flatten gradOutput to 2D: nOutputPlane x (oT*oH*oW), sharing storage */
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+    gradOutput->storage, gradOutput->storageOffset,
+    gradOutput->size[0], -1,
+    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+  );
+
+  /* beta = 0: overwrite fgradInput rather than accumulate into it */
+  THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+  THTensor_(free)(gradOutput2d);
+
+  THTensor_(zero)(gradInput);
+
+  THNN_(unfolded_acc_vol)(
+    fgradInput, gradInput,
+    kT, kW, kH,
+    dT, dW, dH,
+    pT, pW, pH,
+    gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
+    gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
+  );
+}
+
+/* Backward-to-input pass of the MM-based 3-D convolution.
+ * Transposes `weight` in place for the duration of the computation and
+ * restores it afterwards.  NOTE(review): this temporary in-place transpose
+ * makes the function non-reentrant on a shared weight tensor — confirm
+ * callers never run it concurrently on the same weights. */
+void THNN_(VolumetricConvolutionMM_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  // number of input/output planes and kernel size is indirectly defined by the weight tensor
+  THArgCheck(weight->nDimension == 2, 4,
+    "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
+  );
+
+  int nOutputPlane = (int)weight->size[0];
+
+  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
+    "Number of output features is not equal to nOutputPlane"
+  );
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(resizeAs)(fgradInput, finput);
+  /* transpose weight in place; restored after the loop below */
+  THTensor_(transpose)(weight, weight, 0, 1);
+
+  if (input->nDimension == 4)
+  {
+    THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+      gradInput, gradOutput, weight, fgradInput,
+      kT, kW, kH,
+      dT, dW, dH,
+      pT, pW, pH
+    );
+  }
+  else
+  {
+    long T = input->size[0];
+    long t;
+
+//#pragma omp parallel for private(t)
+    for (t = 0; t < T; t++)
+    {
+      /* per-sample views into the batch tensors */
+      THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+      THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+      THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+        gradInput_t, gradOutput_t, weight, fgradInput_t,
+        kT, kW, kH,
+        dT, dW, dH,
+        pT, pW, pH
+      );
+
+      THTensor_(free)(gradInput_t);
+      THTensor_(free)(gradOutput_t);
+      THTensor_(free)(fgradInput_t);
+    }
+  }
+
+  /* undo the in-place transpose so weight is back to its original layout */
+  THTensor_(transpose)(weight, weight, 0, 1);
+}
+
+/* One sample of the parameter-gradient pass:
+ * gradWeight += scale * gradOutput2d * finput^T (finput transposed in place
+ * and restored), and gradBias[i] += scale * sum of the i-th gradOutput
+ * plane. */
+static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          real scale)
+{
+  long i;
+  /* flatten gradOutput to 2D: nOutputPlane x (oT*oH*oW), sharing storage */
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+    gradOutput->storage, gradOutput->storageOffset,
+    gradOutput->size[0], -1,
+    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+  );
+
+  THTensor_(transpose)(finput, finput, 0, 1);
+  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+  THTensor_(transpose)(finput, finput, 0, 1);
+
+  for (i = 0; i < gradBias->size[0]; i++)
+  {
+    long k;
+    real sum = 0;
+    /* rows of the newWithStorage2d view are contiguous, so a flat walk
+       over size[1] elements covers the whole i-th plane */
+    real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+    for (k = 0; k < gradOutput2d->size[1]; k++)
+      sum += data[k];
+
+    (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+  }
+
+  THTensor_(free)(gradOutput2d);
+}
+
+/* Accumulate weight and bias gradients for the MM-based 3-D convolution.
+ * Validates gradWeight/gradBias shapes against gradOutput, then runs the
+ * per-sample frame helper once (4D input) or once per batch element (5D),
+ * accumulating into the same gradWeight/gradBias each time. */
+void THNN_(VolumetricConvolutionMM_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          real scale)
+{
+  THArgCheck(gradWeight->nDimension == 2, 4,
+    "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
+  );
+
+  int nOutputPlane = (int)gradWeight->size[0];
+
+  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+    "gradBias tensor has wrong size"
+  );
+
+  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
+    "Number of output features is not equal to nOutputPlane"
+  );
+
+  if (input->nDimension == 4)   // non-batch mode
+  {
+    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+  }
+  else  // batch mode
+  {
+    long T = input->size[0];
+    long t;
+
+    for (t = 0; t < T; t++)
+    {
+      /* per-sample views; gradients accumulate across the batch */
+      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+      THTensor_(free)(gradOutput_t);
+      THTensor_(free)(finput_t);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
new file mode 100644
index 0000000..5a6a1a7
--- /dev/null
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -0,0 +1,458 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
+#else
+
+/* im2col generalized to three spatial dims: copy each (kT,kH,kW) window of
+ * data_vol into one column of data_col; positions that fall into the zero
+ * padding produce explicit zeros.  data_col layout:
+ * (channels*kT*kH*kW) x (depth_col*height_col*width_col). */
+static void THNN_(vol2col)(
+  const real *data_vol, const int channels,
+  const int depth, const int height, const int width,
+  const int kT, const int kH, const int kW,
+  const int pT, const int pH, const int pW,
+  const int dT, const int dH, const int dW,
+  real *data_col)
+{
+  int c, t, h, w;
+  int depth_col  = (depth  + 2 * pT - kT) / dT + 1;
+  int height_col = (height + 2 * pH - kH) / dH + 1;
+  int width_col  = (width  + 2 * pW - kW) / dW + 1;
+  int channels_col = channels * kT * kH * kW;
+  for (c = 0; c < channels_col; ++c)
+  {
+    /* decode c into (channel, kernel-t, kernel-h, kernel-w) offsets */
+    int w_offset = c % kW;
+    int h_offset = (c / kW) % kH;
+    int t_offset = (c / kW / kH) % kT;
+    int c_vol = c / kT / kH / kW;
+    for (t = 0; t < depth_col; ++t)
+    {
+      for (h = 0; h < height_col; ++h)
+      {
+        for (w = 0; w < width_col; ++w)
+        {
+          int t_pad = t * dT - pT + t_offset;
+          int h_pad = h * dH - pH + h_offset;
+          int w_pad = w * dW - pW + w_offset;
+          if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+            data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
+              data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
+          else
+            data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0;
+        }
+      }
+    }
+  }
+}
+
+/* Inverse of vol2col: fold columns back into the volume, summing the
+ * contributions of overlapping kernel windows.  data_vol is zeroed first
+ * so the += accumulation starts from a clean slate; padding positions are
+ * simply skipped. */
+static void THNN_(col2vol)(
+  const real* data_col, const int channels,
+  const int depth, const int height, const int width,
+  const int kT, const int kH, const int kW,
+  const int pT, const int pH, const int pW,
+  const int dT, const int dH, const int dW,
+  real* data_vol)
+{
+  int c, t, h, w;
+  memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
+  int depth_col = (depth + 2 * pT - kT) / dT + 1;
+  int height_col = (height + 2 * pH - kH) / dH + 1;
+  int width_col = (width + 2 * pW - kW) / dW + 1;
+  int channels_col = channels * kT * kH * kW;
+  for (c = 0; c < channels_col; ++c)
+  {
+    /* decode c into (channel, kernel-t, kernel-h, kernel-w) offsets */
+    int w_offset = c % kW;
+    int h_offset = (c / kW) % kH;
+    int t_offset = (c / kW / kH) % kT;
+    int c_vol = c / kT / kH / kW;
+    for (t = 0; t < depth_col; ++t)
+    {
+      for (h = 0; h < height_col; ++h)
+      {
+        for (w = 0; w < width_col; ++w)
+        {
+          int t_pad = t * dT - pT + t_offset;
+          int h_pad = h * dH - pH + h_offset;
+          int w_pad = w * dW - pW + w_offset;
+          if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+            data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
+              data_col[((c * depth_col + t) * height_col + h) * width_col + w];
+        }
+      }
+    }
+  }
+}
+
+/* Forward pass of the volumetric *transposed* ("full") convolution:
+ * per sample, columns = weight^T * input (GEMM), then col2vol folds the
+ * columns into the larger output volume; bias is added with a ones-vector
+ * GEMM.  A 4D input is temporarily resized in place to 5D batch form and
+ * restored at the end.  NOTE(review): the in-place resize of `input` means
+ * this is not safe to call concurrently on the same input tensor. */
+void THNN_(VolumetricFullConvolution_updateOutput)(
+  THNNState *state,
+  THTensor *input,          // 4D or 5D (batch) tensor
+  THTensor *output,
+  THTensor *weight,         // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+  THTensor *bias,
+  THTensor *finput,         // internal columns buffer
+  THTensor *fgradInput,     // internal ones buffer
+  int dT, int dW, int dH,   // stride of the convolution
+  int pT, int pW, int pH,   // padding
+  int aT, int aW, int aH)   // extra output adjustment
+{
+  THTensor *columns = finput;
+  THTensor *ones    = fgradInput;
+
+  // number of input & output planes and kernel size is indirectly defined by the weight tensor
+  THArgCheck(weight->nDimension == 5, 4,
+    "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
+  );
+
+  const int nInputPlane  = (int)weight->size[0];
+  const int nOutputPlane = (int)weight->size[1];
+  const int kT           = (int)weight->size[2];
+  const int kH           = (int)weight->size[3];
+  const int kW           = (int)weight->size[4];
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+  }
+  else
+  {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  /* transposed-conv output is larger than the input: (i-1)*d - 2p + k + a */
+  const long inputWidth   = input->size[4];
+  const long inputHeight  = input->size[3];
+  const long inputDepth   = input->size[2];
+  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
+  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
+
+  // Batch size + input planes
+  const long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+  {
+    // Resize plane and fill with ones...
+    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; ++elt)
+  {
+    // Matrix mulitply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+    const long n = columns->size[1];
+    const long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      'n', 't',
+      n, m, k,
+      1,
+      THTensor_(data)(input_n), n,
+      THTensor_(data)(weight), m,
+      0,
+      THTensor_(data)(columns), n
+    );
+
+    // Unpack columns back into input:
+    THNN_(col2vol)(
+      THTensor_(data)(columns),
+      nOutputPlane, outputDepth, outputHeight, outputWidth,
+      kT, kH, kW,
+      pT, pH, pW,
+      dT, dH, dW,
+      THTensor_(data)(output_n)
+    );
+
+    // Do Bias after:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long m_ = nOutputPlane;
+    const long n_ = outputDepth * outputHeight * outputWidth;
+    const long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      't', 'n',
+      n_, m_, k_,
+      1,
+      THTensor_(data)(ones), k_,
+      THTensor_(data)(bias), k_,
+      1,
+      THTensor_(data)(output_n), n_
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  if (batch == 0)
+  {
+    /* restore the caller's 4D view of input and collapse output to 4D */
+    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+/* Backward-to-input of the transposed convolution.  Because the forward
+ * pass is a transposed conv, this direction is an ordinary (forward)
+ * convolution of gradOutput with weight: vol2col on gradOutput followed by
+ * one GEMM per sample.  4D tensors are temporarily forced to 5D batch form
+ * and restored at the end (in-place resizes — not concurrency-safe). */
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *gradInput,
+  THTensor *weight,
+  THTensor *finput,
+  THTensor *fgradInput,     // only used by cuda impl
+  int dT, int dW, int dH,   // stride
+  int pT, int pW, int pH,   // padding
+  int aT, int aW, int aH)   // extra output adjustment
+{
+  THTensor *gradColumns = finput;
+
+  // number of input & output planes and kernel size is indirectly defined by the weight tensor
+  THArgCheck(weight->nDimension == 5, 4,
+    "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
+  );
+
+  const int nInputPlane  = (int)weight->size[0];
+  const int nOutputPlane = (int)weight->size[1];
+  const int kT           = (int)weight->size[2];
+  const int kH           = (int)weight->size[3];
+  const int kW           = (int)weight->size[4];
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  /* same output-size arithmetic as the forward pass */
+  const long inputWidth   = input->size[4];
+  const long inputHeight  = input->size[3];
+  const long inputDepth   = input->size[2];
+  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
+  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
+
+  // Batch size + input planes
+  const long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; ++elt)
+  {
+    // Matrix mulitply per sample:
+    THTensor_(select)(gradInput_n, gradInput, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(vol2col)(
+      THTensor_(data)(gradOutput_n),
+      nOutputPlane, outputDepth, outputHeight, outputWidth,
+      kT, kH, kW,
+      pT, pH, pW,
+      dT, dH, dW,
+      THTensor_(data)(gradColumns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long m = weight->size[0];
+    const long n = gradColumns->size[1];
+    const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      'n', 'n',
+      n, m, k,
+      1,
+      THTensor_(data)(gradColumns), n,
+      THTensor_(data)(weight), k,
+      0,
+      THTensor_(data)(gradInput_n), n
+    );
+  }
+
+  // Free
+  THTensor_(free)(gradInput_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize output
+  if (batch == 0)
+  {
+    /* restore all tensors to the caller's original 4D shapes */
+    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+    THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+/* Parameter gradients of the transposed convolution:
+ * per sample, gradWeight += scale * input * vol2col(gradOutput)^T (GEMM)
+ * and gradBias += scale * (gradOutput summed over each output plane, via
+ * GEMV against the `ones` buffer).  4D tensors are temporarily forced to
+ * 5D batch form and restored at the end (in-place resizes). */
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *gradWeight,
+  THTensor *gradBias,
+  THTensor *finput,
+  THTensor *fgradInput,
+  int dT, int dW, int dH,   // stride
+  int pT, int pW, int pH,   // padding
+  int aT, int aW, int aH,   // extra output adjustment
+  real scale)
+{
+  // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor
+  THArgCheck(gradWeight->nDimension == 5, 4,
+    "5D gradWeight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
+  );
+
+  int nInputPlane  = (int)gradWeight->size[0];
+  int nOutputPlane = (int)gradWeight->size[1];
+  int kT           = (int)gradWeight->size[2];
+  int kH           = (int)gradWeight->size[3];
+  int kW           = (int)gradWeight->size[4];
+
+  THTensor *columns = finput;
+  THTensor *ones = fgradInput;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  /* same output-size arithmetic as the forward pass */
+  const long inputWidth   = input->size[4];
+  const long inputHeight  = input->size[3];
+  const long inputDepth   = input->size[2];
+  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
+  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
+
+  // Batch size + input planes
+  const long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+  {
+    // Resize plane and fill with ones...
+    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; ++elt)
+  {
+    // Matrix mulitply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(vol2col)(
+      THTensor_(data)(gradOutput_n), nOutputPlane,
+      outputDepth, outputHeight, outputWidth,
+      kT, kH, kW,
+      pT, pH, pW,
+      dT, dH, dW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long n = columns->size[0];   // nOutputPlane * kt * kh * kw
+    const long m = input_n->size[0];   // nInputPlane
+    const long k = columns->size[1];   // inputHeight * inputWidth
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      't', 'n',
+      n, m, k,
+      scale,
+      THTensor_(data)(columns), k,
+      THTensor_(data)(input_n), k,
+      1,
+      THTensor_(data)(gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long m_ = nOutputPlane;
+    const long k_ = outputDepth * outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    THBlas_(gemv)(
+      't',
+      k_, m_,
+      scale,
+      THTensor_(data)(gradOutput_n), k_,
+      THTensor_(data)(ones), 1,
+      1,
+      THTensor_(data)(gradBias), 1
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  if (batch == 0)
+  {
+    /* restore tensors to the caller's original 4D shapes */
+    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricMaxPooling.c
new file mode 100644
index 0000000..053c02c
--- /dev/null
+++ b/lib/THNN/generic/VolumetricMaxPooling.c
@@ -0,0 +1,392 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
+#else
+
+/* Single-sample (4D) volumetric max pooling kernel.
+ * For each output cell, scans the (clipped) kT x kH x kW window of the
+ * input and writes the max value to output_p; the argmax offset within
+ * the kernel is packed into the bytes of the corresponding indices
+ * element as unsigned chars (z, y, x, 0).  Parallelized over feature
+ * planes (k); planes are disjoint, so writes do not race.
+ */
+static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          real *indz_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j, ti;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* local pointers */
+
+          long start_t = ti * dT - pT;
+          long start_h = i * dH - pH;
+          long start_w = j * dW - pW;
+
+          /* clip the kernel against the leading (padded) edge; the
+             trailing edge is handled by the bounds test in the scan */
+          long kernel_t = fminf(kT, kT + start_t);
+          long kernel_h = fminf(kH, kH + start_h);
+          long kernel_w = fminf(kW, kW + start_w);
+
+          start_t = fmaxf(start_t, 0);
+          start_h = fmaxf(start_h, 0);
+          start_w = fmaxf(start_w, 0);
+
+          real *ip = input_p + k * itime * iwidth * iheight
+            + start_t * iwidth * iheight + start_h * iwidth + start_w;
+          real *op = output_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+          real *indzp = indz_p + k * otime * owidth * oheight
+            + ti * owidth * oheight + i * owidth + j;
+
+          /* compute local max: */
+          real maxval = -THInf;
+          int x,y,z;
+          /* BUGFIX: initialize the argmax so the index bytes written
+             below are always defined; previously mx/my/mz were read
+             uninitialized when no comparison succeeded (e.g. an
+             all-NaN window). */
+          int mx = 0, my = 0, mz = 0;
+
+          for (z = 0; z < kernel_t; z++)
+          {
+            for (y = 0; y < kernel_h; y++)
+            {
+              for (x = 0; x < kernel_w; x++)
+              {
+                if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
+                {
+                  real val = *(ip + z * iwidth * iheight + y * iwidth + x);
+                  if (val > maxval)
+                  {
+                    maxval = val;
+                    // Store indices w.r.t the kernel dimension
+                    mz = z + (kT - kernel_t);
+                    my = y + (kH - kernel_h);
+                    mx = x + (kW - kernel_w);
+                  }
+                }
+              }
+            }
+          }
+
+          // set max values (packed as bytes inside one real element)
+          ((unsigned char*)(indzp))[0] = mz;
+          ((unsigned char*)(indzp))[1] = my;
+          ((unsigned char*)(indzp))[2] = mx;
+          ((unsigned char*)(indzp))[3] = 0;
+
+          /* set output to local max */
+          *op = maxval;
+        }
+      }
+    }
+  }
+}
+
+/* Volumetric (3D) max pooling forward driver.
+ * Accepts a 4D (C x T x H x W) or 5D (N x C x T x H x W) input tensor,
+ * computes the output spatial size (floor or ceil mode), resizes output
+ * and indices accordingly, and dispatches the frame kernel (per sample
+ * in batch mode).  indices stores per-cell argmax kernel offsets packed
+ * as unsigned chars inside each real element.
+ */
+void THNN_(VolumetricMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT,
+          int kW,
+          int kH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH,
+          bool ceilMode)
+{
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch-mode) tensor expected"
+  );
+
+  /* dimension indices into input->size; shifted by one in 5D mode */
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
+    "input image smaller than kernel size"
+  );
+
+  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+    "pad should be smaller than half of kernel size"
+  );
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime   = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth  = input->size[dimw];
+  /* standard pooling output-size formula; ceil mode rounds up so the
+     last partial window is kept rather than dropped */
+  if (ceilMode)
+  {
+    otime   = (int)(ceil((float)(itime   - kT + 2 * pT) / dT) + 1);
+    oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
+    owidth  = (int)(ceil((float)(iwidth  - kW + 2 * pW) / dW) + 1);
+  }
+  else
+  {
+    otime   = (int)(floor((float)(itime   - kT + 2 * pT) / dT) + 1);
+    oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
+    owidth  = (int)(floor((float)(iwidth  - kW + 2 * pW) / dW) + 1);
+  }
+
+  if (pT || pW || pH)
+  {
+    // ensure that the last pooling starts inside the image
+    if ((otime - 1)*dT >= itime + pT)
+      --otime;
+    if ((oheight - 1)*dH >= iheight + pH)
+      --oheight;
+    if ((owidth  - 1)*dW >= iwidth  + pW)
+      --owidth;
+  }
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    /* resize output */
+    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+    /* indices will contain ti,i,j uchar locations packed into float/double */
+    THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    THNN_(VolumetricMaxPooling_updateOutput_frame)(
+      input_data, output_data,
+      indices_data,
+      nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      kT, kW, kH,
+      dT, dW, dH,
+      pT, pW, pH
+    );
+  }
+  else /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+    /* resize output */
+    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+    /* indices will contain ti,i,j locations for each output point */
+    THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    /* NOTE(review): the frame kernel also opens an omp parallel region;
+       nested parallelism is usually disabled by default so the inner
+       region runs serially -- confirm this is intended. */
+#pragma omp parallel for private(p)
+    for (p=0; p < nBatch; p++)
+    {
+      THNN_(VolumetricMaxPooling_updateOutput_frame)(
+        input_data   + p * istride,
+        output_data  + p * ostride,
+        indices_data + p * ostride,
+        nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        kT, kW, kH,
+        dT, dW, dH,
+        pT, pW, pH
+      );
+    }
+  }
+
+  /* cleanup: release the contiguous copy (or extra reference) */
+  THTensor_(free)(input);
+}
+
+/* Single-sample (4D) backward kernel for volumetric max pooling.
+ * Unpacks the argmax kernel offsets stored by updateOutput_frame (three
+ * unsigned chars in the bytes of each indices element), converts them to
+ * absolute input coordinates, and accumulates each gradOutput value into
+ * that input location.  Parallelized over feature planes (k); planes are
+ * disjoint, so the += accumulation does not race across threads.
+ */
+static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          real *indz_p,
+          long nslices,
+          long itime,
+          long iwidth,
+          long iheight,
+          long otime,
+          long owidth,
+          long oheight,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *gradInput_p_k  = gradInput_p  + k * itime * iwidth * iheight;
+    real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
+    real *indz_p_k = indz_p + k * otime * owidth * oheight;
+
+    /* calculate max points */
+    long ti, i, j;
+    for (ti = 0; ti < otime; ti++)
+    {
+      for (i = 0; i < oheight; i++)
+      {
+        for (j = 0; j < owidth; j++)
+        {
+          /* retrieve position of max: kernel offset (byte-packed) plus
+             the window origin gives the absolute input coordinate */
+          real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+          long maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT;
+          long maxi  = ((unsigned char*)(indzp))[1] + i * dH - pH;
+          long maxj  = ((unsigned char*)(indzp))[2] + j * dW - pW;
+
+          /* update gradient */
+          gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+            gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+        }
+      }
+    }
+  }
+}
+
+/* Backward driver for volumetric max pooling: resizes gradInput to match
+ * input, zeroes it, and scatter-accumulates gradOutput through the stored
+ * argmax indices (per sample in batch mode).
+ * NOTE(review): unlike updateOutput, sizes here are held in int and the
+ * gradOutput shape is trusted rather than re-derived -- assumed to be
+ * validated by the calling layer; confirm for direct C callers.
+ */
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  int nslices;
+  int itime;
+  int iheight;
+  int iwidth;
+  int otime;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  /* dimension indices into ->size; shifted by one in 5D (batch) mode */
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  otime = gradOutput->size[dimt];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 4) /* non-batch mode*/
+  {
+    THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+      gradInput_data, gradOutput_data,
+      indices_data,
+      nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      dT, dW, dH,
+      pT, pW, pH
+    );
+  }
+  else /* batch mode: one frame-kernel call per sample */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nBatch; p++)
+    {
+      THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+        gradInput_data + p * istride,
+        gradOutput_data + p * ostride,
+        indices_data + p * ostride,
+        nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        dT, dW, dH,
+        pT, pW, pH
+      );
+    }
+  }
+
+  /* cleanup: release the contiguous gradOutput copy */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricMaxUnpooling.c b/lib/THNN/generic/VolumetricMaxUnpooling.c
new file mode 100644
index 0000000..247dd5f
--- /dev/null
+++ b/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -0,0 +1,325 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
+#else
+
+/* Single-sample (4D) forward kernel for volumetric max unpooling.
+ * For each input cell, unpacks the argmax kernel offsets recorded by the
+ * paired max-pooling forward (three unsigned chars in the bytes of each
+ * indices element) and scatters the input value to that location in the
+ * (pre-zeroed) output.  Parallelized over feature planes (k).
+ */
+static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          real *ind_p,
+          long nslices,
+          long iT,
+          long iW,
+          long iH,
+          long oT,
+          long oW,
+          long oH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    long ti, i, j, maxz, maxy, maxx;
+    for (ti = 0; ti < iT; ti++)
+    {
+      for (i = 0; i < iH; i++)
+      {
+        for (j = 0; j < iW; j++)
+        {
+          long start_t = ti * dT - pT;
+          long start_h = i * dH - pH;
+          long start_w = j * dW - pW;
+
+          //real *output_p_k = output_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
+          real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+          real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+          maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+          maxy = ((unsigned char*)(ind_p_k))[1];
+          maxx = ((unsigned char*)(ind_p_k))[2];
+
+          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+          {
+            /* BUGFIX: all six arguments are long, so use %ld -- passing
+               long varargs under %d is undefined behavior on LP64. */
+            THError(
+              "invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
+              start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
+            );
+          }
+          output_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */
+        }
+      }
+    }
+  }
+}
+
+/* Forward driver for volumetric max unpooling: validates that indices has
+ * the same shape as input, resizes output to (oT, oH, oW) and zeroes it,
+ * then scatters input values through the packed argmax indices (per
+ * sample in batch mode).
+ */
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int oT,
+          int oW,
+          int oH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  /* dimension indices into input->size; shifted by one in 5D mode */
+  int dimw = 3;
+  int dimh = 2;
+  int dimt = 1;
+  int nbatch = 1;
+  int nslices;
+  int iT;
+  int iH;
+  int iW;
+  real *input_data;
+  real *output_data;
+  real *indices_data;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2,
+    "4D or 5D (batch mode) tensor expected"
+  );
+
+  /* indices must mirror input exactly: one packed argmax per element */
+  if (!THTensor_(isSameSizeAs)(input, indices))
+  {
+    THError("Invalid input size w.r.t current indices size");
+  }
+
+  if (input->nDimension == 5)
+  {
+    nbatch = input->size[0];
+    dimt++;
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes (read before newContiguous; sizes are unaffected by it) */
+  nslices = input->size[dimt-1];
+  iT = input->size[dimt];
+  iH = input->size[dimh];
+  iW = input->size[dimw];
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+  indices = THTensor_(newContiguous)(indices);
+
+  /* resize output */
+  if (input->nDimension == 4)
+  {
+    THTensor_(resize4d)(output, nslices, oT, oH, oW);
+    THTensor_(zero)(output);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+      input_data, output_data,
+      indices_data,
+      nslices,
+      iT, iW, iH,
+      oT, oW, oH,
+      dT, dW, dH, pT, pW, pH
+    );
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
+    THTensor_(zero)(output);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THTensor_(data)(indices);
+
+    /* batch mode: one frame-kernel call per sample */
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+        input_data+p*nslices*iT*iW*iH,
+        output_data+p*nslices*oT*oW*oH,
+        indices_data+p*nslices*iT*iW*iH,
+        nslices,
+        iT, iW, iH,
+        oT, oW, oH,
+        dT, dW, dH,
+        pT, pW, pH
+      );
+    }
+  }
+
+  /* cleanup: release the contiguous copies */
+  THTensor_(free)(input);
+  THTensor_(free)(indices);
+}
+
+/* Single-sample (4D) backward kernel for volumetric max unpooling.
+ * Each gradInput cell gathers the gradOutput value from the location its
+ * input was scattered to in the forward pass, recovered from the packed
+ * argmax kernel offsets in indices.  Parallelized over feature planes (k).
+ */
+static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          real *ind_p,
+          long nslices,
+          long iT,
+          long iW,
+          long iH,
+          long oT,
+          long oW,
+          long oH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    long ti, i, j, maxz, maxy, maxx;
+    for (ti = 0; ti < iT; ti++)
+    {
+      for (i = 0; i < iH; i++)
+      {
+        for (j = 0; j < iW; j++)
+        {
+          long start_t = ti * dT - pT;
+          long start_h = i * dH - pH;
+          long start_w = j * dW - pW;
+
+          real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+          //real *gradOutput_p_k = gradOutput_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
+          real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+          maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+          maxy = ((unsigned char*)(ind_p_k))[1];
+          maxx = ((unsigned char*)(ind_p_k))[2];
+
+          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+          {
+            /* BUGFIX: all six arguments are long, so use %ld -- passing
+               long varargs under %d is undefined behavior on LP64. */
+            THError(
+              "invalid max index z= %ld, y= %ld, x= %ld, oT= %ld, oW= %ld, oH= %ld",
+              start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
+            );
+          }
+          *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
+        }
+      }
+    }
+  }
+}
+
+/* Backward driver for volumetric max unpooling: validates sizes, resizes
+ * and zeroes gradInput, then gathers gradients per sample via the frame
+ * kernel.  oT/oW/oH must match the gradOutput spatial dimensions.
+ */
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int oT,
+          int oW,
+          int oH,
+          int dT,
+          int dW,
+          int dH,
+          int pT,
+          int pW,
+          int pH)
+{
+  /* dimension indices into ->size; shifted by one in 5D (batch) mode */
+  int dimw = 3;
+  int dimh = 2;
+  int dimt = 1;
+  int nbatch = 1;
+  int nslices;
+  int iT;
+  int iH;
+  int iW;
+  real *gradInput_data;
+  real *gradOutput_data;
+  real *indices_data;
+
+  /* indices must mirror input exactly: one packed argmax per element */
+  if (!THTensor_(isSameSizeAs)(input, indices))
+  {
+    THError("Invalid input size w.r.t current indices size");
+  }
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  indices = THTensor_(newContiguous)(indices);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 5)
+  {
+    nbatch = input->size[0];
+    dimt++;
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimt-1];
+  iT = input->size[dimt];
+  iH = input->size[dimh];
+  iW = input->size[dimw];
+
+  if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
+  {
+    /* BUGFIX: the message omitted gradOutput's time dimension
+       ("gradOutput: %dx%d" printed only h x w) and passed long tensor
+       sizes under %d (undefined behavior on LP64); report all three
+       dimensions with %ld for the long arguments. */
+    THError(
+      "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %ldx%ldx%ld",
+      oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
+    );
+  }
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THTensor_(data)(indices);
+
+  /* backprop */
+  if (input->nDimension == 4)
+  {
+    THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+      gradInput_data, gradOutput_data,
+      indices_data,
+      nslices,
+      iT, iW, iH,
+      oT, oW, oH,
+      dT, dW, dH,
+      pT, pW, pH
+    );
+  }
+  else
+  {
+    /* batch mode: one frame-kernel call per sample */
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+        gradInput_data+p*nslices*iT*iW*iH,
+        gradOutput_data+p*nslices*oT*oW*oH,
+        indices_data+p*nslices*iT*iW*iH,
+        nslices,
+        iT, iW, iH,
+        oT, oW, oH,
+        dT, dW, dH,
+        pT, pW, pH
+      );
+    }
+  }
+
+  /* cleanup: release the contiguous copies */
+  THTensor_(free)(gradOutput);
+  THTensor_(free)(indices);
+}
+
+#endif
diff --git a/lib/THNN/generic/unfold.c b/lib/THNN/generic/unfold.c
new file mode 100644
index 0000000..25146c0
--- /dev/null
+++ b/lib/THNN/generic/unfold.c
@@ -0,0 +1,158 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/unfold.c"
+#else
+
+#ifdef _WIN32
+# include <windows.h>
+#endif
+
+/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
+/* Col2im-style accumulation: folds the unfolded matrix `finput`
+ * (nInputPlane*kH*kW rows of outputHeight*outputWidth elements) back into
+ * `input`, summing overlapping contributions.  Parallelized over input
+ * planes only, because different (kh, kw) offsets of the same plane write
+ * to overlapping input locations.
+ */
+void THNN_(unfolded_acc)(
+          THTensor *finput,
+          THTensor *input,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          int nInputPlane,
+          int inputWidth,
+          int inputHeight,
+          int outputWidth,
+          int outputHeight)
+{
+#ifdef _WIN32
+  LONG_PTR nip; /* MSVC's OpenMP requires a signed loop index */
+#else
+  size_t nip;
+#endif
+
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+
+  /* NOTE(review): an unsigned (size_t) OpenMP loop index requires
+     OpenMP 3.0+ -- confirm the minimum supported compiler. */
+#pragma omp parallel for private(nip)
+  for(nip = 0; nip < nInputPlane; nip++)
+  {
+    size_t kw, kh, y, x;
+    long long ix = 0, iy = 0;
+    for(kh = 0; kh < kH; kh++)
+    {
+      for(kw = 0; kw < kW; kw++)
+      {
+        real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
+        real *dst = input_data + nip*(inputHeight*inputWidth);
+        if (padW > 0 || padH > 0) {
+          size_t lpad,rpad;
+          for(y = 0; y < outputHeight; y++) {
+            iy = (long long)(y*dH - padH + kh);
+            if (iy < 0 || iy >= inputHeight) {
+              /* whole row falls in vertical padding: nothing to add */
+            } else {
+              if (dW==1){
+                 /* unit horizontal stride: one contiguous vector add per
+                    row, trimmed by the left/right padding overlap */
+                 ix = (long long)(0 - padW + kw);
+                 lpad = fmaxf(0,(int)(padW-kw));
+                 rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
+                 THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+              }
+              else{
+                for (x=0; x<outputWidth; x++){
+                   ix = (long long)(x*dW - padW + kw);
+                   if (ix < 0 || ix >= inputWidth){
+                   }else
+                     THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1);
+                }
+              }
+            }
+          }
+        } else {
+          /* no padding: every source element maps inside the input */
+          for(y = 0; y < outputHeight; y++) {
+            iy = (long long)(y*dH + kh);
+            ix = (long long)(0 + kw);
+            if (dW == 1 )
+               THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
+            else{
+              for(x = 0; x < outputWidth; x++)
+                THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/* Im2col-style copy: expands `input` into the unfolded matrix `finput`
+ * (nInputPlane*kH*kW rows of outputHeight*outputWidth elements) so that
+ * convolution reduces to a single GEMM.  Each (plane, kh, kw) row is an
+ * independent write target, so the loop parallelizes over k cleanly.
+ * Out-of-bounds (padding) positions are zero-filled with memset.
+ */
+void THNN_(unfolded_copy)(
+          THTensor *finput,
+          THTensor *input,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          int nInputPlane,
+          int inputWidth,
+          int inputHeight,
+          int outputWidth,
+          int outputHeight)
+{
+  long k;
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(k)
+  for(k = 0; k < nInputPlane*kH*kW; k++) {
+    /* decode the flat index into (plane, kernel row, kernel col) */
+    size_t nip = k / (kH*kW);
+    size_t rest = k % (kH*kW);
+    size_t kh = rest / kW;
+    size_t kw = rest % kW;
+    size_t x,y;
+    long long ix,iy;
+    real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
+    real *src = input_data + nip*(inputHeight*inputWidth);
+    if (padW > 0 || padH > 0) {
+      size_t lpad,rpad;
+      for(y = 0; y < outputHeight; y++) {
+        iy = (long long)(y*dH - padH + kh);
+        if (iy < 0 || iy >= inputHeight) {
+          /* row is entirely in vertical padding: zero-fill */
+          memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth);
+        } else {
+          if (dW==1){
+             /* unit stride: one contiguous memcpy per row, with the
+                left/right padded margins zero-filled separately */
+             ix = (long long)(0 - padW + kw);
+             lpad = fmaxf(0,(int)(padW-kw));
+             rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
+             if (outputWidth-rpad-lpad <= 0) {
+                memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth);
+             } else {
+                if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad);
+                memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad));
+                if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+             }
+          }
+          else{
+            for (x=0; x<outputWidth; x++){
+               ix = (long long)(x*dW - padW + kw);
+               if (ix < 0 || ix >= inputWidth)
+                 memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1);
+               else
+                 memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1));
+            }
+          }
+        }
+      }
+    } else {
+      /* no padding: straight strided copies */
+      for(y = 0; y < outputHeight; y++) {
+        iy = (long long)(y*dH + kh);
+        ix = (long long)(0 + kw);
+        if (dW == 1)
+           memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth);
+        else{
+          for (x=0; x<outputWidth; x++)
+             memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix+x*dW), sizeof(real)*(1));
+         }
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
new file mode 100644
index 0000000..7c0de94
--- /dev/null
+++ b/lib/THNN/init.c
@@ -0,0 +1,173 @@
+#include "TH.h"
+#include "THNN.h"
+
+#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
+#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
+
+#include "generic/Abs.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/AbsCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/ClassNLLCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialClassNLLCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/DistKLDivCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/ELU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/HardShrink.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/HardTanh.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/L1Cost.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LeakyReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LogSigmoid.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LogSoftMax.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LookupTable.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MSECriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MultiLabelMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MultiMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/PReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/RReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Sigmoid.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SmoothL1Criterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftMax.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftPlus.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftShrink.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SparseLinear.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Sqrt.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Square.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Tanh.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Threshold.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalSubSampling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/BatchNormalization.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/unfold.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionMap.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionMM.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionLocal.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFullConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFullConvolutionMap.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialDilatedConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialAdaptiveMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFractionalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialMaxUnpooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialSubSampling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialUpSamplingNearest.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricConvolutionMM.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricFullConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricMaxUnpooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialReflectionPadding.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialReplicationPadding.c"
+#include "THGenerateFloatTypes.h"
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..f38456d
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,18 @@
+site_name: nn
+theme : simplex
+repo_url : https://github.com/torch/nn
+use_directory_urls : false
+markdown_extensions: [extra]
+docs_dir : doc
+pages:
+- [index.md, Home]
+- [module.md, Modules, Module Interface]
+- [containers.md, Modules, Containers]
+- [transfer.md, Modules, Transfer Functions]
+- [simple.md, Modules, Simple Layers]
+- [table.md, Modules, Table Layers]
+- [convolution.md, Modules, Convolution Layers]
+- [criterion.md, Criterion, Criterions]
+- [overview.md, Additional Documentation, Overview]
+- [training.md, Additional Documentation, Training]
+- [testing.md, Additional Documentation, Testing]
diff --git a/rocks/nn-scm-1.rockspec b/rocks/nn-scm-1.rockspec
new file mode 100644
index 0000000..9b455d9
--- /dev/null
+++ b/rocks/nn-scm-1.rockspec
@@ -0,0 +1,27 @@
+package = "nn"
+version = "scm-1"
+
+-- scm rockspec: always builds the current git HEAD of torch/nn
+source = {
+   url = "git://github.com/torch/nn.git",
+}
+
+description = {
+   summary = "Neural Network package for Torch",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/torch/nn",
+   license = "BSD"
+}
+
+dependencies = {
+   "torch >= 7.0",
+   "luaffi"
+}
+
+-- out-of-tree CMake build; LuaRocks substitutes $(LUA_BINDIR), $(PREFIX),
+-- $(LUA_INCDIR), $(LUA_LIBDIR) and $(MAKE) at install time
+build = {
+   type = "command",
+   build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)"  -DLUA_INCDIR="$(LUA_INCDIR)" -DLUA_LIBDIR="$(LUA_LIBDIR)" && $(MAKE)
+]],
+   install_command = "cd build && $(MAKE) install"
+}
diff --git a/test.lua b/test.lua
new file mode 100644
index 0000000..8bf98ec
--- /dev/null
+++ b/test.lua
@@ -0,0 +1,6307 @@
+-- you can easily test specific units like this:
+-- th -lnn -e "nn.test{'LookupTable'}"
+-- th -lnn -e "nn.test{'LookupTable', 'Add'}"
+
+local mytester = torch.Tester()
+local jac
+local sjac
+
+local precision = 1e-5
+local expprecision = 1e-4
+
+local nntest = torch.TestSuite()
+
+local function equal(t1, t2, msg)
+   if (torch.type(t1) == "table") then
+      for k, v in pairs(t2) do
+         equal(t1[k], t2[k], msg)
+      end
+   else
+      mytester:eq(t1, t2, 0.00001, msg)
+   end
+end
+
+
+--[[ Generate tests to exercise the tostring component of modules. ]]
+local tostringTestModules = {
+    nnLinear = nn.Linear(1, 2),
+    nnReshape = nn.Reshape(10),
+    nnSpatialZeroPadding = nn.SpatialZeroPadding(1, 1, 1, 1)}
+for test_name, component in pairs(tostringTestModules) do
+  nntest['tostring' .. test_name] =
+    function ()
+      mytester:assert(tostring(component):find(
+                         torch.type(component) .. '(', 1, true) ~= nil,
+                      'nn components should have a descriptive tostring' ..
+                      ' beginning with the classname')
+    end
+end
+
+
+function nntest.Add()
+   local inj_vals = {math.random(3,5), 1}  -- Also test the inj = 1 spatial case
+   local ini = math.random(3,5)
+   local ink = math.random(3,5)
+
+   for ind, inj in pairs(inj_vals) do
+      local input = torch.Tensor(ini,inj,ink):zero()
+      local module = nn.Add(ini,inj,ink)
+
+      -- 1D
+      local err = jac.testJacobian(module,input)
+      mytester:assertlt(err,precision, 'error on state ')
+
+      local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+      mytester:assertlt(err,precision, 'error on bias ')
+
+      local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+      mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+      for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+         mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+      end
+
+      -- 2D
+      local nframe = math.random(50,70)
+      local input = torch.Tensor(nframe, ini,inj,ink):zero()
+
+      local err = jac.testJacobian(module,input)
+      mytester:assertlt(err,precision, 'error on state ')
+
+      local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+      mytester:assertlt(err,precision, 'error on bias ')
+
+      local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+      mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+      for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+         mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+      end
+
+      -- IO
+      local ferr,berr = jac.testIO(module,input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+   end
+end
+
+function nntest.CMul()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj,ink):zero()
+   local module = nn.CMul(ini, inj, ink)
+
+   -- 1D
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   -- 2D
+   local nframe = math.random(50,70)
+   local nframe = 5
+   local input = torch.randn(nframe, ini,inj,ink)
+   local output = module:forward(input)
+   local output2 = torch.cmul(input, module.weight:view(1,ini,inj,ink):expandAs(input))
+   mytester:assertTensorEq(output2, output, 0.000001, 'CMul forward 2D err')
+
+   module:zeroGradParameters()
+   local gradWeight = module.gradWeight:clone()
+   local gradInput = module:backward(input, output)
+   local gradInput2 = gradInput:clone():zero()
+   local outputView = output:view(input:size(1), -1)
+   gradInput2:view(input:size(1), -1):addcmul(1, module.weight:view(1,-1):expandAs(outputView), outputView)
+   mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'CMul updateGradInput 2D err')
+   mytester:assert(gradInput:isSameSizeAs(input), 'CMul gradInput 2D size err')
+
+   local inputView = input:view(nframe, -1)
+   local gradWeightView = gradWeight:view(1, -1)
+   for i=1,nframe do
+      gradWeightView:addcmul(1, inputView[i], outputView[i])
+   end
+   mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'CMul accGradParameters 2D err')
+   mytester:assert(module.weight:isSameSizeAs(module.gradWeight), 'CMul gradWeight size err')
+
+   input:zero()
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format('error on weight [%s]', t))
+   end
+
+   -- Non-contiguous input or gradOutput
+   local testModule = nn.CMul(4, 3, 5)
+   local testInput = torch.rand(10, 3, 5):resize(10, 1, 3, 5):expand(10, 4, 3, 5)
+   local testOutput = testModule:forward(testInput)
+
+   mytester:assert(testOutput:isSameSizeAs(testInput), 'CMul non-contiguous forward err')
+
+   local testGradOutput = torch.rand(10, 3, 5):resize(10, 1, 3, 5):expand(10, 4, 3, 5)
+   testOutput = testModule:forward(testInput)
+   local testGradInput = testModule:backward(testOutput, testGradOutput)
+
+   mytester:assert(testGradInput:isSameSizeAs(testGradOutput), 'CMul non-contiguous backward err')
+
+   -- IO
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Dropout()
+   local p = 0.2 --prob of droping out a neuron
+   local input = torch.Tensor(1000):fill((1-p))
+   local module = nn.Dropout(p)
+   -- version 2
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+   -- test inplace version
+   local module = nn.Dropout(p,nil,true)
+   local output = module:forward(input:clone())
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input:clone(), input:clone())
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+
+   -- version 1 (old nnx version)
+   local input = input:fill(1)
+   local module = nn.Dropout(p,true)
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.SpatialDropout()
+   local p = 0.2 --prob of dropiing out a neuron
+   local w = math.random(1,5)
+   local h = math.random(1,5)
+   local nfeats = 1000
+   local input = torch.Tensor(nfeats, w, h):fill(1)
+   local module = nn.SpatialDropout(p)
+   module.train = true
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.SpatialDropoutBatch()
+   local p = 0.2 --prob of dropiing out a neuron
+   local bsz = math.random(1,5)
+   local w = math.random(1,5)
+   local h = math.random(1,5)
+   local nfeats = 1000
+   local input = torch.Tensor(bsz, nfeats, w, h):fill(1)
+   local module = nn.SpatialDropout(p)
+   module.train = true
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.VolumetricDropout()
+   local p = 0.2 --prob of dropiing out a neuron
+   local t = math.random(1,5)
+   local w = math.random(1,5)
+   local h = math.random(1,5)
+   local nfeats = 1000
+   local input = torch.Tensor(nfeats, t, w, h):fill(1)
+   local module = nn.VolumetricDropout(p)
+   module.train = true
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.VolumetricDropoutBatch()
+   local p = 0.2 --prob of dropiing out a neuron
+   local bsz = math.random(1,5)
+   local t = math.random(1,5)
+   local w = math.random(1,5)
+   local h = math.random(1,5)
+   local nfeats = 1000
+   local input = torch.Tensor(bsz, nfeats, t, w, h):fill(1)
+   local module = nn.VolumetricDropout(p)
+   module.train = true
+   local output = module:forward(input)
+   mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+   local gradInput = module:backward(input, input)
+   mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.ReLU()
+   local input = torch.randn(3,4)
+   local gradOutput = torch.randn(3,4)
+   local module = nn.ReLU()
+   local output = module:forward(input)
+   local output2 = input:clone():gt(input, 0):cmul(input)
+   mytester:assertTensorEq(output, output2, 0.000001, 'ReLU output')
+   local gradInput = module:backward(input, gradOutput)
+   local gradInput2 = input:clone():gt(input, 0):cmul(gradOutput)
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ReLU gradInput')
+end
+
+function nntest.Exp()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj,ink):zero()
+   local module = nn.Exp()
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Log()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj,ink):zero()
+   local module = nn.Log()
+
+   local err = jac.testJacobian(module,input, 0.1, 10)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input, 0.1, 10)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.HardTanh()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.HardTanh()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision ,  'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Clamp()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local max_value =  math.abs(math.random())
+   local min_value = -math.abs(math.random())
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Clamp(min_value, max_value)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision ,  'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Abs()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Abs()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision ,  'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Threshold()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Threshold(torch.uniform(-2,2),torch.uniform(-2,2))
+
+   local err = nn.Jacobian.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.ELU()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.ELU(0.3)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision ,  'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.ELUIP()
+   local input = torch.randn(3,4)
+   local input2 = input:clone()
+   local gradOutput = torch.randn(3,4)
+   local gradOutput2 = gradOutput:clone()
+
+   -- Compare in-place to not in-place
+   local module = nn.ELU(0.3, true)
+   local module2 = nn.ELU(0.3, false)
+
+   local output = module:forward(input)
+   local output2 = module2:forward(input2)
+   mytester:assertTensorEq(output, output2, 0.000001, 'ELU output')
+   local gradInput = module:backward(input, gradOutput)
+   local gradInput2 = module2:backward(input2, gradOutput2)
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ELU gradInput')
+end
+
+function nntest.PReLU()
+   local ini = math.random(3,5)
+   local input = torch.Tensor(ini):zero()
+
+   local module = nn.PReLU(ini)
+
+   -- 1D
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                        'error on weight [%s]', t))
+   end
+
+   -- 2D
+   local nframe = math.random(1,7)
+   local input = torch.Tensor(nframe, ini):zero()
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                        'error on weight [%s]', t))
+   end
+
+   -- 4D
+   local nframe = math.random(1,7)
+   local kW, kH = math.random(1,8), math.random(1,8)
+   local input = torch.Tensor(nframe, ini, kW, kH):zero()
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                        'error on weight [%s]', t))
+   end
+
+   -- IO
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.RReLU()
+   local nframe = math.random(1,7)
+   local size = math.random(1,7)
+   local kW, kH = math.random(1,8), math.random(1,8)
+   local input = torch.Tensor(nframe, size, kW, kH):zero()
+
+   local l = 1/math.random(5,8)
+   local u = 1/math.random(3,5)
+
+   -- test in evaluation mode (not inplace), RReLU behaves like LeakyReLU
+   local module = nn.RReLU(l, u, false)
+   mytester:assert(module.train, 'default mode ')
+   module:evaluate()
+
+   -- gradient check
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   -- IO
+   local ferr,berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- test training and evalation mode
+   for _,train in ipairs({true,false}) do
+      -- test with separate output buffer and inplace
+      for _,inplace in ipairs({false,true}) do
+         module = nn.RReLU(l, u, inplace)
+         if train then
+            module:training()
+         else
+            module:evaluate()
+         end
+         input = torch.rand(nframe, size, kW, kH) - 0.5
+         input:storage()[1] = -1
+         local original_input = input:clone()
+         local output = module:forward(input)
+         mytester:assert(output:sign():eq(original_input:sign()):all(), 'sign flipped forward ')
+         local gradOutput = torch.ones(output:size())
+         local gradInput = module:backward(input, gradOutput)
+         mytester:assert(gradInput:gt(0):eq(input:ne(0)):all(), 'gradient ')
+         mytester:assert(gradInput:lt(1):eq(input:le(0)):all(), 'backward negative inputs ')
+         mytester:assert(gradInput:eq(1):eq(input:gt(0)):all(), 'backward positive inputs ')
+         if not train then
+            local err = gradInput[input:le(0)]:mean()-(module.lower+module.upper)/2
+            mytester:assertlt(err, precision, 'error on gradient ')
+         end
+
+         input = -torch.rand(1000)
+         module:forward(input) -- fill internal noise tensor
+         local g = module:backward(input, torch.ones(1000))
+         local err = math.abs(g[input:le(0)]:mean()-(module.lower+module.upper)/2)
+         mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs ')
+      end
+   end
+end
+
+function nntest.LeakyReLU()
+   local input = torch.randn(3,4)
+   local gradOutput = torch.randn(3,4)
+   local negval = math.random()
+   local module = nn.LeakyReLU(negval)
+   local output = module:forward(input)
+   local output2 = input:clone():gt(input, 0):cmul(input) + input:clone():le(input,0):cmul(input) * module.negval
+   mytester:assertTensorEq(output, output2, 0.000001, 'LeakyReLU output')
+   local gradInput = module:backward(input, gradOutput)
+   local gradInput2 = input:clone():gt(input, 0):cmul(gradOutput) + input:clone():le(input,0):cmul(gradOutput) * module.negval
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'LeakyReLU gradInput')
+end
+
+function nntest.LeakyReLUIP()
+   local input = torch.randn(3,4)
+   local gradOutput = torch.randn(3,4)
+   local negval = math.random()
+   local module = nn.LeakyReLU(negval,true)
+   local output = input:clone():gt(input, 0):cmul(input) + input:clone():le(input,0):cmul(input) * module.negval
+   local output2 = module:forward(input)
+   mytester:assertTensorEq(output2, output, 0.000001, 'LeakyReLU output')
+   local gradInput = input:clone():gt(input, 0):cmul(gradOutput) + input:clone():le(input,0):cmul(gradOutput) * module.negval
+   local gradInput2 = module:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'LeakyReLU gradInput')
+end
+
+function nntest.HardShrink()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.HardShrink(math.random()/2)
+
+   local err = nn.Jacobian.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.SoftShrink()
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.SoftShrink(math.random()/2)
+
+   local err = nn.Jacobian.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Power()
+   local in1 = torch.rand(5,7)
+   local module = nn.Power(2)
+   local out = module:forward(in1)
+   local err = out:dist(in1:cmul(in1))
+   mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local pw = torch.uniform()*math.random(1,10)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Power(pw)
+
+   local err = nn.Jacobian.testJacobian(module, input, 0.1, 2)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module,input, 0.1, 2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Normalize()
+   -- compare forward against torch implementation
+   -- and check gradient
+   for _,p in pairs({1,2,3,4,1.5}) do
+      local ini = math.random(3,10)
+      local input = torch.randn(ini)
+      local module = nn.Normalize(p)
+      local out = module:forward(input)
+      local expected = torch.div(input,input:norm(p))
+      mytester:assertTensorEq(out, expected, 1e-7,
+                              torch.typename(module) ..' (' .. p ..') - forward err ')
+
+      local err = jac.testJacobian(module, input, -2, 2)
+      mytester:assertlt(err, precision, 'error norm '..p..' on state ')
+   end
+
+   -- batch mode
+   for _,p in pairs({1,2,3,4,torch.uniform()*math.random(1,10),math.huge}) do
+      local ini = math.random(3,5)
+      local inj = math.random(3,5)
+      local ink = math.random(3,5)
+      local input = torch.Tensor(inj, ini):zero()
+
+      local module = nn.Normalize(p)
+
+      local err = jac.testJacobian(module, input, -2, 2)
+      mytester:assertlt(err, precision, 'error norm '..p..' on state ')
+   end
+
+   -- test IO correctness
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(inj, ini):zero()
+
+   local module = nn.Normalize(2)
+
+   local ferr, berr = jac.testIO(module,input, 0.1, 2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+end
+
+function nntest.Square()
+   local in1 = torch.rand(5,7)
+   local module = nn.Square()
+   local out = module:forward(in1)
+   local err = out:dist(in1:cmul(in1))
+   mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Square()
+
+   local err = nn.Jacobian.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Sqrt()
+   local in1 = torch.rand(5,7)
+   local module = nn.Sqrt()
+   local out = module:forward(in1)
+   local err = out:dist(in1:sqrt())
+   mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+   -- Test zero inputs; we will avoid a div-by-zero by setting to zero
+   local zin = torch.DoubleTensor(5, 7):zero()
+   module:forward(zin)
+   local zgradout = torch.rand(5, 7)
+   local zgradin = module:backward(zin, zgradout)
+   mytester:assertTensorEq(zgradin, torch.DoubleTensor(5, 7):zero(), 0.000001, "error in sqrt backward singularity")
+
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ink, inj, ini):zero()
+
+   local module = nn.Sqrt()
+
+   local err = nn.Jacobian.testJacobian(module, input, 0.1, 2)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = nn.Jacobian.testIO(module, input, 0, 2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+function nntest.Linear()
+   local ini = math.random(3,5)
+   local inj_vals = {math.random(3,5), 1}  -- Also test the inj = 1 spatial case
+   local input = torch.Tensor(ini):zero()
+
+   for ind, inj in pairs(inj_vals) do
+      local module = nn.Linear(ini,inj)
+
+      local function jacTests(module)
+         -- 1D
+         local err = jac.testJacobian(module,input)
+         mytester:assertlt(err,precision, 'error on state ')
+
+         local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+         mytester:assertlt(err,precision, 'error on weight ')
+
+         if module.bias then
+            local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+            mytester:assertlt(err,precision, 'error on bias ')
+         end
+
+         local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+         mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+         if module.bias then
+            local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+            mytester:assertlt(err,precision, 'error on bias [direct update] ')
+         end
+
+         nn.hessian.enable()
+
+         local err = jac.testDiagHessianInput(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+         local err = jac.testDiagHessianWeight(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+         if module.bias then
+            local err = jac.testDiagHessianBias(module, input)
+            mytester:assertlt(err , precision, 'error on diagHessianBias')
+         end
+
+         for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+            mytester:assertlt(err, precision, string.format(
+                                 'error on weight [%s]', t))
+         end
+
+         if module.bias then
+            for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+               mytester:assertlt(err, precision, string.format(
+                                    'error on bias [%s]', t))
+            end
+         end
+
+         -- 2D
+         local nframe = math.random(50,70)
+         local input = torch.Tensor(nframe, ini):zero()
+
+         local err = jac.testJacobian(module,input)
+         mytester:assertlt(err,precision, 'error on state ')
+
+         local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+         mytester:assertlt(err,precision, 'error on weight ')
+
+         if module.bias then
+            local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+            mytester:assertlt(err,precision, 'error on bias ')
+         end
+
+         local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+         mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+         if module.bias then
+            local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+            mytester:assertlt(err,precision, 'error on bias [direct update] ')
+         end
+
+         local err = jac.testDiagHessianInput(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+         local err = jac.testDiagHessianWeight(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+         if module.bias then
+            local err = jac.testDiagHessianBias(module, input)
+            mytester:assertlt(err , precision, 'error on diag HessianBias')
+         end
+
+         for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+            mytester:assertlt(err, precision, string.format(
+                                 'error on weight [%s]', t))
+         end
+
+         if module.bias then
+            for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+               mytester:assertlt(err, precision, string.format(
+                                    'error on bias [%s]', t))
+            end
+         end
+
+         -- IO
+         local ferr,berr = jac.testIO(module,input)
+         mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+         mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+      end
+
+      jacTests(module)
+      module:noBias()
+      jacTests(module)
+      module.bias = torch.Tensor(inj):zero()
+      module.gradBias = torch.Tensor(inj):zero()
+      module:reset()
+      jacTests(module)
+   end  -- for ind, inj in pairs(inj_vals) do
+end
+
+-- Compares nn.SparseLinear against an equivalent dense nn.Linear on random
+-- sparse inputs: forward outputs, gradInputs and parameter updates must agree.
+-- inb: batch size, ini: input dim, inj: output dim, numNonzero: base nnz/row.
+local function test_sparse_linear(inb, ini, inj, numNonzero)
+   local module = nn.SparseLinear(ini,inj, true)
+   local linear = nn.Linear(ini, inj)
+   -- share initial parameters so both modules start identical
+   linear.weight = module.weight:clone()
+   linear.bias = module.bias:clone()
+   module:zeroGradParameters()
+   linear:zeroGradParameters()
+
+   -- Create a random sparse vector
+   local input = {}
+   local nonsparse = torch.zeros(inb, ini)
+   for i=1,inb do
+       local nnz = math.random(1, 3) + numNonzero
+       local inds = torch.randperm(ini)[{{1,nnz}}]
+       -- sparse row format: column 1 = index, column 2 = value
+       input[i] = torch.Tensor(nnz, 2)
+       input[i]:select(2,1):copy(inds)
+       input[i]:select(2,2):copy(torch.rand(nnz))
+       nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
+   end
+   local gradOutput = torch.rand(inb, inj)
+
+   local cmps = {'weight', 'bias', 'gradWeight', 'gradBias'}
+
+   -- Check output wrt linear, non-batch
+   local actual = module:forward(input[1])
+   local expected = linear:forward(nonsparse[1])
+   local actualgi = module:backward(input[1], gradOutput[1])
+   local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+   module:updateParameters(1)
+   linear:updateParameters(1)
+   local err = (expected - actual):abs():max()
+   local gierr = (expectedgi - actualgi[1]:select(2,2)):abs():max()
+   mytester:assertle(err, precision, 'error on result')
+   mytester:assertle(gierr, precision, 'error on gradInput')
+
+   for _,var in ipairs(cmps) do
+        local err = (module[var] - linear[var]):abs():max()
+        mytester:assertle(err, precision, 'error on '..var)
+   end
+   module:zeroGradParameters()
+   linear:zeroGradParameters()
+
+   -- Check output wrt linear, batch
+   -- doing this n times checks for fast last input param updates
+   local test_n_times = function(ntimes)
+      local actual, expected, actualgi, expectedgi
+      for i=1, ntimes do
+         actual = module:forward(input)
+         expected = linear:forward(nonsparse)
+         actualgi = module:backward(input, gradOutput)
+         expectedgi = linear:backward(nonsparse, gradOutput)
+      end
+      module:updateParameters(1)
+      linear:updateParameters(1)
+      local err = (expected - actual):abs():max()
+      local gicheck = torch.Tensor():resizeAs(expectedgi)
+      for i=1,#actualgi do gicheck[i]:copy(actualgi[i]:select(2,2)) end
+      local gierr = (expectedgi - gicheck):abs():max()
+      mytester:assertle(err, precision, 'error on result with ntimes = '..ntimes)
+      mytester:assertle(gierr, precision, 'error on gradInput with ntimes = '..ntimes)
+
+      for _,var in ipairs(cmps) do
+          local err = (module[var] - linear[var]):abs():max()
+          mytester:assertle(err, precision, 'error on '..var..' with ntimes = '..ntimes)
+      end
+
+      module:zeroGradParameters()
+      linear:zeroGradParameters()
+      mytester:assertle(module.gradWeight:sum(), precision, 'error zeroing gradweight')
+      -- BUG FIX: message previously said 'gradweight' for the gradBias check
+      mytester:assertle(module.gradBias:sum(), precision, 'error zeroing gradbias')
+
+   end
+
+   test_n_times(1)
+   test_n_times(2)
+   test_n_times(3)
+
+   -- legacy batch mode
+   local batch = math.random(2,5)
+
+   local input = torch.Tensor(batch, numNonzero, 2):zero()
+   for k=1,batch do
+      -- partial Fisher-Yates shuffle: pick numNonzero distinct indices
+      local N = {}
+      for i = 1, ini do N[i] = i end
+      for i = 1, numNonzero do
+         local j = math.random(i,ini)
+         N[i], N[j] = N[j], N[i]
+      end
+      for i = 1, numNonzero do input[{k,i,1}] = N[i] end
+   end
+   local values = input:select(3,2)
+   values:copy(torch.rand(values:nElement())):mul(2):add(-1)
+
+   -- Check output: batched forward must match per-sample forward
+   local actual = module:forward(input):clone()
+   local expected = torch.Tensor(batch, inj)
+   for k = 1, batch do
+      expected[k]:copy(module:forward(input[k]))
+   end
+   local err = (expected - actual):abs():max()
+   mytester:assertle(err, precision, 'error on batch result forward')
+end
+
+function nntest.SparseLinear()
+   -- Randomized small configuration, plus two fixed large configurations
+   -- that exercise the OMP-parallel code path.
+   local nbatch = math.random(5,10)
+   local nin = math.random(50,100)
+   local nout = math.random(5,10)
+   local nnz = math.random(3,5)
+
+   test_sparse_linear(nbatch, nin, nout, nnz)
+   -- Tests OMP parallelism
+   test_sparse_linear(1, 50000, 10, 20000)
+   test_sparse_linear(1000, 1000, 10, 100)
+end
+
+-- Checks nn.Bilinear: forward against an explicit per-output-slice reference
+-- computation, then gradients via finite-difference Jacobians on a small
+-- network that embeds the Bilinear layer.
+function nntest.Bilinear()
+
+   -- set up data:
+   local N = 10
+   local D1 = 5
+   local D2 = 4
+   local K  = 3
+   local input  = {torch.randn(N, D1), torch.randn(N, D2)}
+   -- NOTE(review): target is declared but never used in this test
+   local target = torch.randn(N, K)
+
+   -- test forward
+   local module = nn.Bilinear(D1, D2, K)
+   local expected = torch.zeros(N,K)
+   for k = 1, K do
+      -- reference: out[n][k] = sum over (input1[n] .* (W[k] * input2[n])) + b[k]
+      local temp = torch.mm(module.weight[k], input[2]:t())
+      temp:cmul(input[1]:t())
+      temp = temp:sum(1)
+      temp:add(module.bias[k])
+      expected[{{},k}] = temp:view(-1)
+   end
+   local output = module:forward(input)
+   mytester:assertTensorEq(expected, output, 0.000001, 'Bilinear forward 2D err')
+
+   -- For testing grads we'll follow the nn.DotProduct strategy of using a SplitTable
+   local input2 = torch.randn(2, N, D1)
+   local module2 = nn.Sequential()
+   module2:add(nn.SplitTable(1))
+   module2:add(nn.ParallelTable():add(nn.Linear(D1,D1)):add(nn.Linear(D1,D2)))
+   module2:add(nn.Bilinear(D1, D2, K))
+   module2:add(nn.Linear(K,1))
+
+   local err = jac.testJacobian(module2, input2)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   -- parameter Jacobians of the embedded Bilinear layer (position 3)
+   local err = jac.testJacobianParameters(module2, input2, module2:get(3).weight, module2:get(3).gradWeight)
+   mytester:assertlt(err, precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module2, input2, module2:get(3).bias, module2:get(3).gradBias)
+   mytester:assertlt(err, precision, 'error on bias ')
+
+end
+
+-- Checks nn.PartialLinear: shapes under varying partitions, agreement with a
+-- plain nn.Linear when the partition is reset, and gradient correctness via
+-- sparse Jacobian tests (sjac) under a fixed sub-partition.
+function nntest.PartialLinear()
+
+   -- settings for experiment:
+   local N = 10
+   local D = 5
+   local K = 15
+
+   -- test forward-backward pass of module:
+   local module = nn.PartialLinear(D, K)
+   for sub_K = 1,K do
+
+      -- get random test case:
+      local input  = torch.randn(N, D)
+      local partition = torch.randperm(K):narrow(1, 1, sub_K)
+
+      -- do forward-backward pass: output must have sub_K columns
+      module:setPartition(partition)
+      module:forward(input)
+      mytester:asserteq(module.output:size(1), N)
+      mytester:asserteq(module.output:size(2), sub_K)
+      module:backward(input, torch.ones(N, sub_K))
+      mytester:asserteq(module.gradInput:size(1), input:size(1))
+      mytester:asserteq(module.gradInput:size(2), input:size(2))
+
+      -- do parameter update:
+      local lr = .01
+      module:updateParameters(lr)
+   end
+   module:resetPartition()
+
+   -- compare output with linear layer:
+   -- (weights are copied from PartialLinear's internal network; bias is zero
+   -- unless PartialLinear itself carries a bias)
+   local module2 = nn.Linear(D, K)
+   module2.weight:copy(module.network:get(1):get(2).weight)
+   module2.bias:fill(0)
+   if module.bias then module2.bias:copy(module.bias) end
+   local input = torch.randn(N, D)
+   local diff = (module:forward(input) - module2:forward(input)):abs():sum()
+   mytester:assertlt(diff, 1e-7)
+
+   -- gradient checks:
+   local sub_K = 5
+   local partition = torch.randperm(K):narrow(1, 1, sub_K)
+   module:setPartition(partition)
+   local err = sjac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = sjac.testJacobianParameters(module, input, module.network:get(1):get(2).weight, module.network:get(1):get(2).gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local err = sjac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err,precision, 'error on bias ')
+
+   local err = sjac.testJacobianUpdateParameters(module, input, module.network:get(1):get(2).weight)
+   mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+   local err = sjac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+   local ferr, berr = sjac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Checks nn.Euclidean (output[o] = ||input - weight[:,o]||): forward against
+-- a hand-computed reference in 1D and 2D (batch), analytic gradients against
+-- hand-derived formulas, then Jacobian and serialization checks.
+function nntest.Euclidean()
+   local ini = math.random(5,7)
+   local inj = math.random(5,7)
+   local input = torch.randn(ini)
+   local gradOutput = torch.randn(inj)
+   local module = nn.Euclidean(ini,inj)
+   local output = module:forward(input):clone()
+
+   -- reference forward: distance to each weight column
+   local output2 = torch.Tensor(inj):zero()
+   for o = 1,module.weight:size(2) do
+      output2[o] = input:dist(module.weight:select(2,o))
+   end
+   mytester:assertTensorEq(output, output2, 0.000001, 'Euclidean forward 1D err')
+
+   -- batch forward must match the single-sample result row-wise
+   local input2 = torch.randn(8, ini)
+   input2[2]:copy(input)
+   local output2 = module:forward(input2)
+   mytester:assertTensorEq(output2[2], output, 0.000001, 'Euclidean forward 2D err')
+
+   local output = module:forward(input):clone()
+   module:zeroGradParameters()
+   local gradInput = module:backward(input, gradOutput, 1):clone()
+   -- reference gradInput: d||x - w_o||/dx = (x - w_o)/||x - w_o||, weighted
+   -- by gradOutput[o]
+   local gradInput2 = torch.zeros(ini)
+   local temp = input:clone()
+   for o = 1,module.weight:size(2) do
+      temp:copy(input)
+      temp:add(-1,module.weight:select(2,o))
+      temp:mul(gradOutput[o]/output[o])
+      gradInput2:add(temp)
+   end
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'Euclidean updateGradInput 1D err')
+
+   -- reference gradWeight: d||x - w_o||/dw_o = (w_o - x)/||x - w_o||
+   local gradWeight = module.gradWeight:clone():zero()
+   for o = 1,module.weight:size(2) do
+      temp:copy(module.weight:select(2,o)):add(-1,input)
+      temp:mul(gradOutput[o]/output[o])
+      gradWeight:select(2,o):add(1, temp)
+   end
+   mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Euclidean accGradParameters 1D err')
+
+   -- batch backward with scale 1/8 over 8 identical rows should accumulate
+   -- the same gradWeight as the single-sample case
+   local input2 = input:view(1, -1):repeatTensor(8, 1)
+   local gradOutput2 = gradOutput:view(1, -1):repeatTensor(8, 1)
+   local output2 = module:forward(input2)
+   module:zeroGradParameters()
+   local gradInput2 = module:backward(input2, gradOutput2, 1/8)
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, 'Euclidean updateGradInput 2D err')
+
+   mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Euclidean accGradParameters 2D err')
+
+   -- Jacobian checks use the slow backward path
+   input:zero()
+   module.fastBackward = false
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Checks nn.WeightedEuclidean: like Euclidean but each dimension is scaled
+-- by diagCov^2. Verifies forward and analytic gradients (weight, diagCov,
+-- input) against hand-computed references in 1D and 2D, then Jacobians.
+function nntest.WeightedEuclidean()
+   local ini = math.random(5,7)
+   local inj = math.random(5,7)
+   local input = torch.randn(ini)
+   local gradOutput = torch.randn(inj)
+   local module = nn.WeightedEuclidean(ini,inj)
+
+   local output = module:forward(input):clone()
+
+   -- reference forward: sqrt(sum((x - w_o)^2 * diagCov_o^2))
+   local output2 = torch.Tensor(inj):zero()
+   local temp = input:clone()
+   for o = 1,module.weight:size(2) do
+      temp:copy(input):add(-1,module.weight:select(2,o))
+      temp:cmul(temp)
+      temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+      output2[o] = math.sqrt(temp:sum())
+   end
+   mytester:assertTensorEq(output, output2, 0.000001, 'WeightedEuclidean forward 1D err')
+
+   -- batch forward must match the single-sample result row-wise
+   local input2 = torch.randn(8, ini)
+   input2[2]:copy(input)
+   local output2 = module:forward(input2)
+   mytester:assertTensorEq(output2[2], output, 0.000001, 'WeightedEuclidean forward 2D err')
+
+   local output = module:forward(input):clone()
+   module:zeroGradParameters()
+   local gradInput = module:backward(input, gradOutput, 1):clone()
+   -- reference gradInput: (x - w_o) * diagCov_o^2 / output[o], per column
+   local gradInput2 = torch.zeros(ini)
+   for o = 1,module.weight:size(2) do
+      temp:copy(input)
+      temp:add(-1,module.weight:select(2,o))
+      temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+      temp:mul(gradOutput[o]/output[o])
+      gradInput2:add(temp)
+   end
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'WeightedEuclidean updateGradInput 1D err')
+
+   -- reference parameter gradients; columns with zero output are skipped to
+   -- avoid dividing by zero
+   local gradWeight = module.gradWeight:clone():zero()
+   local gradDiagCov = module.gradDiagCov:clone():zero()
+   for o = 1,module.weight:size(2) do
+      if output[o] ~= 0 then
+         temp:copy(module.weight:select(2,o)):add(-1,input)
+         temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+         temp:mul(gradOutput[o]/output[o])
+         gradWeight:select(2,o):add(temp)
+
+         temp:copy(module.weight:select(2,o)):add(-1,input)
+         temp:cmul(temp)
+         temp:cmul(module.diagCov:select(2,o))
+         temp:mul(gradOutput[o]/output[o])
+         gradDiagCov:select(2,o):add(temp)
+      end
+   end
+   mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'WeightedEuclidean accGradParameters gradWeight 1D err')
+   mytester:assertTensorEq(gradDiagCov, module.gradDiagCov, 0.000001, 'WeightedEuclidean accGradParameters gradDiagCov 1D err')
+
+   -- batch backward with scale 1/8 over 8 identical rows should accumulate
+   -- the same parameter gradients as the single-sample case
+   local input2 = input:view(1, -1):repeatTensor(8, 1)
+   local gradOutput2 = gradOutput:view(1, -1):repeatTensor(8, 1)
+   local output2 = module:forward(input2)
+   module:zeroGradParameters()
+   local gradInput2 = module:backward(input2, gradOutput2, 1/8)
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, 'WeightedEuclidean updateGradInput 2D err')
+
+   mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'WeightedEuclidean accGradParameters gradWeight 2D err')
+   mytester:assertTensorEq(gradDiagCov, module.gradDiagCov, 0.000001, 'WeightedEuclidean accGradParameters gradDiagCov 2D err')
+
+   input:zero()
+   module.fastBackward = false
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   -- NOTE(review): message says 'bias' but this checks diagCov
+   local err = jac.testJacobianParameters(module, input, module.diagCov, module.gradDiagCov)
+   mytester:assertlt(err,precision, 'error on bias ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- repeat the Jacobian checks after zeroing state and gradients
+   input:zero()
+   module:zeroGradParameters()
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err,precision, 'error on weight ')
+
+   -- NOTE(review): message says 'bias' but this checks diagCov
+   local err = jac.testJacobianParameters(module, input, module.diagCov, module.gradDiagCov)
+   mytester:assertlt(err,precision, 'error on bias ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Numerically verify a criterion's gradient: compares cri:backward() with a
+-- central-difference approximation of d(forward)/d(input), element-wise.
+local function criterionJacobianTest(cri, input, target)
+   local eps = 1e-6
+   local _ = cri:forward(input, target)
+   local analytic = cri:backward(input, target)
+   local numeric = torch.Tensor():resizeAs(analytic)
+   local xs = input:storage()
+   local ns = numeric:storage()
+   for i = 1, input:nElement() do
+      -- perturb element i up, then down, then restore it
+      xs[i] = xs[i] + eps
+      local fplus = cri:forward(input, target)
+      xs[i] = xs[i] - 2*eps
+      local fminus = cri:forward(input, target)
+      -- central difference: f'(x) ~ (f(x+h) - f(x-h)) / 2h
+      ns[i] = (fplus - fminus) / (2*eps)
+      xs[i] = xs[i] + eps
+   end
+
+   local err = (numeric - analytic):abs():max()
+   mytester:assertlt(err, precision, 'error in difference between central difference and :backward')
+end
+
+-- Table-input variant of criterionJacobianTest: input0 is a tensor that is
+-- split along dim 1 into a table of 1D views. The views alias input0's
+-- storage, so perturbing input0's storage perturbs the table entries too.
+local function criterionJacobianTest1DTable(cri, input0, target)
+   -- supposes input is a tensor, which is splitted in the first dimension
+   local input = input0:split(1,1)
+   for i=1,#input do
+      input[i] = input[i][1]
+   end
+   local eps = 1e-6
+   local _ = cri:forward(input, target)
+   local dfdx = cri:backward(input, target)
+   -- for each input perturbation, do central difference
+   local centraldiff_dfdx = torch.Tensor():resizeAs(input0)
+   local input_s = input0:storage()
+   local centraldiff_dfdx_s = centraldiff_dfdx:storage()
+   for i=1,input0:nElement() do
+      -- f(xi + h)
+      input_s[i] = input_s[i] + eps
+      local fx1 = cri:forward(input, target)
+      -- f(xi - h)
+      input_s[i] = input_s[i] - 2*eps
+      local fx2 = cri:forward(input, target)
+      -- f'(xi) = (f(xi + h) - f(xi - h)) / 2h
+      local cdfx = (fx1 - fx2) / (2*eps)
+      -- store f' in appropriate place
+      centraldiff_dfdx_s[i] = cdfx
+      -- reset input[i]
+      input_s[i] = input_s[i] + eps
+   end
+   -- re-split the numeric estimate the same way to compare entry by entry
+   local centraldiff_dfdx_t = centraldiff_dfdx:split(1,1)
+   for i=1,#centraldiff_dfdx_t do
+      centraldiff_dfdx_t[i] = centraldiff_dfdx_t[i][1]
+   end
+   for i=1,#centraldiff_dfdx_t do
+      -- compare centraldiff_dfdx with :backward()
+      local err = (centraldiff_dfdx_t[i] - dfdx[i]):abs():max()
+      mytester:assertlt(err, precision, 'error in difference between central difference and :backward')
+   end
+end
+
+function nntest.SmoothL1Criterion()
+   -- Gradient check for nn.SmoothL1Criterion via central differences.
+   local x = torch.rand(10)
+   local y = x:clone():add(torch.rand(10))
+   criterionJacobianTest(nn.SmoothL1Criterion(), x, y)
+end
+
+function nntest.MSECriterion()
+   -- Gradient check for nn.MSECriterion via central differences.
+   local x = torch.rand(10)
+   local y = x:clone():add(torch.rand(10))
+   criterionJacobianTest(nn.MSECriterion(), x, y)
+end
+
+function nntest.ClassSimplexCriterion()
+   -- Gradient check with a random class count and a random integer target.
+   local nClasses = torch.random(3,15)
+   local x = torch.rand(nClasses)
+   local y = torch.random(1,nClasses)
+   criterionJacobianTest(nn.ClassSimplexCriterion(nClasses), x, y)
+end
+
+
+function nntest.MarginCriterion()
+   -- Gradient check for nn.MarginCriterion via central differences.
+   local x = torch.rand(100)
+   local y = x:clone():add(torch.rand(100))
+   criterionJacobianTest(nn.MarginCriterion(), x, y)
+end
+
+function nntest.SoftMarginCriterion()
+   -- Gradient check for nn.SoftMarginCriterion via central differences.
+   local x = torch.rand(100)
+   local y = x:clone():add(torch.rand(100))
+   criterionJacobianTest(nn.SoftMarginCriterion(), x, y)
+end
+
+function nntest.MultiMarginCriterion()
+   -- Gradient checks for nn.MultiMarginCriterion: random p with margin 0.1,
+   -- default (p=1), p=2, and p=1 with per-class weights.
+   local input = torch.rand(100)
+   local target = math.random(1,100)
+   local cri = nn.MultiMarginCriterion(math.random(1,2), nil, 0.1)
+   criterionJacobianTest(cri, input, target)
+
+   local cri = nn.MultiMarginCriterion()
+   criterionJacobianTest(cri, input, target)
+
+   local cri = nn.MultiMarginCriterion(2)
+   criterionJacobianTest(cri, input, target)
+
+   local weights = torch.randn(100)
+   local cri = nn.MultiMarginCriterion(1, weights)
+   -- BUG FIX: the weighted criterion was constructed but never tested
+   criterionJacobianTest(cri, input, target)
+end
+
+-- Checks nn.MarginRankingCriterion: float/double type consistency of
+-- forward/backward, then batched gradient checks (table input of two rows)
+-- with sizeAverage on and off.
+function nntest.MarginRankingCriterion()
+   local input = {torch.rand(1), torch.rand(1)}
+   local mrc = nn.MarginRankingCriterion()
+   local output = mrc:forward(input, 1)
+   local gradInput = mrc:backward(input, 1)
+   -- cast to float
+   local input2 = {input[1]:float(), input[2]:float()}
+   local mrc2 = mrc:clone():float()
+   local output2 = mrc2:forward(input2, 1)
+   local gradInput2 = mrc2:backward(input2, 1)
+   mytester:assert(math.abs(output2 - output) < 0.00001, "MRC:type() forward error")
+   mytester:assertTensorEq(gradInput[1]:float(), gradInput2[1], 0.00001, "MRC:type() backward error 1")
+   mytester:assert(torch.type(gradInput2[1]) == 'torch.FloatTensor', "MRC:type() error 1")
+   mytester:assertTensorEq(gradInput[2]:float(), gradInput2[2], 0.00001, "MRC:type() backward error 2")
+   mytester:assert(torch.type(gradInput2[2]) == 'torch.FloatTensor', "MRC:type() error 2")
+
+   -- batch, sizeAverage true, jacobian
+   -- targets are random signs in {-1, +1}
+   local margin = math.random()*2-1
+   local batch_size = math.random(2,10)
+   local crit = nn.MarginRankingCriterion(margin)
+   crit.sizeAverage = true
+   local v = torch.rand(2,batch_size)
+   local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+   criterionJacobianTest1DTable(crit,v,t)
+
+   -- batch, sizeAverage false, jacobian
+   local margin = math.random()*2-1
+   local crit = nn.MarginRankingCriterion(margin)
+   crit.sizeAverage = false
+   local v = torch.rand(2,batch_size)
+   local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+   criterionJacobianTest1DTable(crit,v,t)
+
+end
+
+function nntest.MaskedSelect()
+   -- forward: output must equal input:maskedSelect(mask)
+   local inp = torch.randn(4, 5)
+   local msk = torch.ByteTensor(4, 5):bernoulli()
+   local mod = nn.MaskedSelect()
+   local out = mod:forward({inp, msk})
+   local fwdErr = out:dist(inp:maskedSelect(msk))
+   mytester:assertlt(fwdErr, 1e-15, torch.typename(mod) .. ' - forward err ')
+
+   -- backward: gradOutput is scattered back to the masked positions,
+   -- zeros elsewhere
+   local gradOut = torch.Tensor({20, 80})
+   inp = torch.Tensor({{10, 20}, {30, 40}})
+   local expectedGrad = torch.Tensor({{20, 0}, {0, 80}})
+   msk = torch.ByteTensor({{1, 0}, {0, 1}})
+   mod = nn.MaskedSelect()
+   mod:forward({inp, msk})
+   local gradIn = mod:backward({inp, msk}, gradOut)
+   mytester:assertTensorEq(expectedGrad, gradIn[1], 1e-15, torch.typename(mod) .. ' - backward err ')
+end
+
+-- Checks nn.ParallelCriterion: weighted combination of child criteria over a
+-- table of inputs/targets, type conversion, repeatTarget mode, and nesting.
+function nntest.ParallelCriterion()
+   local input = {torch.rand(2,10), torch.randn(2,10)}
+   local target = {torch.IntTensor{1,8}, torch.randn(2,10)}
+   local nll = nn.ClassNLLCriterion()
+   local mse = nn.MSECriterion()
+   local pc = nn.ParallelCriterion():add(nll, 0.5):add(mse)
+   local output = pc:forward(input, target)
+   -- reference: weighted sum of the child criteria applied element-wise
+   local output2 = nll:forward(input[1], target[1])/2 + mse:forward(input[2], target[2])
+   mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion forward error")
+   local gradInput2 = {nll:backward(input[1], target[1]):clone():div(2), mse:backward(input[2], target[2])}
+   local gradInput = pc:backward(input, target)
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion backward error 1")
+   mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.000001, "ParallelCriterion backward error 2")
+
+   -- test type
+   pc:float()
+   gradInput[1], gradInput[2] = gradInput[1]:clone(), gradInput[2]:clone()
+   local input3 = {input[1]:float(), input[2]:float()}
+   local target3 = {target[1]:float(), target[2]:float()}
+   local output3 = pc:forward(input3, target3)
+   local gradInput3 = pc:backward(input3, target3)
+   mytester:assert(math.abs(output3 - output) < 0.00001, "ParallelCriterion forward error type")
+   mytester:assertTensorEq(gradInput[1]:float(), gradInput3[1], 0.000001, "ParallelCriterion backward error 1 type")
+   mytester:assertTensorEq(gradInput[2]:float(), gradInput3[2], 0.000001, "ParallelCriterion backward error 2 type")
+
+   -- test repeatTarget (the same target is fed to every child criterion)
+   local input = {torch.rand(2,10), torch.randn(2,10)}
+   local target = torch.randn(2,10)
+   local mse = nn.MSECriterion()
+   local pc = nn.ParallelCriterion(true):add(mse, 0.5):add(mse:clone())
+   local output = pc:forward(input, target)
+   local output2 = mse:forward(input[1], target)/2 + mse:forward(input[2], target)
+   mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion repeatTarget forward error")
+   local gradInput = pc:backward(input, target)
+   local gradInput2 = {mse:backward(input[1], target):clone():div(2), mse:backward(input[2], target)}
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion repeatTarget backward error 1")
+   mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.000001, "ParallelCriterion repeatTarget backward error 2")
+
+   -- table input (nested ParallelCriterion)
+   local input = {torch.randn(2,10), {torch.rand(2,10), torch.randn(2,10)}}
+   local target = {torch.IntTensor{2,5}, {torch.IntTensor{1,8}, torch.randn(2,10)}}
+   local nll2 = nn.ClassNLLCriterion()
+   local nll = nn.ClassNLLCriterion()
+   local mse = nn.MSECriterion()
+   local pc = nn.ParallelCriterion():add(nll, 0.5):add(mse)
+   local pc2 = nn.ParallelCriterion():add(nll2, 0.4):add(pc)
+   local output = pc2:forward(input, target)
+   local output2 = nll2:forward(input[1], target[1])*0.4 + nll:forward(input[2][1], target[2][1])/2 + mse:forward(input[2][2], target[2][2])
+   mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion table forward error")
+   local gradInput2 = {
+       nll2:backward(input[1], target[1]):clone():mul(0.4),
+      -- BUG FIX: the nll reference gradient must use input[2][1] (the input
+      -- its forward above consumed), not input[2][2]
+      {nll:backward(input[2][1], target[2][1]):clone():div(2), mse:backward(input[2][2], target[2][2])}
+   }
+   local gradInput = pc2:backward(input, target)
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion table backward error 1")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2][1], 0.000001, "ParallelCriterion table backward error 2")
+   mytester:assertTensorEq(gradInput[2][2], gradInput2[2][2], 0.000001, "ParallelCriterion table backward error 3")
+end
+
+-- Checks nn.MultiCriterion: weighted sum of several criteria over the SAME
+-- input/target, type conversion, and nested table inputs via embedded
+-- ParallelCriterions.
+function nntest.MultiCriterion()
+   local input = torch.rand(2,10)
+   local target = torch.IntTensor{1,8}
+   local nll = nn.ClassNLLCriterion()
+   local nll2 = nn.CrossEntropyCriterion()
+   local mc = nn.MultiCriterion():add(nll, 0.5):add(nll2)
+   local output = mc:forward(input, target)
+   -- reference: 0.5 * nll + 1.0 * nll2
+   local output2 = nll:forward(input, target)/2 + nll2:forward(input, target)
+   mytester:assert(math.abs(output2 - output) < 0.00001, "MultiCriterion forward error")
+   local gradInput = mc:backward(input, target)
+   local gradInput2 = nll:backward(input, target):clone():div(2):add(nll2:backward(input, target))
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "MultiCriterion backward error ")
+
+   -- test type
+   mc:float()
+   gradInput = gradInput:clone()
+   local input3 = input:float()
+   local target3 = target:float()
+   local output3 = mc:forward(input3, target3)
+   local gradInput3 = mc:backward(input3, target3)
+   mytester:assert(math.abs(output3 - output) < 0.00001, "MultiCriterion forward error type")
+   mytester:assertTensorEq(gradInput:float(), gradInput3, 0.000001, "MultiCriterion backward error type")
+
+   -- test table input
+   mc:double()
+   local input = {torch.randn(2,10), {torch.randn(2,10), torch.randn(2,10)}}
+   local target = {torch.IntTensor{1,8}, {torch.IntTensor{5,6}, torch.IntTensor{4,3}}}
+   local pnllc = nn.ParallelCriterion():add(nll):add(nn.ParallelCriterion():add(nll:clone()):add(nll:clone()))
+   local pnllc2 = nn.ParallelCriterion():add(nll2):add(nn.ParallelCriterion():add(nll2:clone()):add(nll2:clone()))
+   local mc = nn.MultiCriterion():add(pnllc, 0.5):add(pnllc2)
+   local output = mc:forward(input, target)
+   local output2 = pnllc:forward(input, target)/2 + pnllc2:forward(input, target)
+   mytester:assert(math.abs(output2 - output) < 0.00001, "MultiCriterion forward table error")
+   local gradInput = mc:backward(input, target)
+   -- build the reference gradient: 0.5 * pnllc grad + pnllc2 grad, applied
+   -- in place on each entry of the nested table
+   local gradInput2 = pnllc:clone():backward(input, target)
+   local gradInput2b = pnllc2:backward(input, target)
+   gradInput2[1]:div(2):add(gradInput2b[1])
+   gradInput2[2][1]:div(2):add(gradInput2b[2][1])
+   gradInput2[2][2]:div(2):add(gradInput2b[2][2])
+   mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "MultiCriterion backward table 1 error ")
+   mytester:assertTensorEq(gradInput[2][1], gradInput2[2][1], 0.000001, "MultiCriterion backward table 2 error ")
+   mytester:assertTensorEq(gradInput[2][2], gradInput2[2][2], 0.000001, "MultiCriterion backward table 3 error ")
+end
+
+function nntest.WeightedMSECriterion()
+   -- Gradient check for the per-element weighted MSE criterion.
+   local x = torch.rand(10)
+   local y = x:clone():add(torch.rand(10))
+   criterionJacobianTest(nn.WeightedMSECriterion(torch.rand(10)), x, y)
+end
+
+-- Gradient checks for binary cross-entropy; inputs and targets are kept in
+-- (eps/2, 1-eps/2) to stay away from the log() singularities at 0 and 1.
+function nntest.BCECriterion()
+   local eps = 1e-2
+   local input = torch.rand(10)*(1-eps) + eps/2
+   local target = torch.rand(10)*(1-eps) + eps/2
+   local cri = nn.BCECriterion()
+   criterionJacobianTest(cri, input, target)
+   --with weights
+   local weights= torch.rand(10)*(1-eps) + eps/2
+   local cri = nn.BCECriterion(weights)
+   criterionJacobianTest(cri, input, target)
+   -- with weights + batch
+   -- NOTE(review): intentionally reuses the weighted criterion above; the
+   -- size-10 weights match the second dimension of the batch input
+   local bsz = 5
+   local input = torch.rand(bsz, 10)*(1-eps) + eps/2
+   local target = torch.rand(bsz, 10)*(1-eps) + eps/2
+   criterionJacobianTest(cri, input, target)
+end
+
+function nntest.DistKLDivCriterion()
+   -- Gradient check for KL-divergence criterion, both sizeAverage modes.
+   local x = torch.rand(10)
+   local y = x:clone():add(torch.rand(10))
+   criterionJacobianTest(nn.DistKLDivCriterion(true), x, y)   -- sizeAverage = true
+   criterionJacobianTest(nn.DistKLDivCriterion(false), x, y)  -- sizeAverage = false
+end
+
+function nntest.ClassNLLCriterion()
+   -- Gradient checks for nn.ClassNLLCriterion, unweighted and with
+   -- normalized per-class weights.
+   local numLabels = math.random(5,10)
+   local x = torch.rand(numLabels)
+   local y = math.random(1,numLabels)
+
+   -- default ClassNLLCriterion
+   criterionJacobianTest(nn.ClassNLLCriterion(), x, y)
+
+   -- ClassNLLCriterion with weights
+   local w = torch.rand(numLabels)
+   w:div(w:sum())
+   criterionJacobianTest(nn.ClassNLLCriterion(w), x, y)
+end
+
+-- Checks nn.SpatialClassNLLCriterion: gradient checks (plain and weighted),
+-- then agreement with the regular ClassNLLCriterion on the same data
+-- flattened to (batch*h*w, numLabels).
+function nntest.SpatialClassNLLCriterion()
+   local numLabels = math.random(5,10)
+   local h = math.random(5, 20)
+   local w = math.random(5, 20)
+   local batchSize = math.random(1, 4)
+   local input = torch.rand(batchSize, numLabels, h, w)
+   local target = torch.Tensor(batchSize, h, w)
+   target:apply(function() return math.random(1, numLabels) end)
+
+   -- default ClassNLLCriterion
+   local cri = nn.SpatialClassNLLCriterion()
+   criterionJacobianTest(cri, input, target)
+
+   -- ClassNLLCriterion with weights
+   local weights = torch.rand(numLabels)
+   cri = nn.SpatialClassNLLCriterion(weights)
+   criterionJacobianTest(cri, input, target)
+
+   -- check with ClassNLLCriterion
+   -- permute to (batch, h, w, labels) so each flattened row is one pixel
+   local spatial = nn.SpatialClassNLLCriterion(weights)
+   local regular = nn.ClassNLLCriterion(weights)
+   local spatial_out = spatial:forward(input, target)
+   local regular_out = regular:forward(input:permute(1, 3, 4, 2):contiguous():view(-1, numLabels),
+                                       target:view(-1))
+   mytester:eq(spatial_out, regular_out, 1e-6,
+         "spatial and regular criterions give different results")
+end
+
+function nntest.MultiLabelSoftMarginCriterion()
+    -- Gradient checks with binary {0,1} targets, single sample and batch.
+    local cri = nn.MultiLabelSoftMarginCriterion()
+
+    -- single sample
+    local n = math.random(5, 10)
+    criterionJacobianTest(cri, torch.randn(n), torch.round(torch.rand(n)))
+
+    -- batch
+    local n2 = math.random(5, 10)
+    local b = math.random(3, 7)
+    criterionJacobianTest(cri, torch.randn(b, n2), torch.round(torch.rand(b, n2)))
+
+end
+
+function nntest.CrossEntropyCriterion()
+   -- Gradient checks: single sample, batch, and weighted batch.
+   local numLabels = math.random(5, 10)
+   local x = torch.zeros(numLabels)
+   local y = torch.random(1, numLabels)
+   criterionJacobianTest(nn.CrossEntropyCriterion(), x, y)
+
+   -- batch
+   local numLabels2 = math.random(5,10)
+   local bsz = math.random(3, 7)
+   local xb = torch.zeros(bsz, numLabels2)
+   local yb = torch.Tensor(bsz):random(1, numLabels2)
+   criterionJacobianTest(nn.CrossEntropyCriterion(), xb, yb)
+
+   -- with normalized class weights
+   local w = torch.rand(numLabels2)
+   w:div(w:sum())
+   criterionJacobianTest(nn.CrossEntropyCriterion(w), xb, yb)
+end
+
+function nntest.LogSigmoid()
+   -- Jacobian and serialization (testIO) checks on a random-sized 3D input.
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local x = torch.Tensor(d1,d2,d3):zero()
+   local mod = nn.LogSigmoid()
+
+   mytester:assertlt(jac.testJacobian(mod,x), precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(mod,x)
+   mytester:asserteq(ferr, 0, torch.typename(mod) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(mod) .. ' - i/o backward err ')
+end
+
+function nntest.LogSoftmax()
+   -- Jacobian and serialization (testIO) checks on a random-sized 2D input.
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local x = torch.Tensor(d1,d2):zero()
+   local mod = nn.LogSoftMax()
+
+   mytester:assertlt(jac.testJacobian(mod,x), 1e-3, 'error on state ')
+
+   local ferr,berr = jac.testIO(mod,x)
+   mytester:asserteq(ferr, 0, torch.typename(mod) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(mod) .. ' - i/o backward err ')
+
+   -- backward must give the same result for a non-contiguous gradOutput
+   -- (a narrowed view of a wider tensor) as for its contiguous clone
+   local layer = nn.LogSoftMax()
+   layer:zeroGradParameters()
+   local input = torch.randn(4, 10)
+   local data = torch.randn(4, 20)
+   local gradOutput = data:narrow(2, 1, 10):fill(0)
+   local output = layer:forward(input)
+   local gradInput1 = layer:backward(input, gradOutput):clone()
+   output = layer:forward(input)
+   gradOutput = gradOutput:clone()
+   local gradInput2 = layer:backward(input, gradOutput):clone()
+
+   mytester:assertlt(gradInput1:add(-1, gradInput2):abs():max(),
+                     1e-10,
+                     torch.typename(layer)
+                        .. ' non-contiguous gradOutput check')
+end
+
+-- function nntest.TemporalLogSoftmax()
+--    local ini = math.random(10,20)
+--    local inj = math.random(10,20)
+--    local input = torch.Tensor(ini,inj):zero()
+--    local module = nn.TemporalLogSoftMax()
+
+--    local err = jac.testJacobian(module,input)
+--    mytester:assertlt(err,precision, 'error on state ')
+
+--    local ferr,berr = jac.testIO(module,input)
+--    mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+--    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+-- end
+
+-- nn.Max: Jacobian on a 1D input, exact forward values for the
+-- negative-dimension and batch modes, then Jacobian/serialization on a
+-- larger 2D input.
+function nntest.Max()
+   -- 1D
+   local ini = math.random(3,7)
+   local input = torch.Tensor(ini):zero()
+   local module = nn.Max(1)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- negative dimension (-1 = last dimension)
+   local module = nn.Max(-1)
+   local input = torch.Tensor({1, 2, 3})
+   local expected = torch.Tensor({3})
+   local output = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+   -- batch (second arg = number of non-batch input dims)
+   local module = nn.Max(1, 1)
+   local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected = torch.Tensor({3, 6})
+   local output = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   -- 2D input built from three random sizes (last two collapsed to one
+   -- dim; the original comment said "3D" but the tensor is 2D)
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj*ink):zero()
+   local module = nn.Max(1)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.Min: Jacobian on a 1D input, exact forward values for the
+-- negative-dimension and batch modes, then Jacobian/serialization on a
+-- larger 2D input.
+function nntest.Min()
+   -- 1D
+   local ini = math.random(3,7)
+   local input = torch.Tensor(ini):zero()
+   local module = nn.Min(1)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- negative dimension (-1 = last dimension)
+   local module = nn.Min(-1)
+   local input = torch.Tensor({1, 2, 3})
+   local expected = torch.Tensor({1})
+   local output = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+   -- batch (second arg = number of non-batch input dims)
+   local module = nn.Min(1, 1)
+   local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected = torch.Tensor({1, 4})
+   local output = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   -- 2D input built from three random sizes (last two collapsed to one
+   -- dim; the original comment said "3D" but the tensor is 2D)
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj*ink):zero()
+   local module = nn.Min(1)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.Mean: Jacobian on a 1D input, exact forward values for the
+-- negative-dimension and batch modes, then Jacobian/serialization on a
+-- 3D input reduced over a random dimension.
+function nntest.Mean()
+   -- 1D
+   local len = math.random(3,7)
+   local module = nn.Mean(1)
+   mytester:assertlt(jac.testJacobian(module, torch.Tensor(len):zero()),
+                     precision, 'error on state ')
+
+   -- negative dimension (-1 = last dimension)
+   module = nn.Mean(-1)
+   local out = module:forward(torch.Tensor({1, 2, 3}))
+   mytester:assertlt(torch.norm(out - torch.Tensor({2})), precision, 'error on forward ')
+
+   -- batch (second arg = number of non-batch input dims)
+   module = nn.Mean(1, 1)
+   out = module:forward(torch.Tensor({{1, 2, 3},{4, 5, 6}}))
+   mytester:assertlt(torch.norm(out - torch.Tensor({2, 5})), precision, 'error on forward ')
+
+   -- 3D, reducing over a randomly chosen dimension
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local input = torch.Tensor(d1, d2, d3):zero()
+   module = nn.Mean(torch.random(1,3))
+
+   mytester:assertlt(jac.testJacobian(module, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.Mul: Jacobian w.r.t. input and w.r.t. the scalar weight (both the
+-- accumulate-gradient and direct-update paths), all update rules, and
+-- i/o serialization.
+function nntest.Mul()
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local input = torch.Tensor(d1, d2, d3):zero()
+   local module = nn.Mul()
+
+   mytester:assertlt(jac.testJacobian(module, input), precision, 'error on state ')
+   mytester:assertlt(
+      jac.testJacobianParameters(module, input, module.weight, module.gradWeight),
+      precision, 'error on weight ')
+   mytester:assertlt(
+      jac.testJacobianUpdateParameters(module, input, module.weight),
+      precision, 'error on weight [direct update] ')
+
+   for t, err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format('error on weight [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian and serialization (i/o) checks for nn.Sigmoid on a
+-- randomly-sized 3D input.
+function nntest.Sigmoid()
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local input = torch.Tensor(d1, d2, d3):zero()
+   local module = nn.Sigmoid()
+
+   mytester:assertlt(jac.testJacobian(module, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian (looser exp-family precision) and serialization checks for
+-- nn.SoftMax on a randomly-sized 2D input.
+function nntest.Softmax()
+   local cols = math.random(3,5)
+   local rows = math.random(3,5)
+   local input = torch.Tensor(rows, cols):zero()
+   local module = nn.SoftMax()
+
+   mytester:assertlt(jac.testJacobian(module, input), expprecision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian (looser exp-family precision) and serialization checks for
+-- nn.SpatialSoftMax on a randomly-sized 4D (batch) input.
+function nntest.SpatialSoftMax()
+   local sz = {}
+   for i = 1, 4 do
+      sz[i] = math.random(3,5)
+   end
+   local input = torch.Tensor(sz[4], sz[3], sz[2], sz[1]):zero()
+   local module = nn.SpatialSoftMax()
+
+   mytester:assertlt(jac.testJacobian(module, input), expprecision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian (looser exp-family precision) and serialization checks for
+-- nn.SoftMin on a randomly-sized 2D input.
+function nntest.Softmin()
+   local cols = math.random(3,5)
+   local rows = math.random(3,5)
+   local input = torch.Tensor(rows, cols):zero()
+   local module = nn.SoftMin()
+
+   mytester:assertlt(jac.testJacobian(module, input), expprecision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian and serialization (i/o) checks for nn.SoftSign on a
+-- randomly-sized 2D input.
+function nntest.Softsign()
+   local cols = math.random(3,5)
+   local rows = math.random(3,5)
+   local input = torch.Tensor(rows, cols):zero()
+   local module = nn.SoftSign()
+
+   mytester:assertlt(jac.testJacobian(module, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian and serialization (i/o) checks for nn.SoftPlus on a
+-- randomly-sized 3D input.
+function nntest.SoftPlus()
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local input = torch.Tensor(d1, d2, d3):zero()
+   local module = nn.SoftPlus()
+
+   mytester:assertlt(jac.testJacobian(module, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.SpatialSubtractiveNormalization with a 2D averaging kernel:
+-- Jacobian and i/o checks, then a batch-consistency check that sample 2
+-- of a batch reproduces the single-sample forward output and backward
+-- gradInput.
+function nntest.SpatialSubtractiveNormalization_2dkernel()
+   local inputSize = math.random(6,9)
+   local kersize = 3
+   local nbfeatures = math.random(3,5)
+   local kernel = torch.Tensor(kersize,kersize):fill(1)
+   local module = nn.SpatialSubtractiveNormalization(nbfeatures,kernel)
+   local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+    -- test batch mode
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+   input2[2]:copy(input)
+
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialSubstractiveNormalization 2d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialSubstractiveNormalization 2d backward batch err")
+
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+end
+
+-- nn.SpatialSubtractiveNormalization with a 1D (separable) kernel:
+-- Jacobian and i/o checks, then a batch-consistency check that sample 2
+-- of a batch reproduces the single-sample forward output and backward
+-- gradInput.
+function nntest.SpatialSubtractiveNormalization_1dkernel()
+   local inputSize = math.random(6,9)
+   local kersize = 3
+   local nbfeatures = math.random(3,5)
+   local kernel = torch.Tensor(kersize):fill(1)
+   local module = nn.SpatialSubtractiveNormalization(nbfeatures,kernel)
+   local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+    -- test batch mode
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+   input2[2]:copy(input)
+
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialSubstractiveNormalization 1d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialSubstractiveNormalization 1d backward batch err")
+
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.SpatialDivisiveNormalization with a 2D kernel: Jacobian and i/o
+-- checks, then a batch-consistency check that sample 2 of a batch
+-- reproduces the single-sample forward output and backward gradInput.
+function nntest.SpatialDivisiveNormalization_2dkernel()
+   local inputSize = math.random(6,9)
+   local kersize = 3
+   local nbfeatures = math.random(3,5)
+   local kernel = torch.Tensor(kersize,kersize):fill(1)
+   local module = nn.SpatialDivisiveNormalization(nbfeatures,kernel)
+   local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- test batch mode
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+   input2[2]:copy(input)
+
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialDivisiveNormalization 2d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialDivisiveNormalization 2d backward batch err")
+
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.SpatialDivisiveNormalization with a 1D (separable) kernel: Jacobian
+-- and i/o checks, then a batch-consistency check that sample 2 of a
+-- batch reproduces the single-sample forward output and backward
+-- gradInput.
+function nntest.SpatialDivisiveNormalization_1dkernel()
+   local inputSize = math.random(6,9)
+   local kersize = 3
+   local nbfeatures = math.random(3,5)
+   local kernel = torch.Tensor(kersize):fill(1)
+   local module = nn.SpatialDivisiveNormalization(nbfeatures,kernel)
+   local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+    -- test batch mode
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+   input2[2]:copy(input)
+
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialDivisiveNormalization 1d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialDivisiveNormalization 1d backward batch err")
+
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.SpatialContrastiveNormalization: Jacobian and i/o checks, then a
+-- batch + float type-cast consistency check that sample 2 of a batch
+-- reproduces the single-sample forward output and backward gradInput.
+function nntest.SpatialContrastiveNormalization()
+   local inputSize = math.random(6,9)
+   local kersize = 3
+   local nbfeatures = math.random(3,5)
+   local kernel = torch.Tensor(kersize,kersize):fill(1)
+   local module = nn.SpatialContrastiveNormalization(nbfeatures,kernel)
+   local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- test batch mode and type
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2):float()
+   input2[2]:copy(input)
+
+   module:float() -- type-cast
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output:float(), 0.000002, "SpatialContrastiveNormalization 2d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput:float(), 0.000002, "SpatialContrastiveNormalization 2d backward batch err")
+
+   module:double()
+   input2 = input2:double()
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- nn.SpatialCrossMapLRN with random size/alpha/beta/k: Jacobian and i/o
+-- checks, then a batch + float type-cast consistency check that sample 2
+-- of a batch reproduces the single-sample forward output and backward
+-- gradInput.
+function nntest.SpatialCrossMapLRN()
+   local inputSize = math.random(6,9)
+   local size = math.random(1,3)*2+1
+   local nbfeatures = math.random(3,8)
+   local alpha = math.random(1,100)/100
+   local beta  = math.random(0,100)/100
+   local k = math.random(1,3)
+   local module = nn.SpatialCrossMapLRN(size, alpha, beta, k)
+   local input = torch.rand(nbfeatures,inputSize,inputSize)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- test batch mode and type
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():uniform(0,1)
+   local gradInput = module:backward(input, gradOutput):clone()
+   local batchSize = 4
+   local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize):float()
+   input2[2]:copy(input)
+
+   module:float() -- type-cast
+   local output2 = module:forward(input2)
+   local gradOutput2 = output2:clone():uniform(0,1)
+   gradOutput2[2]:copy(gradOutput)
+   local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output:float(), 0.000001, "SpatialCrossMapLRN 2d forward batch err")
+   -- BUG FIX: compare the backward result (gradInput), not gradOutput,
+   -- which was copied one line above and would compare equal trivially.
+   mytester:assertTensorEq(gradInput2[2], gradInput:float(), 0.000001, "SpatialCrossMapLRN 2d backward batch err")
+
+   module:double()
+   input2 = input2:double()
+   local err = jac.testJacobian(module,input2)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input2)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+
+-- Exhaustive gradient checks for nn.SpatialConvolution: Jacobian w.r.t.
+-- input, weight and bias (accumulate and direct-update paths), diagonal
+-- Hessian, all update rules, and i/o serialization -- first on a single
+-- sample, then on a mini-batch. The whole suite runs three times: with
+-- bias, without bias (noBias), and with a freshly restored bias.
+function nntest.SpatialConvolution()
+   local from = math.random(1,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local si = math.random(1,4)
+   local sj = math.random(1,4)
+   local outi = math.random(5,7)
+   local outj = math.random(5,7)
+   -- input size chosen so the convolution produces exactly outi x outj
+   local ini = (outi-1)*si+ki
+   local inj = (outj-1)*sj+kj
+   local module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- bias checks are conditional so the same suite works after noBias()
+   local function jacTests(module)
+      -- stochastic
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error on state ')
+
+      local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+      mytester:assertlt(err , precision, 'error on weight ')
+
+      if module.bias then
+         local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+         mytester:assertlt(err , precision, 'error on bias ')
+      end
+
+      local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+      mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+      if module.bias then
+         local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+         mytester:assertlt(err , precision, 'error on bias [direct update] ')
+      end
+
+      nn.hessian.enable()
+
+      local err = jac.testDiagHessianInput(module, input)
+      mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+      local err = jac.testDiagHessianWeight(module, input)
+      mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+      if module.bias then
+         local err = jac.testDiagHessianBias(module, input)
+         mytester:assertlt(err , precision, 'error on diag HessianBias')
+      end
+
+      for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+         mytester:assertlt(err, precision, string.format(
+                              'error on weight [%s]', t))
+      end
+
+      if module.bias then
+         for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+            mytester:assertlt(err, precision, string.format(
+                                 'error on bias [%s]', t))
+         end
+      end
+
+      -- batch
+
+      --verbose = true
+      local batch = math.random(2,5)
+      outi = math.random(4,8)
+      outj = math.random(4,8)
+      ini = (outi-1)*si+ki
+      inj = (outj-1)*sj+kj
+      -- NOTE: the upvalues module/input are rebound here, so the batch
+      -- half of jacTests runs on a fresh module with its default bias
+      module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+      input = torch.Tensor(batch,from,inj,ini):zero()
+
+      --    print(from, to, ki, kj, si, sj, batch, ini, inj)
+      --    print(module.weight:size())
+      --    print(module.gradWeight:size())
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'batch error on state ')
+
+      local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+      mytester:assertlt(err , precision, 'batch error on weight ')
+
+      if module.bias then
+         local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+         mytester:assertlt(err , precision, 'batch error on bias ')
+      end
+
+      local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+      mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+      if module.bias then
+         local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+         mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+      end
+
+      local err = jac.testDiagHessianInput(module, input)
+      mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+      local err = jac.testDiagHessianWeight(module, input)
+      mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+      if module.bias then
+         local err = jac.testDiagHessianBias(module, input)
+         mytester:assertlt(err , precision, 'error on diag HessianBias')
+      end
+
+      for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+         mytester:assertlt(err, precision, string.format(
+                              'error on weight [%s]', t))
+      end
+
+      if module.bias then
+         for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+            mytester:assertlt(err, precision, string.format(
+                                 'batch error on bias [%s]', t))
+         end
+      end
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+   end
+
+   jacTests(module)
+   module:noBias()
+   jacTests(module)
+   -- restore a fresh bias and re-run the whole suite
+   module.bias = torch.Tensor(module.nOutputPlane):zero()
+   module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+   module:reset()
+   jacTests(module)
+end
+
+-- Gradient checks for nn.SpatialConvolutionMM with random stride and
+-- zero-padding: Jacobian w.r.t. input/weight/bias (accumulate and
+-- direct-update paths), all update rules, i/o serialization on a
+-- mini-batch, and a non-contiguous-input consistency check.
+function nntest.SpatialConvolutionMM()
+   local from = math.random(2,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local di =  math.random(1,4)
+   local dj =  math.random(1,4)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   -- input size chosen so that, with padding, output is exactly outi x outj
+   local ini = (outi-1)*di+ki-padW*2
+   local inj = (outj-1)*dj+kj-padH*2
+   local module = nn.SpatialConvolutionMM(from, to, ki, kj, di, dj, padW, padH)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- stochastic
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch
+
+   --verbose = true
+   local batch = math.random(2,5)
+
+   module = nn.SpatialConvolutionMM(from, to, ki, kj, di, dj, padW, padH)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- non-contiguous: forward/backward must agree between a transposed
+   -- (non-contiguous) input and its contiguous copy; the forward output
+   -- is reused as gradOutput for the backward comparison
+   local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input):clone()
+   local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+   local gradInput = module:backward(input, output):clone()
+   local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+end
+
+-- Gradient checks for nn.SpatialConvolutionLocal (unshared weights):
+-- Jacobian w.r.t. input/weight/bias, diagonal Hessian, all update rules,
+-- i/o serialization on a mini-batch, and a cross-check against
+-- nn.SpatialConvolution with the weights replicated per output location.
+function nntest.SpatialConvolutionLocal()
+   local from = math.random(1,4)
+   local to = math.random(1,4)
+   local ki = math.random(1,3)
+   local kj = math.random(1,3)
+   local si = math.random(1,3)
+   local sj = math.random(1,3)
+   local outi = math.random(5,6)
+   local outj = math.random(5,6)
+   -- input size chosen so the convolution produces exactly outi x outj
+   local ini = (outi-1)*si+ki
+   local inj = (outj-1)*sj+kj
+   local module = nn.SpatialConvolutionLocal(from, to, ini, inj, ki, kj, si, sj)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- stochastic
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   nn.hessian.enable()
+
+   local err = jac.testDiagHessianInput(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+   local err = jac.testDiagHessianWeight(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+   local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diag HessianBias')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch
+
+   --verbose = true
+   local batch = math.random(2,5)
+   outi = math.random(4,6)
+   outj = math.random(4,6)
+   ini = (outi-1)*si+ki
+   inj = (outj-1)*sj+kj
+   module = nn.SpatialConvolutionLocal(from, to, ini, inj, ki, kj, si, sj)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+--    print(from, to, ki, kj, si, sj, batch, ini, inj)
+--    print(module.weight:size())
+--    print(module.gradWeight:size())
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   local err = jac.testDiagHessianInput(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+   local err = jac.testDiagHessianWeight(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+   local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diag HessianBias')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- check against nn.SpatialConvolution
+   -- replicate the shared conv's bias/weight at every output location;
+   -- NOTE(review): the repeat counts (1, outi, outj) must match the
+   -- local module's bias/weight layout when outi ~= outj -- confirm
+   -- against SpatialConvolutionLocal's parameter shapes
+   torch.repeatTensor(module.bias, conv.bias:view(to, 1, 1), 1, outi, outj)
+   torch.repeatTensor(module.weight, conv.weight:view(1, 1, from, to, ki, kj), outi, outj, 1, 1, 1, 1)
+   local input = torch.rand(batch, from, inj, ini)
+   local output = module:forward(input)
+   local outputConv = conv:forward(input)
+   local err = torch.dist(output, outputConv)
+   mytester:assertlt(err, precision, 'error checking against nn.SpatialConvolution')
+
+end
+
+-- Test nn.SpatialFullConvolution ("transposed" / fractionally-strided
+-- convolution). Verifies Jacobians w.r.t. input, weight and bias; direct
+-- and accumulated parameter updates; batch mode with an exact output-size
+-- check; i/o serialization; and that non-contiguous inputs produce the
+-- same forward/backward results as their contiguous copies.
+function nntest.SpatialFullConvolution()
+   local from = math.random(2,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local di =  math.random(1,4)
+   local dj =  math.random(1,4)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   -- adjW/adjH absorb the stride remainder so forward() produces exactly
+   -- outi x outj for the input size computed below
+   local adjW = (outi + padW*2 - ki) % di
+   local adjH = (outj + padH*2 - kj) % dj
+   local ini = math.floor((outi + padW*2 - ki)/di + 1)
+   local inj = math.floor((outj + padH*2 - kj)/dj + 1)
+   local module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- stochastic
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch
+
+   --verbose = true
+   local batch = math.random(2,5)
+
+   module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+   -- Check that the required output size matches the actual output size
+   -- (output layout is batch x to x height x width)
+   local output = module:forward(input)
+   mytester:asserteq(output:size(3), outj, 'output height error')
+   mytester:asserteq(output:size(4), outi, 'output width error')
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   -- serialization round-trip must preserve forward and backward behavior
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- non-contiguous: a transposed view must match its contiguous copy
+   local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input)
+   local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+   local gradInput = module:backward(input, output)
+   local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+end
+
+-- Test nn.SpatialFullConvolution's dual-input mode, where the second
+-- element of a {input, target} pair determines the output size instead of
+-- explicit adjW/adjH. The result must match a reference module that was
+-- constructed with the equivalent adjW/adjH values.
+function nntest.SpatialFullConvolutionDualInput()
+   local from = math.random(2,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local di =  math.random(1,4)
+   local dj =  math.random(1,4)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   local ini = math.floor((outi + padW*2 - ki)/di + 1)
+   local inj = math.floor((outj + padH*2 - kj)/dj + 1)
+   local adjW = (outi + 2 * padW - ki) % di
+   local adjH = (outj + 2 * padH - kj) % dj
+   -- only the size of targetTensor matters here; its values are never read
+   -- by the assertions below (it is zero-filled)
+   local targetTensor = torch.Tensor(outj, outi):zero()
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   local module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH)
+   local moduleRef = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+   moduleRef.weight:copy(module.weight)
+   moduleRef.bias:copy(module.bias)
+
+   -- Check that the required output size matches the actual output size
+   -- when using the dual input mode
+   local output = module:forward({input, targetTensor})
+   mytester:asserteq(output:size(2), outj, 'output height error')
+   mytester:asserteq(output:size(3), outi, 'output width error')
+
+   -- Check that backward and forward match the reference module
+   local outputRef = moduleRef:forward(input)
+   mytester:asserteq(0, (output-outputRef):abs():max(), torch.typename(module) .. ' - output err ')
+   local gradOutput = outputRef:clone():uniform()
+   local gradInputRef = moduleRef:backward(input, gradOutput)
+   local gradInput = module:backward({input, targetTensor}, gradOutput)
+   mytester:asserteq(0, (gradInput[1]-gradInputRef):abs():max(), torch.typename(module) .. ' - gradInput[1] err ')
+
+   -- Check that gradInput[2] is the singleton tensor {0}
+   -- (no gradient flows into the size-template input)
+   mytester:asserteq(gradInput[2]:storage():size(), 1, torch.typename(module) .. ' - gradInput[2] size err ')
+   mytester:asserteq(gradInput[2]:storage()[1], 0, torch.typename(module) .. ' - gradInput[2] value err ')
+end
+
+-- Test nn.SpatialDilatedConvolution.
+-- Verifies Jacobians w.r.t. input, weight and bias; direct and
+-- accumulated parameter updates; batch mode with an exact output-size
+-- check; i/o serialization; and that non-contiguous inputs give the
+-- same forward/backward results as their contiguous copies.
+function nntest.SpatialDilatedConvolution()
+   local from = math.random(1,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local di =  math.random(1,4)
+   local dj =  math.random(1,4)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   -- Dilation must be >= 1 (1 is an ordinary convolution); the previous
+   -- range [0,10] could draw 0, which collapses the effective kernel to a
+   -- single tap and is not a valid module configuration.
+   local dilationW = math.random(1,10)
+   local dilationH = math.random(1,10)
+   -- input extent that makes the module produce exactly outi x outj
+   local ini = (outi - 1) * di - 2 * padW + dilationW * (ki-1) + 1
+   local inj = (outj - 1) * dj - 2 * padH + dilationH * (kj-1) + 1
+
+   local module = nn.SpatialDilatedConvolution(from, to, ki, kj, di, dj, padW, padH, dilationW, dilationH)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- stochastic
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch
+
+   local batch = math.random(2,5)
+
+   module = nn.SpatialDilatedConvolution(from, to, ki, kj, di, dj, padW, padH, dilationW, dilationH)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+   -- Check that the required output size matches the actual output size
+   local output = module:forward(input)
+   mytester:asserteq(output:size(3), outj, 'output height error')
+   mytester:asserteq(output:size(4), outi, 'output width error')
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   -- both batch-mode messages now carry the 'batch' prefix, matching the
+   -- batch loops of the other tests in this file
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- non-contiguous: a transposed view must match its contiguous copy
+   local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input)
+   local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+   local gradInput = module:backward(input, output)
+   local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+end
+
+-- Test nn.SpatialConvolutionMap with a random connection table of fan-in
+-- `fanin`. Verifies Jacobians, diagonal-Hessian estimates (via
+-- nn.hessian), all update rules, i/o serialization, and batch mode.
+function nntest.SpatialConvolutionMap()
+   local from = math.random(1,5)
+   local fanin = math.random(1, from)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local si = math.random(1,3)
+   local sj = math.random(1,3)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   -- input size chosen so the convolution tiles it exactly
+   local ini = (outi-1)*si+ki
+   local inj = (outj-1)*sj+kj
+
+   local module = nn.SpatialConvolutionMap(nn.tables.random(from, to, fanin), ki, kj, si, sj)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   -- patch modules with diag-Hessian support before the Hessian checks
+   nn.hessian.enable()
+
+   local err = jac.testDiagHessianInput(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+   local err = jac.testDiagHessianWeight(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+   local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diag HessianBias')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+
+
+    -- batch
+
+   --verbose = true
+   local batch = math.random(2,6)
+   module = nn.SpatialConvolutionMap(nn.tables.random(from, to, fanin), ki, kj, si, sj)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   -- NOTE(review): the three diagHessian messages below lack the 'batch'
+   -- prefix used by the surrounding batch-mode assertions — likely an
+   -- upstream copy-paste slip; messages left unchanged here.
+   local err = jac.testDiagHessianInput(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+   local err = jac.testDiagHessianWeight(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+   local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diag HessianBias')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Test nn.SpatialFullConvolutionMap with a random connection table.
+-- Verifies Jacobians, direct parameter updates, diagonal-Hessian
+-- estimates (via nn.hessian), all update rules, and i/o serialization.
+-- Single-sample mode only (no batch section).
+function nntest.SpatialFullConvolutionMap()
+   local from = math.random(2,4)
+   local to = math.random(2,5)
+   local fanin = math.random(1, from)
+   local tt = nn.tables.random(from, to, fanin)
+   local ki = math.random(2,5)
+   local kj = math.random(2,5)
+   local si = math.random(1,3)
+   local sj = math.random(1,3)
+   local ini = math.random(5,7)
+   local inj = math.random(5,7)
+   local module = nn.SpatialFullConvolutionMap(tt, ki, kj, si, sj)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   -- stochastic
+      local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   -- patch modules with diag-Hessian support before the Hessian checks
+   nn.hessian.enable()
+
+   local err = jac.testDiagHessianInput(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+   local err = jac.testDiagHessianWeight(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+   local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diag HessianBias')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Compare nn.SpatialFullConvolutionMap (with a full connection table)
+-- against nn.SpatialFullConvolution: with identical parameters the two
+-- implementations must agree on output, gradInput, gradWeight and
+-- gradBias (up to `precision`).
+function nntest.SpatialFullConvolutionCompare()
+    local from = math.random(2,4)
+    local to = math.random(2,5)
+    local tt = nn.tables.full(from, to)
+    local ki = math.random(2,5)
+    local kj = math.random(2,5)
+    local si = math.random(1,3)
+    local sj = math.random(1,3)
+    local ini = math.random(7,8)
+    local inj = math.random(7,8)
+    local module1 = nn.SpatialFullConvolutionMap(tt, ki, kj, si, sj)
+    local module2 = nn.SpatialFullConvolution(from, to, ki, kj, si, sj)
+    local input = torch.rand(from, inj, ini)
+    -- the bias copy is loop-invariant: do it once instead of once per
+    -- connection-table entry as before
+    module1.bias:copy(module2.bias)
+    for k=1,tt:size(1) do
+       module1.weight[k]:copy(module2.weight[tt[k][1]][tt[k][2]])
+    end
+
+    local o1 = module1:updateOutput(input)
+    local o2 = module2:updateOutput(input)
+    mytester:assertlt(o1:dist(o2), precision, 'error on output')
+
+    -- identical random gradOutput for both modules
+    local go1 = torch.rand(o1:size())
+    local go2 = go1:clone()
+
+    local gi1= module1:updateGradInput(input,go1)
+    local gi2 = module2:updateGradInput(input,go2)
+    mytester:assertlt(gi1:dist(gi2), precision, 'error on gradInput')
+
+    module1:zeroGradParameters()
+    module2:zeroGradParameters()
+
+    module1:accGradParameters(input,go1)
+    module2:accGradParameters(input,go2)
+    -- each per-connection gradWeight slice must match the dense module's
+    -- corresponding (output, input) plane pair
+    for k=1,tt:size(1) do
+      mytester:assertlt(module1.gradWeight[k]:dist(module2.gradWeight[tt[k][1]][tt[k][2]]),precision,'error on gradWeight ' .. k)
+    end
+    mytester:assertlt(module1.gradBias:dist(module2.gradBias),precision,'error on gradBias ')
+end
+
+-- Runs `smod` on a single sample `sin` and a clone of it on the same
+-- sample wrapped in a batch of one, then asserts that the outputs, the
+-- gradInputs, and every parameter field named in `plist` agree between
+-- the two runs (to 1e-8).
+local function batchcompare(smod, sin, plist)
+   -- build the batched size: a leading singleton dimension followed by
+   -- the single-sample dimensions
+   local batchedSize = torch.LongStorage(sin:dim()+1)
+   batchedSize[1] = 1
+   for d=1,sin:dim() do batchedSize[d+1] = sin:size()[d] end
+   local batchedInput = torch.Tensor(batchedSize):copy(sin)
+   local batchedMod = smod:clone()
+
+   local singleOut = smod:forward(sin):clone()
+   local batchedOut = batchedMod:forward(batchedInput):clone()
+
+   -- same random gradOutput for both passes
+   local singleGradOut = torch.randn(singleOut:size())
+   local batchedGradOut = torch.Tensor(batchedOut:size())
+   batchedGradOut:copy(singleGradOut)
+
+   local singleGradIn = smod:backward(sin, singleGradOut)
+   local batchedGradIn = batchedMod:backward(batchedInput, batchedGradOut)
+
+   smod:accGradParameters(sin, singleGradOut, 1)
+   batchedMod:accGradParameters(batchedInput, batchedGradOut, 1)
+
+   mytester:assertTensorEq(singleOut, batchedOut:select(1,1), 1e-8, 'batchcompare error on output')
+   mytester:assertTensorEq(singleGradIn, batchedGradIn:select(1,1), 1e-8, 'batchcompare error on gradInput')
+
+   for _,field in pairs(plist) do
+      mytester:assertTensorEq(smod[field], batchedMod[field], 1e-8, 'batchcompare error on ' .. field)
+   end
+end
+
+-- Single-sample vs batch-of-one consistency check for
+-- nn.SpatialConvolution, via batchcompare.
+function nntest.SpatialConvolutionBatchCompare()
+   local nInputPlane = math.random(1,5)
+   local nOutputPlane = math.random(1,5)
+   local kW = math.random(1,5)
+   local kH = math.random(1,5)
+   local dW = math.random(1,4)
+   local dH = math.random(1,4)
+   local outW = math.random(5,9)
+   local outH = math.random(5,9)
+   -- input size that the kernel/stride tiles exactly
+   local inW = (outW-1)*dW+kW
+   local inH = (outH-1)*dH+kH
+
+   local module = nn.SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, dW, dH)
+   module:zeroGradParameters()
+   local input = torch.randn(nInputPlane, inH, inW)
+
+   batchcompare(module, input, {'weight','bias','gradWeight','gradBias'})
+end
+
+-- Single-sample vs batch-of-one consistency check for
+-- nn.SpatialFullConvolution, via batchcompare.
+function nntest.SpatialFullConvolutionBatchCompare()
+   local nInputPlane = math.random(1,5)
+   local nOutputPlane = math.random(1,5)
+   local kW = math.random(1,5)
+   local kH = math.random(1,5)
+   local dW = math.random(1,4)
+   local dH = math.random(1,4)
+   local inW = math.random(5,9)
+   local inH = math.random(5,9)
+
+   local module = nn.SpatialFullConvolution(nInputPlane, nOutputPlane, kW, kH, dW, dH)
+   module:zeroGradParameters()
+   local input = torch.randn(nInputPlane, inH, inW)
+
+   batchcompare(module, input, {'weight','bias','gradWeight','gradBias'})
+end
+
+
+
+-- Single-sample vs batch-of-one consistency check for
+-- nn.SpatialSubSampling, via batchcompare.
+function nntest.SpatialSubSamplingBatchCompare()
+   local nInputPlane = math.random(1,6)
+   local kW = math.random(1,5)
+   local kH = math.random(1,5)
+   local dW = math.random(1,4)
+   local dH = math.random(1,4)
+   local outW = math.random(6,10)
+   local outH = math.random(6,10)
+   -- input size that the kernel/stride tiles exactly
+   local inW = (outW-1)*dW+kW
+   local inH = (outH-1)*dH+kH
+   local module = nn.SpatialSubSampling(nInputPlane, kW, kH, dW, dH)
+   module:zeroGradParameters()
+   local input = torch.randn(nInputPlane, inH, inW)
+
+   batchcompare(module, input, {'weight','bias','gradWeight','gradBias'})
+end
+
+-- Test nn.SpatialSubSampling.
+-- Verifies Jacobians w.r.t. input, weight and bias; direct and
+-- accumulated parameter updates; then repeats the checks in batch mode
+-- (with fresh random output sizes) and checks i/o serialization.
+function nntest.SpatialSubSampling()
+   local from = math.random(1,6)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local si = math.random(1,4)
+   local sj = math.random(1,4)
+   local outi = math.random(6,10)
+   local outj = math.random(6,10)
+   -- input size chosen so the kernel/stride tiles it exactly
+   local ini = (outi-1)*si+ki
+   local inj = (outj-1)*sj+kj
+   local module = nn.SpatialSubSampling(from, ki, kj, si, sj)
+   local input = torch.Tensor(from, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch mode: smaller output sizes, same kernel/stride
+   local batch = math.random(2,5)
+   outi = math.random(4,8)
+   outj = math.random(4,8)
+   ini = (outi-1)*si+ki
+   inj = (outj-1)*sj+kj
+   module = nn.SpatialSubSampling(from, ki, kj, si, sj)
+   input = torch.Tensor(batch,from,inj,ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Test nn.SpatialMaxPooling in both ceil and floor rounding modes.
+-- Checks the input Jacobian and i/o serialization for single-sample and
+-- batched inputs.
+function nntest.SpatialMaxPooling()
+   for _,ceil_mode in pairs({true,false}) do
+      local from = math.random(1,5)
+      local ki = math.random(1,4)
+      local kj = math.random(1,4)
+      local si = math.random(1,3)
+      local sj = math.random(1,3)
+      local outi = math.random(4,5)
+      local outj = math.random(4,5)
+      -- padding is capped at half the kernel so the module accepts it
+      local padW = math.min(math.random(0,1),math.floor(ki/2))
+      local padH =  math.min(math.random(0,1),math.floor(kj/2))
+      local ini = (outi-1)*si+ki-2*padW
+      local inj = (outj-1)*sj+kj-2*padH
+
+      local ceil_string = ceil_mode and 'ceil' or 'floor'
+      local module = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+      if ceil_mode then module:ceil() else module:floor() end
+      local input = torch.rand(from,inj,ini)
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+      -- batch
+      local nbatch = math.random(2,5)
+      input = torch.rand(nbatch,from,inj,ini)
+      module = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+      if ceil_mode then module:ceil() else module:floor() end
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+  end
+end
+
+-- Test nn.SpatialMaxUnpooling, in both ceil and floor modes of the
+-- paired pooling module. Unpooling the output of the pooling module must
+-- restore the original spatial size; also checks the input Jacobian and
+-- i/o serialization, for single-sample and batched inputs.
+function nntest.SpatialMaxUnpooling()
+   for _,ceil_mode in pairs({true,false}) do
+      local from = math.random(1,5)
+      local ki = math.random(2,4)
+      local kj = math.random(2,4)
+      -- non-overlapping pooling: stride equals kernel size
+      local si, sj = ki, kj
+      local outi = math.random(4,5)
+      local outj = math.random(4,5)
+      local padW = math.min(math.random(0,1),math.floor(ki/2))
+      local padH = math.min(math.random(0,1),math.floor(kj/2))
+      local ini = (outi-1)*si+ki-2*padW
+      local inj = (outj-1)*sj+kj-2*padH
+
+      local ceil_string = ceil_mode and 'ceil' or 'floor'
+      local poolingModule = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+      if ceil_mode then poolingModule:ceil() else poolingModule:floor() end
+      -- the unpooling module reads the indices saved by poolingModule
+      local module = nn.SpatialMaxUnpooling(poolingModule)
+
+      local original = torch.rand(from,inj,ini)
+      local input = poolingModule:forward(original)
+      local output = module:forward(input)
+
+      mytester:assert(output:isSameSizeAs(original),'SpatialMaxUnpooling output size err')
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+      -- batch
+      local nbatch = math.random(2,5)
+      original = torch.rand(nbatch,from,inj,ini)
+      input = poolingModule:forward(original)
+      output = module:forward(input)
+
+      mytester:assert(output:isSameSizeAs(original),'SpatialMaxUnpooling batch output size err')
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+  end
+end
+
+-- Checks the Jacobian of nn.SpatialFractionalMaxPooling with a fixed
+-- output size, on either a 3D (single sample) or 4D (batched) input.
+function nntest.SpatialFractionalMaxPooling()
+    local nbatch = math.random(1, 3)
+    local nplane = math.random(1, 3)
+    local owidth = math.random(1, 7)
+    local oheight = math.random(1, 7)
+    local kw = math.random(2, 4)
+    local kh = math.random(2, 4)
+
+    -- input must be at least output-size + pool-size in each dimension
+    local iwidth = math.random(owidth + kw, owidth + kw + 6)
+    local iheight = math.random(oheight + kh, oheight + kh + 6)
+
+    -- freeze the pooling regions so they are not regenerated on every
+    -- forward(), which testJacobian requires for determinism
+    local module = nn.SpatialFractionalMaxPooling(kw, kh, owidth, oheight):fixPoolingRegions()
+
+    local input
+    if nbatch == 1 then
+        input = torch.Tensor(nplane, iheight, iwidth):zero()
+    else
+        input = torch.Tensor(nbatch, nplane, iheight, iwidth):zero()
+    end
+
+    local err = nn.Jacobian.testJacobian(module, input)
+    mytester:assertlt(err, precision, 'error on state')
+end
+
+-- Test nn.SpatialFractionalMaxPooling constructed from reduction ratios
+-- rather than explicit output sizes: for two different random input
+-- sizes, the output size must equal floor(ratio * inputSize), and the
+-- Jacobian must check out.
+function nntest.SpatialFractionalMaxPooling_Ratio()
+    -- Fix a reduction ratio, and test with two different input sizes
+    local reductionRatioW = torch.uniform(0.4, 0.74)
+    local reductionRatioH = torch.uniform(0.4, 0.74)
+
+    for tries = 1, 2 do
+        local batch = math.random(1, 3)
+        local plane = math.random(1, 3)
+        local poolSizeW = math.random(2, 3)
+        local poolSizeH = math.random(2, 3)
+
+        local minInW = math.random(5, 8) + poolSizeW
+        local minInH = math.random(5, 8) + poolSizeH
+
+        local inW = math.random(minInW, minInW + 6)
+        local inH = math.random(minInH, minInH + 6)
+
+        -- fix the pooling regions so they aren't regenerated with every
+        -- forward(), so testJacobian can work properly
+        local module =
+            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH,
+                                           reductionRatioW, reductionRatioH)
+            :fixPoolingRegions()
+        local input = nil
+        if batch == 1 then
+            input = torch.Tensor(plane, inH, inW):zero()
+        else
+            input = torch.Tensor(batch, plane, inH, inW):zero()
+        end
+
+        -- Make sure that the output size is based on our ratio
+        local output = module:updateOutput(input)
+        if batch == 1 then
+            mytester:asserteq(output:size(3), math.floor(reductionRatioW * inW))
+            mytester:asserteq(output:size(2), math.floor(reductionRatioH * inH))
+        else
+            mytester:asserteq(output:size(4), math.floor(reductionRatioW * inW))
+            mytester:asserteq(output:size(3), math.floor(reductionRatioH * inH))
+        end
+
+        local err = nn.Jacobian.testJacobian(module, input)
+        mytester:assertlt(err, precision, 'error on state')
+    end
+end
+
+-- Tests nn.SpatialAveragePooling over all four combinations of
+-- count_include_pad x ceil/floor rounding: Jacobian and serialization-I/O
+-- checks for single-sample (3D) and batch (4D) inputs, then cross-checks
+-- forward/backward against an equivalent nn.SpatialSubSampling (weights
+-- filled with 1/(ki*kj), zero bias), which computes the same average when
+-- there is no padding.
+function nntest.SpatialAveragePooling()
+   for _,count_include_pad in pairs({true,false}) do
+      for _,ceil_mode in pairs({true,false}) do
+        local from = math.random(1,5)
+        local ki = math.random(1,4)
+        local kj = math.random(1,4)
+        local si = math.random(1,3)
+        local sj = math.random(1,3)
+        local outi = math.random(4,5)
+        local outj = math.random(4,5)
+        -- padding must not exceed half the kernel size
+        local padW = math.min(math.random(0,1),math.floor(ki/2))
+        local padH = math.min(math.random(0,1),math.floor(kj/2))
+        -- input extents that produce exactly outi x outj output locations
+        local ini = (outi-1)*si+ki-2*padW
+        local inj = (outj-1)*sj+kj-2*padH
+
+        local mode_string = ceil_mode and 'ceil' or 'floor'
+
+        local module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+        if ceil_mode then module:ceil() else module:floor() end
+        if count_include_pad then
+           module:setCountIncludePad()
+           mode_string = mode_string .. ' - count include padding'
+        else
+           module:setCountExcludePad()
+           mode_string = mode_string .. ' - count exclude padding'
+        end
+        local input = torch.Tensor(from, inj, ini):uniform()
+
+        local err = jac.testJacobian(module, input)
+        mytester:assertlt(err, precision, 'error'..mode_string..' on state ')
+
+        local ferr, berr = jac.testIO(module, input)
+        mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+        mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+        -- batch mode: fresh geometry, same kernel/stride and modes
+        local batch = math.random(2,5)
+        outi = math.random(4,5)
+        outj = math.random(4,5)
+        local padW = math.min(math.random(0,1),math.floor(ki/2))
+        local padH = math.min(math.random(0,1),math.floor(kj/2))
+        local ini = (outi-1)*si+ki-2*padW
+        local inj = (outj-1)*sj+kj-2*padH
+
+        module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+        if ceil_mode then module:ceil() else module:floor() end
+        if count_include_pad then
+           module:setCountIncludePad()
+        else
+           module:setCountExcludePad()
+        end
+        input = torch.Tensor(batch,from,inj,ini):uniform()
+
+        local err = jac.testJacobian(module, input)
+        mytester:assertlt(err, precision, 'batch error'..mode_string..' on state ')
+
+        -- (an identical second testIO call with unlabeled messages used to
+        -- precede this one; the redundant duplicate has been removed)
+        local ferr, berr = jac.testIO(module, input)
+        mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+        mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+
+      end
+   end
+   -- test against SpatialSubSampling
+   local from = math.random(1,6)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local si = math.random(1,4)
+   local sj = math.random(1,4)
+   local outi = math.random(6,10)
+   local outj = math.random(6,10)
+   local padW = 0
+   local padH = 0
+   local ini = (outi-1)*si+ki-2*padW
+   local inj = (outj-1)*sj+kj-2*padH
+
+   local module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+   local sap = nn.SpatialSubSampling(from, ki, kj, si, sj)
+   -- with these parameters SpatialSubSampling computes the window average
+   sap.weight:fill(1.0/(ki*kj))
+   sap.bias:fill(0.0)
+
+   local input = torch.Tensor(from, inj, ini):uniform()
+
+   local output = module:forward(input)
+   local gradInput = module:backward(input, output)
+   local output2 = sap:forward(input)
+   -- the two outputs are asserted equal below, so feeding `output` as the
+   -- reference module's gradOutput is equivalent to feeding `output2`
+   local gradInput2 = sap:updateGradInput(input, output)
+
+   mytester:assertTensorEq(output, output2, 0.000001, torch.typename(module) .. ' forward err ')
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, torch.typename(module) .. ' backward err ')
+
+   -- test against SpatialSubSampling, batch mode
+   local batch = math.random(2,5)
+   outi = math.random(4,8)
+   outj = math.random(4,8)
+   local padW = 0
+   local padH = 0
+   local ini = (outi-1)*si+ki-2*padW
+   local inj = (outj-1)*sj+kj-2*padH
+
+   module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+   input = torch.Tensor(batch,from,inj,ini):uniform()
+
+   local sap = nn.SpatialSubSampling(from, ki, kj, si, sj)
+   sap.weight:fill(1.0/(ki*kj))
+   sap.bias:fill(0.0)
+
+   local output = module:forward(input)
+   local gradInput = module:backward(input, output)
+   local output2 = sap:forward(input)
+   local gradInput2 = sap:updateGradInput(input, output)
+
+   mytester:assertTensorEq(output, output2, 0.000001, torch.typename(module) .. ' forward err (Batch) ')
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, torch.typename(module) .. ' backward err (Batch) ')
+
+end
+
+-- Tests nn.SpatialAdaptiveMaxPooling(ki, kj). NOTE(review): despite the
+-- kernel-style names, ki/kj here are the module's target output size.
+-- Covers Jacobian and I/O checks for 3D and 4D inputs, plus
+-- forward/backward equality between non-contiguous inputs and their
+-- contiguous copies.
+function nntest.SpatialAdaptiveMaxPooling()
+   local from = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local ini = math.random(1,16)
+   local inj = math.random(1,16)
+
+   local module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+   local input = torch.rand(from,ini,inj)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- batch
+   local nbatch = math.random(1,3)
+   input = torch.rand(nbatch,from,ini,inj)
+   module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+
+   -- non-contiguous: a transposed view must give the same results as its
+   -- contiguous copy
+
+   input = torch.rand(from,ini,inj):transpose(2,3)
+   module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input):clone()
+   local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+   local gradInput = module:backward(input, output):clone()
+   local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+
+   -- non-contiguous batch
+   local nbatch = math.random(1,3)
+   input = torch.rand(nbatch,from,ini,inj):transpose(1,3):transpose(2,4)
+   local inputc = input:contiguous() -- contiguous
+   module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+
+   local output = module:forward(input):clone()
+   local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+   local gradInput = module:backward(input, output):clone()
+   local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+
+end
+
+-- Jacobian and serialization-I/O checks for nn.SpatialLPPooling with p = 2
+-- on a random 3D input whose extents are exact multiples of the kernel size.
+function nntest.SpatialLPPooling()
+   local nInput = math.random(1,4)
+   local outX   = math.random(1,4)
+   local outY   = math.random(1,4)
+   local pnorm  = 2
+   local kx     = math.random(2,6)
+   local ky     = math.random(2,6)
+   -- strides never exceed the kernel, so windows tile the input
+   local dx     = math.random(2,kx)
+   local dy     = math.random(2,ky)
+   local inX, inY = outX*kx, outY*ky
+   local pool  = nn.SpatialLPPooling(nInput,pnorm,kx,ky,dx,dy)
+   local input = torch.rand(nInput,inY,inX)
+
+   mytester:assertlt(jac.testJacobian(pool, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(pool, input)
+   local name = torch.typename(pool)
+   mytester:asserteq(ferr, 0, name .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, name .. ' - i/o backward err ')
+end
+
+-- Tests nn.Sum across its option combinations: plain 1D sum, negative
+-- dimension indexing, batch mode (nInputDims given), mean mode
+-- (sizeAverage), a random dimension of a 3D tensor, and I/O round-trip.
+function nntest.Sum()
+   -- 1D
+   local ini = math.random(3,7)
+   local input = torch.Tensor(ini):zero()
+   local module = nn.Sum(1)
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- negative dimension: -1 must resolve to the last dimension
+   local module   = nn.Sum(-1)
+   local input    = torch.Tensor({1, 2, 3})
+   local expected = torch.Tensor({6})
+   local output   = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   -- batch: second constructor arg (nInputDims = 1) marks dim 1 of the 2D
+   -- input as the batch dimension, so each row is summed independently
+   local dimension = 1
+   local module    = nn.Sum(dimension, 1)
+   local input     = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected  = torch.Tensor({6, 15})
+   local output    = module:forward(input)
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   local err       = jac.testJacobian(module, input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- mean + batch: third arg (sizeAverage = true) divides by the summed
+   -- dimension's size; dimension+1 accounts for the batch offset
+   local dimension = 1
+   local module    = nn.Sum(dimension, 1, true)
+   local input     = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected  = input:mean(dimension + 1)
+   local output    = module:forward(input)
+
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   local err       = jac.testJacobian(module, input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- 3D, summing over a randomly chosen dimension
+   local ini = math.random(3,5)
+   local inj = math.random(3,5)
+   local ink = math.random(3,5)
+   local input = torch.Tensor(ini,inj,ink):zero()
+   local module = nn.Sum(torch.random(1,3))
+
+   local err = jac.testJacobian(module,input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(module,input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Jacobian and serialization-I/O checks for the pointwise nn.Tanh module
+-- on a randomly sized 3D tensor.
+function nntest.Tanh()
+   local sz1 = math.random(3,5)
+   local sz2 = math.random(3,5)
+   local sz3 = math.random(3,5)
+   local input = torch.Tensor(sz3, sz2, sz1):zero()
+
+   local tanh = nn.Tanh()
+
+   mytester:assertlt(jac.testJacobian(tanh, input), precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(tanh, input)
+   local name = torch.typename(tanh)
+   mytester:asserteq(ferr, 0, name .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, name .. ' - i/o backward err ')
+end
+
+-- Tests nn.TemporalConvolution: Jacobian checks of the input, weight and
+-- bias gradients (including direct-update and all update variants) on a 2D
+-- (seq x features) input, the same battery on a 3D batch input, an I/O
+-- round-trip, and a check that batch forward/backward match the
+-- single-sample path for one slice.
+function nntest.TemporalConvolution()
+   -- 1D
+   local from = math.random(1,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local si = math.random(1,4)
+   local outi = math.random(5,7)
+   -- sequence length that yields exactly outi output frames
+   local ini = (outi-1)*si+ki
+   local module = nn.TemporalConvolution(from, to, ki,si)
+   local input = torch.Tensor(ini, from):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update]')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update]')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- 2D (batch of sequences)
+   local nBatchFrame = 4
+   local input = torch.Tensor(nBatchFrame, ini, from):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update]')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update]')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- 2D matches 1D: the second batch element must equal a standalone
+   -- single-sample pass on the same slice
+   local output = module:forward(input):clone()
+   local outputGrad = torch.randn(output:size())
+   local inputGrad = module:backward(input, outputGrad):clone()
+
+   local input1D = input:select(1, 2)
+   local output1D = module:forward(input1D)
+   local outputGrad1D = outputGrad:select(1, 2)
+   local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward)')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward)')
+end
+
+-- Tests nn.TemporalSubSampling: Jacobian checks of the input, weight and
+-- bias gradients (plus direct-update and all update variants) on a
+-- (seq x features) input, followed by an I/O round-trip.
+function nntest.TemporalSubSampling()
+   local from = math.random(1,5)
+   local ki = math.random(1,6)
+   local si = math.random(1,4)
+   local outi = math.random(6,9)
+   -- sequence length that yields exactly outi output frames
+   local ini = (outi-1)*si+ki
+   local module = nn.TemporalSubSampling(from, ki, si)
+   local input = torch.Tensor(ini, from):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Tests nn.TemporalMaxPooling: Jacobian and I/O checks on a 2D
+-- (seq x features) input and on a 3D batch input, plus a check that the
+-- batch path matches the single-sample path for one slice.
+function nntest.TemporalMaxPooling()
+   local from = math.random(2,4)
+   local ki = math.random(5,7)
+   local si = math.random(1,2)
+   local outi = math.random(30,40)
+   -- sequence length that yields exactly outi output frames
+   local ini = (outi-1)*si+ki
+   local module = nn.TemporalMaxPooling(ki, si)
+   local input = torch.Tensor(ini, from):zero()
+
+   -- 1D
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- 2D (batch)
+   local nBatchFrame = 2
+   local input = torch.Tensor(nBatchFrame, ini, from):zero()
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- 2D matches 1D: the second batch element must equal a standalone pass
+   local output = module:forward(input):clone()
+   local outputGrad = torch.randn(output:size())
+   local inputGrad = module:backward(input, outputGrad):clone()
+
+   local input1D = input:select(1, 2)
+   local output1D = module:forward(input1D)
+   local outputGrad1D = outputGrad:select(1, 2)
+   local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward)')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward)')
+end
+
+-- Deterministic check of nn.VolumetricFullConvolution with known weights:
+-- weight = 1, bias = 0.1, and a single 1 at position (1,1,1) of each of the
+-- 3 input planes, so the output is 3 + bias inside the 3x3x3 kernel
+-- footprint and just the bias elsewhere. Backward values are compared with
+-- hand-computed constants as well.
+--
+-- Fix: the original asserted `assertlt(value - expected, precision)`, a
+-- one-sided check that passes whenever the value is *below* the expected
+-- constant (e.g. an all-zero output would pass). Wrap the differences in
+-- math.abs so deviations in either direction fail.
+function nntest.VolumetricFullConvolution_simple_test()
+    local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3);
+    module.weight:fill(1);
+    module.bias:fill(0.1);
+
+    local input = torch.Tensor(1, 3, 2, 2, 2):zero();
+    for c = 1,3 do
+        input[1][c][1][1][1] = 1
+    end
+    local output = module:forward(input)
+    for t = 1,6 do
+        for h = 1,6 do
+            for w = 1,6 do
+                if t <= 3 and h <= 3 and w <= 3 then
+                    mytester:assertlt(math.abs(output[1][1][t][h][w] - 3.1), precision, 'error on forward ')
+                else
+                    mytester:assertlt(math.abs(output[1][1][t][h][w] - 0.1), precision, 'error on forward ')
+                end
+            end
+        end
+    end
+
+    module:zeroGradParameters()
+    local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
+    local gradIn = module:backward(input, gradOut)
+    for t = 1,2 do
+        for h = 1,2 do
+            for w = 1,2 do
+                mytester:assertlt(math.abs(gradIn[1][1][t][h][w] - 2.7), precision,
+                                  'error on backward input gradients ')
+            end
+        end
+    end
+
+    mytester:assertlt(math.abs(module.gradBias[1] - 21.6), precision,
+                      'error on backward gradBias ')
+    for c = 1,3 do
+        for t = 1,3 do
+            for h = 1,3 do
+                for w = 1,3 do
+                    mytester:assertlt(math.abs(module.gradWeight[c][1][t][h][w] - 0.1), precision,
+                                      'error on backward weight gradients ')
+                end
+            end
+        end
+    end
+end
+
+-- Jacobian (input, weight, bias) and I/O checks for
+-- nn.VolumetricFullConvolution on a random batched 5D input; kernel and
+-- stride are square in the spatial plane (kj = ki, sj = si).
+function nntest.VolumetricFullConvolution()
+    local from = math.random(2,3)
+    local to = math.random(2,3)
+    local kt = math.random(3,4)
+    local ki = math.random(3,4)
+    local kj = ki
+    local st = math.random(1,3)
+    local si = math.random(1,3)
+    local sj = si
+    local int = math.random(3,4)
+    local ini = math.random(3,4)
+    local inj = math.random(3,4)
+    local bs = math.random(1, 6)
+    local module = nn.VolumetricFullConvolution(from, to, kt, ki, kj, st, si, sj)
+
+    local input = torch.Tensor(bs, from, int, ini, inj):zero()
+
+    local err = jac.testJacobian(module, input)
+    mytester:assertlt(err, precision, 'error on state ')
+
+    local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+    mytester:assertlt(err , precision, 'error on weight ')
+
+    local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+    mytester:assertlt(err , precision, 'error on bias ')
+
+    local ferr, berr = jac.testIO(module, input)
+    mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+    mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Tests nn.VolumetricFullConvolution's dual-input mode: forwarding
+-- {input, target} sizes the output to match the target tensor. Results are
+-- compared against a reference module built with the equivalent explicit
+-- output adjustments (adjT/adjW/adjH), and gradInput[2] is checked to be
+-- the singleton tensor {0} (no gradient flows into the size-hint tensor).
+function nntest.VolumetricFullConvolutionDualInput()
+   local from = math.random(2,3)
+   local to = math.random(2,3)
+   local kt = math.random(3,4)
+   local ki = math.random(3,4)
+   local kj = math.random(3,4)
+   local dt =  math.random(1,3)
+   local di =  math.random(1,3)
+   local dj =  math.random(1,3)
+   local padT = math.random(0,2)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outt = math.random(5,9)
+   local outi = math.random(5,9)
+   local outj = math.random(5,9)
+   -- input extents implied by the desired output (full-convolution inverse)
+   local int = math.floor((outt + padT*2 - kt)/dt + 1)
+   local ini = math.floor((outi + padW*2 - ki)/di + 1)
+   local inj = math.floor((outj + padH*2 - kj)/dj + 1)
+   -- explicit output adjustments that reproduce the target size exactly
+   local adjT = (outt + 2 * padT - kt) % dt
+   local adjW = (outi + 2 * padW - ki) % di
+   local adjH = (outj + 2 * padH - kj) % dj
+   local targetTensor = torch.Tensor(outt, outj, outi):zero()
+   local input = torch.Tensor(from, int, inj, ini):zero()
+
+   local module = nn.VolumetricFullConvolution(from, to, kt, ki, kj, dt, di, dj, padT, padW, padH)
+   local moduleRef = nn.VolumetricFullConvolution(from, to, kt, ki, kj, dt, di, dj, padT, padW, padH, adjT, adjW, adjH)
+   moduleRef.weight:copy(module.weight)
+   moduleRef.bias:copy(module.bias)
+
+   -- Check that the required output size matches the actual output size
+   -- when using the dual input mode
+   local output = module:forward({input, targetTensor})
+   mytester:asserteq(output:size(2), outt, 'output depth error')
+   mytester:asserteq(output:size(3), outj, 'output height error')
+   mytester:asserteq(output:size(4), outi, 'output width error')
+
+   -- Check that backward and forward match the reference module
+   local outputRef = moduleRef:forward(input)
+   mytester:asserteq(0, (output-outputRef):abs():max(), torch.typename(module) .. ' - output err ')
+   local gradOutput = outputRef:clone():uniform()
+   local gradInputRef = moduleRef:backward(input, gradOutput)
+   local gradInput = module:backward({input, targetTensor}, gradOutput)
+   mytester:asserteq(0, (gradInput[1]-gradInputRef):abs():max(), torch.typename(module) .. ' - gradInput[1] err ')
+
+   -- Check that gradInput[2] is the singleton tensor {0}
+   mytester:asserteq(gradInput[2]:storage():size(), 1, torch.typename(module) .. ' - gradInput[2] size err ')
+   mytester:asserteq(gradInput[2]:storage()[1], 0, torch.typename(module) .. ' - gradInput[2] value err ')
+end
+
+-- Tests nn.VolumetricConvolution: Jacobian checks of input, weight and bias
+-- gradients (plus direct-update and all update variants) on a 4D input with
+-- random kernel/stride/padding, followed by an I/O round-trip.
+function nntest.VolumetricConvolution()
+   local from = math.random(2,4)
+   local to = math.random(1,4)
+   local kt = math.random(1,4)
+   local ki = math.random(1,4)
+   local kj = math.random(1,4)
+   local st = math.random(1,3)
+   local si = math.random(1,3)
+   local sj = math.random(1,3)
+   local padT = math.random(0,2)
+   local padW = math.random(0,2)
+   local padH = math.random(0,2)
+   local outt = math.random(5,7)
+   local outi = math.random(5,7)
+   local outj = math.random(5,7)
+   -- input extents that yield exactly outt x outi x outj output locations
+   local int = (outt-1)*st+kt-padT*2
+   local ini = (outi-1)*si+ki-padW*2
+   local inj = (outj-1)*sj+kj-padH*2
+   local module = nn.VolumetricConvolution(from, to, kt, ki, kj, st, si, sj, padT, padW, padH)
+   local input = torch.Tensor(from, int, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- Verifies that nn.VolumetricConvolution produces identical outputs,
+-- gradients and parameter gradients in batch mode and sample-by-sample,
+-- via the shared batchcompare helper.
+function nntest.VolumetricConvolutionBatchCompare()
+   local nIn, nOut    = math.random(2,3), math.random(2,3)
+   local kT, kW, kH   = math.random(3,4), math.random(3,4), math.random(3,4)
+   local dT, dW, dH   = math.random(2,3), math.random(2,3), math.random(2,3)
+   local pT, pW, pH   = math.random(0,2), math.random(0,2), math.random(0,2)
+   local oT, oW, oH   = math.random(3,4), math.random(3,4), math.random(3,4)
+   -- input extents that yield exactly oT x oW x oH output locations
+   local iT = (oT-1)*dT + kT - 2*pT
+   local iW = (oW-1)*dW + kW - 2*pW
+   local iH = (oH-1)*dH + kH - 2*pH
+   local conv = nn.VolumetricConvolution(nIn, nOut, kT, kW, kH, dT, dW, dH, pT, pW, pH)
+   conv:zeroGradParameters()
+   local sample = torch.randn(nIn, iT, iH, iW)
+   batchcompare(conv, sample, {'weight','bias','gradWeight','gradBias'})
+end
+
+-- Jacobian and I/O checks for nn.VolumetricAveragePooling on a 4D input
+-- and on a 5D batch input, with random kernel and stride.
+function nntest.VolumetricAveragePooling()
+   local from = math.random(2,3)
+   local kt = math.random(3,4)
+   local ki = math.random(3,4)
+   local kj = math.random(3,4)
+   local st = math.random(2,3)
+   local si = math.random(2,3)
+   local sj = math.random(2,3)
+   local outt = math.random(3,4)
+   local outi = math.random(3,4)
+   local outj = math.random(3,4)
+   -- input extents that yield exactly outt x outi x outj output locations
+   local int = (outt-1)*st+kt
+   local ini = (outi-1)*si+ki
+   local inj = (outj-1)*sj+kj
+   local module = nn.VolumetricAveragePooling(kt, ki, kj, st, si, sj)
+   local input = torch.Tensor(from, int, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- batch
+   local nbatch = math.random(2,3)
+   module = nn.VolumetricAveragePooling(kt, ki, kj, st, si, sj)
+   input = torch.Tensor(nbatch, from, int, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+end
+
+-- Jacobian and I/O checks for nn.VolumetricMaxPooling with random kernel,
+-- stride and padding, on a 4D input and on a 5D batch input.
+function nntest.VolumetricMaxPooling()
+   local from = math.random(2,3)
+   local kt = math.random(3,4)
+   local ki = math.random(3,4)
+   local kj = math.random(3,4)
+   local st = math.random(2,3)
+   local si = math.random(2,3)
+   local sj = math.random(2,3)
+   local outt = math.random(3,4)
+   local outi = math.random(3,4)
+   local outj = math.random(3,4)
+   -- padding must not exceed half the kernel size in each dimension
+   local padT = math.min(math.random(0,2),math.floor(kt/2))
+   local padW = math.min(math.random(0,2),math.floor(ki/2))
+   local padH =  math.min(math.random(0,2),math.floor(kj/2))
+   -- input extents that yield exactly outt x outi x outj output locations
+   local int = (outt-1)*st+kt-2*padT
+   local ini = (outi-1)*si+ki-2*padW
+   local inj = (outj-1)*sj+kj-2*padH
+   local module = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+   local input = torch.Tensor(from, int, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- batch
+   local nbatch = math.random(2,3)
+   module = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+   input = torch.Tensor(nbatch, from, int, inj, ini):zero()
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+end
+
+-- Tests nn.VolumetricMaxUnpooling paired with a VolumetricMaxPooling
+-- module: unpooling the pooled tensor must restore the original input size
+-- (stride equals kernel size, so windows tile exactly). Jacobian and I/O
+-- checks run for both single-sample and batch inputs. NOTE(review): the
+-- unpooler appears to rely on the indices recorded by the pooling module's
+-- most recent forward(), hence poolingModule:forward runs first each time.
+function nntest.VolumetricMaxUnpooling()
+   local from = math.random(2,3)
+   local kt = math.random(3,4)
+   local ki = math.random(3,4)
+   local kj = math.random(3,4)
+   local st, si, sj = kt, ki, kj
+   local outt = math.random(3,4)
+   local outi = math.random(3,4)
+   local outj = math.random(3,4)
+   -- padding must not exceed half the kernel size
+   local padT = math.min(math.random(0,2),math.floor(kt/2))
+   local padW = math.min(math.random(0,2),math.floor(ki/2))
+   local padH = math.min(math.random(0,2),math.floor(kj/2))
+   local int = (outt-1)*st+kt-2*padT
+   local ini = (outi-1)*si+ki-2*padW
+   local inj = (outj-1)*sj+kj-2*padH
+
+   local poolingModule = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+   local module = nn.VolumetricMaxUnpooling(poolingModule)
+
+   local original = torch.rand(from,int,inj,ini)
+   local input = poolingModule:forward(original)
+   local output = module:forward(input)
+   mytester:assert(output:isSameSizeAs(original),'VolumetricMaxUnpooling output size err')
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- batch
+   local nbatch = math.random(2,3)
+   original = torch.rand(nbatch,from,int,inj,ini)
+   input = poolingModule:forward(original)
+   output = module:forward(input)
+
+   mytester:assert(output:isSameSizeAs(original),'VolumetricMaxUnpooling batch output size err')
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on Batch')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+   mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+end
+
+-- Boundary-handling test for nn.VolumetricMaxPooling with a 2x2x2 kernel,
+-- 2x2x2 stride and ceil() rounding on a (nip x 2 x 7 x 7) input: the 7-wide
+-- spatial extents are odd, so the last output row/column covers a truncated
+-- window that ceil mode must still include.
+function nntest.VolumetricMaxPooling_boundary()
+   -- simple kernel 2x2x2 with striding 2x2x2
+   local module = nn.VolumetricMaxPooling(2, 2, 2, 2, 2, 2):ceil()
+   local nip = math.random(3,256)
+   local input = torch.rand(nip, 2, 7, 7)
+
+   -- do a forward pass
+   local output = module:forward(input)
+
+   -- expected shape: ceil(2/2) = 1 temporal, ceil(7/2) = 4 spatial
+   mytester:asserteq(output:size(1), nip, 'wrong output channels')
+   mytester:asserteq(output:size(2), 1, 'wrong output temporal length')
+   mytester:asserteq(output:size(3), 4, 'wrong output height')
+   mytester:asserteq(output:size(4), 4, 'wrong output width')
+
+   -- top-right corner: window is 2 deep x 2 high but only 1 wide (column 7)
+   for c = 1,nip do
+      local expected = input[c][1][1][7]
+      for t = 1,2 do
+         for h = 1,2 do
+            expected = math.max(expected, input[c][t][h][7])
+         end
+      end
+      mytester:asserteq(output[c][1][1][4], expected, 'wrong forward execution')
+   end
+
+   -- bottom-left corner: window is 2 deep x 1 high (row 7) x 2 wide
+   for c = 1,nip do
+      local expected = input[c][1][7][1]
+      for t = 1,2 do
+         for w = 1,2 do
+            expected = math.max(expected, input[c][t][7][w])
+         end
+      end
+      mytester:asserteq(output[c][1][4][1], expected, 'wrong forward execution')
+   end
+
+   -- bottom-right corner: window is the single spatial cell (7,7), 2 deep
+   for c = 1,nip do
+      local expected = math.max(input[c][1][7][7], input[c][2][7][7])
+      mytester:asserteq(output[c][1][4][4], expected, 'wrong forward execution')
+   end
+
+   -- backward is supposed to be tested in nntest.VolumetricMaxPooling;
+   -- this only tests the boundary cases
+end
+
+function nntest.Module_getParameters_1()
+   -- The flattened parameter vector must view the single Linear's weight
+   -- (elements 1..100) and bias (101..110) storages exactly.
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'getParameters(): weights wrong')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'getParameters(): bias wrong')
+end
+
+function nntest.Module_getParameters_2()
+   -- Calling getParameters(), appending a module, and calling it again must
+   -- place the new module's parameters after the first module's 110 elements.
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   local _ = n:getParameters()
+
+   n:add( nn.Linear(10,10) )
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when appending new module')
+   mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when appending new module')
+end
+
+function nntest.Module_getParameters_3()
+   -- A plain clone (no sharing) gets its own slice of the flat vector;
+   -- after reset() the flat vector no longer matches the re-initialized
+   -- weights (p views the storage, reset rewrites it with new values).
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   n:add( n.modules[1]:clone() )
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+   mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when using cloning')
+
+   -- a clone of an un-reset module has identical values, so module 1's
+   -- weights also equal module 2's slice here
+   mytester:asserteq((p[{ {111,210} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {211,220} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+   n:reset()
+
+   -- after reset, the two modules diverge from each other's slices
+   mytester:assertgt((p[{ {111,210} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+   mytester:assertgt((p[{ {211,220} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+end
+
+function nntest.Module_getParameters_4()
+   -- Clone (no sharing) + append after a first getParameters() call:
+   -- all three modules must occupy disjoint slices, 330 elements total.
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   n:add( n.modules[1]:clone() )
+   local _ = n:getParameters()
+
+   n:add(nn.Linear(10,10))
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+   mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when using cloning')
+
+   mytester:asserteq((p[{ {221,320} }] - n.modules[3].weight):norm(), 0, 'error when using cloning')
+   mytester:asserteq((p[{ {321,330} }] - n.modules[3].bias):norm(), 0, 'error when using cloning')
+
+   mytester:asserteq(p:nElement(), 3*(10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_5()
+   -- A clone that SHARES weight/bias/gradWeight/gradBias must map to the
+   -- same 110-element slice as the original, even after reset().
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning+sharing')
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+   n:reset()
+
+   -- sharing survives reset: the flat vector still views both modules
+   mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+   mytester:asserteq(p:nElement(), (10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_6()
+   -- Shared clone + appended module: shared pair occupies one 110-element
+   -- slice, the new module the next one — 220 elements total, not 330.
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+   local _ = n:getParameters()
+
+   n:add(nn.Linear(10,10))
+   local p = n:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning+sharing')
+
+   mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+   mytester:asserteq((p[{ {111,210} }] - n.modules[3].weight):norm(), 0, 'error when using cloning+sharing')
+   mytester:asserteq((p[{ {211,220} }] - n.modules[3].bias):norm(), 0, 'error when using cloning+sharing')
+
+   mytester:asserteq(p:nElement(), 2*(10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_7()
+   -- Partial reallocation: after flattening two containers that share n1,
+   -- flattening a new container (nf) that reuses n1 must produce a compact
+   -- 121-element vector (100+10 for n1's Linear, 10+1 for the new Linear).
+   local n = nn.Sequential()
+   n:add( nn.Linear(10,10) )
+   n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+   local _ = n:getParameters()
+
+   n:add(nn.Linear(10,10))
+   local _ = n:getParameters()
+
+   local n1 = nn.Sequential()
+   n1:add( nn.Linear(10,10) )
+
+   local n2 = nn.Sequential()
+   n2:add( nn.Linear(10,10) )
+
+   -- NOTE: this 'local n' deliberately shadows the earlier one; the first
+   -- network is only used to pre-populate flattened storages.
+   local n = nn.Sequential()
+   n:add( n1 )
+   n:add( n2 )
+
+   local _ = n:getParameters()
+
+   local nf = nn.Sequential()
+   nf:add( n1 )
+   nf:add( nn.Linear(10,1) )
+
+   local p = nf:getParameters()
+
+   mytester:asserteq((p[{ {1,100} }] - n1.modules[1].weight):norm(), 0, 'error when using cloning+partial realloc')
+   mytester:asserteq((p[{ {101,110} }] - n1.modules[1].bias):norm(), 0, 'error when using cloning+partial realloc')
+
+   mytester:asserteq((p[{ {111,120} }] - nf.modules[2].weight):norm(), 0, 'error when using cloning+partial realloc')
+   mytester:asserteq((p[{ {121,121} }] - nf.modules[2].bias):norm(), 0, 'error when using cloning+partial realloc')
+
+   mytester:asserteq(p:nElement(), 121, 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_8()
+   -- Partial realloc across independently-flattened MLPs: combining their
+   -- first layers into a new container and re-flattening must preserve the
+   -- original weight values.
+   local function makeMLP(nin, ns)
+      -- builds a Sequential of Linear layers with sizes ns and flattens it
+      local net = nn.Sequential()
+
+      for k,v in ipairs(ns) do
+         net:add(nn.Linear(nin, v))
+         nin = v
+      end
+      local _,_ = net:getParameters()
+      return net
+   end
+
+  local mlp1 = makeMLP(10, {10,10})
+  local mlp2 = makeMLP(10, {10,10})
+
+  local net = nn.Sequential():add(mlp1:get(1))
+                             :add(mlp2:get(1))
+
+  -- clone the second MLP to ensure that the weights before calling getParameters are preserved
+  mlp2 = mlp2:clone()
+
+  local p, _ = net:getParameters()
+
+  mytester:asserteq((p[{ {1,100} }] - net.modules[1].weight):norm(), 0, 'error when using partial realloc')
+  mytester:asserteq((p[{ {111,210} }] - net.modules[2].weight):norm(), 0, 'error when using partial realloc')
+  -- check that the weights have the same values as before get Parameters was called
+  mytester:asserteq((net.modules[1].weight - mlp1.modules[1].weight):norm(), 0, ' error when using partial realloc')
+  mytester:asserteq((net.modules[2].weight - mlp2.modules[1].weight):norm(), 0, ' error when using partial realloc')
+
+end
+
+function nntest.Module_getParameters_10()
+   -- tensors are non-contiguous but compact; they can be gathered
+   -- (transposed weight, bias that is a row of a larger storage); values
+   -- must survive flattening and both must end up in one 110-element storage.
+   local L = nn.Linear(10,10)
+   L.weight = torch.Tensor(10,10):t():fill(1)
+   local tmp = torch.Tensor(10,10):fill(2)
+   L.bias = tmp:select(1,2)
+   local P = L:getParameters()
+   mytester:asserteq(L.weight:mean(), 1)
+   mytester:asserteq(L.bias:mean(), 2)
+   -- weight and bias now share one storage of exactly 110 elements
+   mytester:asserteq(L.weight:storage(), L.bias:storage())
+   mytester:asserteq(P:nElement(), 110)
+   mytester:asserteq(P:storage():size(), 110)
+   mytester:assertlt(L.bias[{ {10} }]:storageOffset() - 1, L.bias:storage():size())
+end
+
+function nntest.Module_getParameters_11()
+   -- tensors are non-compact; they can't be gathered
+   -- (a column slice has stride 10), so getParameters must raise an error.
+   local L = nn.Linear(10,10)
+   local tmp = torch.Tensor(10,10):fill(2)
+   L.bias = tmp:select(2,2)
+   -- err is intentionally unused; only the failure itself is checked
+   local ok, err = pcall(L.getParameters, L)
+   mytester:assert(not ok)
+end
+
+function nntest.Module_getParameters_12()
+   -- tensors are expanded (i.e. have dimension 0)
+   -- NOTE(review): torch.expand returns a new view and its result is
+   -- discarded here, so weight/gradWeight appear to remain 10x1 (consistent
+   -- with the 20-element expectation below) — confirm the intent upstream.
+   local L = nn.Linear(10,10)
+   L.weight = torch.Tensor(10, 1):fill(1)
+   torch.expand(L.weight, 10, 10)
+   L.gradWeight = torch.Tensor(10, 1):fill(1)
+   torch.expand(L.gradWeight, 10, 10)
+   L.bias = torch.Tensor(10):fill(2)
+   local P = L:getParameters()
+   mytester:asserteq(L.weight:mean(), 1)
+   mytester:asserteq(L.bias:mean(), 2)
+   mytester:asserteq(L.weight:storage(), L.bias:storage())
+   mytester:asserteq(P:nElement(), 20)
+   mytester:asserteq(P:storage():size(), 20)
+   mytester:assertlt(L.bias[{ {10} }]:storageOffset() - 1, L.bias:storage():size())
+end
+
+function nntest.Module_listModules()
+   -- listModules() must enumerate a nested container (Sequential inside
+   -- Sequential inside ConcatTable) in depth-first order, containers first.
+   local batchSize = 4
+   local inputSize, outputSize = 7, 6
+   local linear = nn.Linear(inputSize, outputSize)
+   local tanh = nn.Tanh()
+   local reshape = nn.Reshape(outputSize/2, 2)
+   local mlp3 = nn.Sequential()
+   mlp3:add(linear)
+   mlp3:add(tanh)
+   mlp3:add(reshape)
+
+   local mlp2 = nn.Sequential()
+   local view = nn.View(outputSize)
+   local linear2 = nn.Linear(outputSize, inputSize)
+   local tanh2 = nn.Tanh()
+   mlp2:add(mlp3)
+   mlp2:add(view)
+   mlp2:add(linear2)
+   mlp2:add(tanh2)
+
+   local concat = nn.ConcatTable()
+   local id = nn.Identity()
+   concat:add(mlp2)
+   concat:add(id)
+   local mlp = nn.Sequential()
+   local add = nn.CAddTable()
+   mlp:add(concat)
+   mlp:add(add)
+
+   -- expected enumeration order
+   local modules2 = {mlp, concat, mlp2, mlp3, linear, tanh, reshape, view, linear2, tanh2, id, add}
+   local modules = mlp:listModules()
+
+   mytester:assert(#modules2 == #modules, 'missing modules error')
+
+   for i,module in ipairs(modules) do
+      mytester:assert(torch.type(module) == torch.type(modules2[i]), 'module error')
+   end
+end
+
+function nntest.PairwiseDistance()
+   -- Note: testJacobian doesn't support table inputs, and rather than re-write
+   -- it so that it does, I'll just use a split table module on the input.
+   -- I assume both SplitTable and Sequential do not have bugs, otherwise this
+   -- test will break.
+   for p = 1,4 do  -- test a few Lp norms
+      -- TEST CASE 1: non-batch input, same code path but includes a resize
+      local ini = math.random(3,5)
+      local input = torch.Tensor(2, ini):zero()
+      local module = nn.Sequential()
+      module:add(nn.SplitTable(1))
+      module:add(nn.PairwiseDistance(p))
+
+      local err = jac.testJacobian(module,input)
+      mytester:assertlt(err, 1e-4, ' error on state ')
+
+      local ferr,berr = jac.testIO(module,input)
+      mytester:asserteq(ferr, 0, torch.typename(module)..' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module)..' - i/o backward err ')
+
+      -- Also check that the forward prop result is correct.
+      -- compare against torch.dist (the reference Lp distance)
+      input = torch.rand(2, ini)
+      err = torch.dist(input:select(1,1), input:select(1,2), p) -
+        module:forward(input)[1]
+      mytester:assertlt(err,precision, ' error on non-batch fprop ')
+
+      -- TEST CASE 2: batch input
+      local inj = math.random(3,5)
+      input = torch.Tensor(2, inj, ini):zero()
+
+      -- (Rebuild the module to avoid correlated tests)
+      module = nn.Sequential()
+      module:add(nn.SplitTable(1))
+      module:add(nn.PairwiseDistance(p))
+
+      err = jac.testJacobian(module,input)
+      mytester:assertlt(err, 1e-4, ' error on state ')
+
+      -- Also check that the forward prop result is correct.
+      -- manually calculate each distance separately
+      local inputa = torch.rand(inj,ini)
+      local inputb = torch.rand(inj,ini)
+      local dist_manual = torch.Tensor(inj)
+      for i=1, inputa:size(1) do
+         dist_manual[i] = torch.dist(inputa:select(1,i), inputb:select(1,i),p)
+      end
+      -- compare the distances to the module's fprop
+      local dist = module:forward(torch.cat(inputa,inputb,1):resize(2,inj,ini))
+      err = dist - dist_manual
+      mytester:assertlt(err:norm(), precision, torch.typename(module) ..
+         ' error on batch fprop ')
+  end
+end
+
+function nntest.Index()
+    -- nn.Index selects rows along dim 1; backward must accumulate gradients
+    -- for repeated indices and zero rows that were never selected.
+    local net = nn.Index(1)
+
+    -- test 1D
+    local input = {torch.Tensor{10, 20, 30}, torch.LongTensor{1, 2, 2, 3}}
+    local output = net:forward(input)
+    equal(output, torch.Tensor{10, 20, 20, 30}, "error in 1D forward pass")
+
+    local gradOutput = torch.Tensor{1, 1, 1, 3 }
+    local gradInput = net:backward(input, gradOutput)
+    -- index 2 appears twice, so its gradient is 1+1=2
+    equal(gradInput[1], torch.Tensor{1, 2, 3}, "error in 1D backward pass")
+
+    -- test 2D
+    local input = {torch.Tensor{{10, 20}, {30, 40}}, torch.LongTensor{1, 1}}
+    local output = net:forward(input)
+    equal(output, torch.Tensor{{10, 20}, {10, 20}}, "error in 2D forward pass")
+
+    local gradOutput = torch.Tensor{{1, 2}, {1, 2}}
+    local gradInput = net:backward(input, gradOutput)
+    -- row 1 selected twice, row 2 never: gradients {2,4} and {0,0}
+    equal(gradInput[1], torch.Tensor{{2, 4}, {0, 0}}, "error in 2D backward pass")
+end
+
+function nntest.Squeeze()
+   -- nn.Squeeze must match tensor:squeeze() in forward, and backward must
+   -- restore the original (unsqueezed) shape.
+   local input  = torch.Tensor(2,1,3):zero()
+   local module = nn.Squeeze()
+   equal(module:forward(input), input:squeeze(), "error in forward pass")
+   local output = input:squeeze()
+   equal(module:backward(input, output), input, "error in backward pass")
+
+   -- testing the dimension option:
+   local input  = torch.Tensor(2,1,1,3):zero()
+   local module = nn.Squeeze(2)
+   equal(module:forward(input), input:squeeze(2), "error in forward pass with dimension")
+   local output = input:squeeze(2)
+   equal(module:backward(input, output), input, "error in backward pass with dimension")
+
+   -- with batch (numInputDims = 3, so dim 2 of the sample is dim 3 overall)
+   local input  = torch.Tensor(2,1,1,3):zero()
+   local module = nn.Squeeze(2, 3)
+   equal(module:forward(input), input:squeeze(3), "error in forward pass with dimension")
+   local output = input:squeeze(3)
+   equal(module:backward(input, output), input, "error in backward pass with dimension")
+
+
+   -- ... of size one (batch dimension itself is 1)
+   local input  = torch.Tensor(1,1,1,3):zero()
+   local module = nn.Squeeze(2, 3)
+   equal(module:forward(input), input:squeeze(3), "error in forward pass with dimension")
+   local output = input:squeeze(3)
+   equal(module:backward(input, output), input, "error in backward pass with dimension")
+end
+
+function nntest.Unsqueeze()
+   -- nn.Unsqueeze(pos[, numInputDims]) inserts a singleton dimension at
+   -- 'pos'; backward must restore the input shape exactly.
+   local function assertInputOutputSize(inputSize, outputSize, tf)
+      -- zero tensors suffice: only shapes are being checked
+      local input = torch.Tensor(table.unpack(inputSize)):zero()
+      local output = torch.Tensor(table.unpack(outputSize)):zero()
+      local gradInput = input:clone()
+      local gradOutput = output:clone()
+      equal(tf:forward(input), output, "error in forward pass")
+      equal(tf:backward(input, gradOutput), gradInput, "error in backward pass")
+   end
+
+   local function test_normal()
+      -- insert dim 1 at head
+      local inputSize, outputSize = {2,3,4}, {1, 2,3,4}
+      local pos = 1
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+      -- insert dim 1 at tail
+      local inputSize, outputSize = {2,3,4}, {2,3,4, 1}
+      local pos = 4
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+      -- insert dim 1 in between
+      local inputSize, outputSize = {2,3,4}, {2, 1, 3,4}
+      local pos = 2
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+   end
+
+   local function test_batchmode()
+      -- batch mode: insert dim 1 at head
+      local inputSize, outputSize = {5, 2, 3, 4}, {5, 1, 2, 3, 4}
+      local pos = 1
+      local numInputDims = 3
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+
+      -- batch mode: insert dim 1 at tail
+      local inputSize, outputSize = {5, 2, 3, 4}, {5, 2, 3, 4, 1}
+      local pos = 4
+      local numInputDims = 3
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+
+      -- batch mode: insert dim 1 in between
+      local inputSize, outputSize = {5, 2, 3, 4}, {5, 2, 1, 3, 4}
+      local pos = 2
+      local numInputDims = 3
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+   end
+
+   local function test_sizeone()
+      -- inputs already containing singleton dimensions
+      local inputSize, outputSize = {1,1,3,1}, {1,1, 1, 3,1}
+      local pos = 3
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+      local inputSize, outputSize = {1,1,3,2}, {1,1,3,2, 1}
+      local pos = 3
+      local numInputDims = 2
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+   end
+
+   local function test_sizestrange()
+      -- degenerate shapes: 1-D vectors
+      local inputSize, outputSize = {2}, {2,1}
+      local pos = 2
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+      local inputSize, outputSize = {1}, {1, 1}
+      local pos = 1
+      assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+   end
+
+   test_normal()
+   test_batchmode()
+   test_sizeone()
+   test_sizestrange()
+end
+
+function nntest.LookupTable()
+   -- Tests nn.LookupTable: Jacobian w.r.t. weights (1D and 2D index input),
+   -- I/O serialization, accUpdateOnly mode, padding-index weight freezing,
+   -- and max-norm renormalization of looked-up rows.
+   local totalIndex = math.random(6,9)
+   local nIndex = math.random(3,5)
+   local entry_size = math.random(2,5)
+
+   local function dotest(module, input, minval, maxval)
+       local output = module:forward(input)
+       module:backwardUpdate(input, output, 0.1)
+       input:zero()
+
+       -- 1D
+       local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight, minval, maxval)
+       mytester:assertlt(err,precision, '1D error on weight ')
+
+       local err = jac.testJacobianUpdateParameters(module, input, module.weight, minval, maxval)
+       mytester:assertlt(err,precision, '1D error on weight [direct update] ')
+
+       module.gradWeight:zero()
+       for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+          mytester:assertlt(err, precision, string.format(
+                             '1D error on weight [%s]', t))
+       end
+
+       -- 2D
+       local nframe = math.random(2,5)
+       local input = torch.IntTensor(nframe, nIndex):zero()
+
+       local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight, minval, maxval)
+       mytester:assertlt(err,precision, '2D error on weight ')
+
+       local err = jac.testJacobianUpdateParameters(module, input, module.weight, minval, maxval)
+       mytester:assertlt(err,precision, '2D error on weight [direct update] ')
+
+       module.gradWeight:zero()
+       for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+          mytester:assertlt(err, precision, string.format(
+                             '2D error on weight [%s]', t))
+       end
+
+       -- IO
+       module.gradInput = torch.Tensor(3,4):zero() --fixes an error
+       local ferr,berr = jac.testIO(module,input,minval,maxval)
+       mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+       mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+       -- accUpdate
+       module:accUpdateOnly()
+       mytester:assert(not module.gradWeight, 'gradWeight is nil')
+       module:float()
+       local output = module:forward(input)
+       module:backwardUpdate(input, output, 0.1)
+   end
+   -- test without padding
+   local input = torch.randperm(totalIndex):narrow(1,1,nIndex):int()
+   local module = nn.LookupTable(totalIndex, entry_size)
+   dotest(module, input, 1, totalIndex)
+   -- test with padding set to 1, but no padding in inputs
+   local input = torch.randperm(totalIndex):narrow(1,1,nIndex):int()
+   local module = nn.LookupTable(totalIndex, entry_size, 1)
+   dotest(module, input, 2, totalIndex)
+   -- test whether padding weights remain unchanged
+   local paddingValue = math.random(totalIndex)
+   local module = nn.LookupTable(totalIndex, entry_size, paddingValue)
+   local padw = module.weight:select(1,paddingValue):fill(1)
+   local padw_sum = padw:sum()
+   local input = torch.IntTensor(nIndex)
+   for i = 1, 100 do
+       input:apply(
+       function() -- set randomly half of the input as padding
+           if torch.random(2) == 1 then return paddingValue end
+           return torch.random(totalIndex)
+       end)
+       local y = module:updateOutput(input)
+       module:updateGradInput(input, y)
+       module:accUpdateGradParameters(input, y, 0.1)
+   end
+   local err = padw_sum - padw:sum()
+   mytester:assertlt(err,precision, 'padding update error ')
+   -- test whether the weights changes accordingly when maxNorm is not nil
+   local all_index = torch.randperm(totalIndex):int()
+   -- input can have duplicates
+   local input = torch.repeatTensor(all_index:narrow(1,1,nIndex), 2)
+   local maxNorm = math.random()
+   for _, normType in ipairs{1, 2, math.random()} do
+      local module = nn.LookupTable(totalIndex, entry_size, 0, maxNorm, normType)
+      local oriW = module.weight:clone()
+      -- 'local' added: this previously leaked an accidental global
+      local output = module:updateOutput(input)
+      -- check output is of small norm
+      for j = 1,output:size(1) do
+         local norm = torch.norm(output:select(1, j), normType)
+         if norm > maxNorm then
+            local err = norm - maxNorm;
+            mytester:assertlt(math.abs(err), precision, string.format(
+               'output after renorm exceeds maxNorm=[%f] with normType=[%f]', maxNorm, normType))
+         end
+      end
+      -- check the update of the module.weight
+      for j = 1,totalIndex do
+         local k = all_index[j]
+         if j <= nIndex then -- k is an index in "input"
+            local norm = torch.norm(module.weight:select(1, k), normType)
+            local oriNorm = torch.norm(oriW:select(1, k), normType)
+            if oriNorm > maxNorm then
+               local err = norm - maxNorm
+               mytester:assertlt(math.abs(err), precision, 'unexpected norm after renorm')
+            else
+               local err = norm - oriNorm
+               mytester:assertlt(math.abs(err), precision, 'unexpected norm after renorm')
+            end
+         else -- k is not an index in "input"
+            local err = module.weight:select(1,k):sum() - oriW:select(1,k):sum()
+            mytester:assertlt(math.abs(err), precision, 'unexpected changes in weight after renorm')
+         end
+      end
+   end
+end
+
+function nntest.AddConstant()
+  -- nn.AddConstant: forward adds a scalar; in-place variant must produce
+  -- identical outputs/gradients to the out-of-place one and must restore
+  -- the input tensor after backward.
+  local nbatch = torch.random(3, 5)
+  local f = torch.random(3, 5)
+  local h = torch.random(7,9)
+  local w = torch.random(7,9)
+  local input = torch.rand(nbatch, f, h, w):mul(20):add(-10)  -- [-10, 10]
+
+  local constant = torch.randn(1):squeeze()
+  local mod = nn.AddConstant(constant)
+
+  -- Test FPROP
+  local output = mod:forward(input)
+  local delta = output - input
+  mytester:assertlt(delta:add(-constant):abs():max(), precision, 'fprop error')
+
+  -- Test BPROP
+  local err = jac.testJacobian(mod, input)
+  mytester:assertlt(err, precision, 'bprop error ')
+
+  -- inplace comparisons
+  local ini = math.random(3,5)
+  local inj = math.random(3,5)
+  local ink = math.random(3,5)
+  local constant = torch.uniform()*math.random(1,10)
+
+  local input1 = torch.rand(ink, inj, ini)
+  local input2 = input1:clone()
+
+  local module1 = nn.AddConstant(constant,true)
+  local module2 = nn.AddConstant(constant)
+
+  local gradOutput1 = torch.rand(ink, inj, ini)
+  local gradOutput2 = gradOutput1:clone()
+
+  local out1 = module1:forward(input1)
+  local out2 = module2:forward(input2)
+
+  mytester:asserteq(0, (out1-out2):abs():max(), torch.typename(module1) ..
+                    ' - in-place forward err ')
+
+  local gradInput1 = module1:backward(input1, gradOutput1)
+  local gradInput2 = module2:backward(input2, gradOutput2)
+
+  mytester:asserteq(0, (gradInput1-gradInput2):abs():max(),
+                torch.typename(module1) .. ' - in-place backward err ')
+
+  local input1 = torch.rand(ink, inj, ini)
+  local input2 = input1:clone()
+
+  -- backward is fed module1.output on purpose: the in-place module must
+  -- undo its modification of the input during backward
+  module1:forward(input1)
+  module1:backward(module1.output,torch.rand(input1:size()))
+
+  local err = (input1-input2):abs():max()
+  mytester:asserteq(err, 0, torch.typename(module1) ..
+                          ' - inplace input change err ')
+end
+
+function nntest.MulConstant()
+  -- nn.MulConstant: forward multiplies by a scalar; mirrors the AddConstant
+  -- test, including the in-place vs out-of-place comparison.
+  local nbatch = torch.random(3, 5)
+  local f = torch.random(3, 5)
+  local h = torch.random(7,9)
+  local w = torch.random(7,9)
+  local input = torch.rand(nbatch, f, h, w):mul(20):add(-10)  -- [-10, 10]
+
+  local constant = torch.randn(1):squeeze()
+  local mod = nn.MulConstant(constant)
+
+  -- Test FPROP
+  local output = mod:forward(input)
+  local scale = output:clone():cdiv(input)
+  mytester:assertlt(scale:add(-constant):abs():max(), precision, 'fprop error')
+
+  -- Test BPROP
+  local err = jac.testJacobian(mod, input)
+  mytester:assertlt(err, precision, 'bprop error ')
+
+  -- inplace comparisons
+  local ini = math.random(3,5)
+  local inj = math.random(3,5)
+  local ink = math.random(3,5)
+  local constant = torch.uniform()*math.random(1,10)
+
+  local input1 = torch.rand(ink, inj, ini)
+  local input2 = input1:clone()
+
+  local module1 = nn.MulConstant(constant,true)
+  local module2 = nn.MulConstant(constant)
+
+  local gradOutput1 = torch.rand(ink, inj, ini)
+  local gradOutput2 = gradOutput1:clone()
+
+  local out1 = module1:forward(input1)
+  local out2 = module2:forward(input2)
+
+  mytester:asserteq(0, (out1-out2):abs():max(), torch.typename(module1) ..
+                    ' - in-place forward err ')
+
+  local gradInput1 = module1:backward(input1, gradOutput1)
+  local gradInput2 = module2:backward(input2, gradOutput2)
+
+  mytester:asserteq(0, (gradInput1-gradInput2):abs():max(),
+                torch.typename(module1) .. ' - in-place backward err ')
+
+  local input1 = torch.rand(ink, inj, ini)
+  local input2 = input1:clone()
+
+  -- in-place restoration check: division reintroduces rounding error,
+  -- hence the 1e-15 tolerance (vs exact equality for AddConstant)
+  module1:forward(input1)
+  module1:backward(module1.output,torch.rand(input1:size()))
+
+  local err = (input1-input2):abs():max()
+  mytester:assertalmosteq(err, 0, 1e-15, torch.typename(module1) ..
+                          ' - inplace input change err ')
+end
+
+function nntest.Copy()
+   -- nn.Copy casts Double->Float on forward and back on backward; with
+   -- dontCast set, a subsequent :double() must not retype the output.
+   local input = torch.randn(3,4):double()
+   local c = nn.Copy('torch.DoubleTensor', 'torch.FloatTensor')
+   local output = c:forward(input)
+   mytester:assert(torch.type(output) == 'torch.FloatTensor', 'copy forward type err')
+   mytester:assertTensorEq(output, input:float(), 0.000001, 'copy forward value err')
+   local gradInput = c:backward(input, output)
+   mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor', 'copy backward type err')
+   mytester:assertTensorEq(gradInput, input, 0.000001, 'copy backward value err')
+   c.dontCast = true
+   c:double()
+   mytester:assert(torch.type(output) == 'torch.FloatTensor', 'copy forward type err')
+end
+
+function nntest.JoinTable()
+   -- Joining two identical tensors along dimension d must double size(d);
+   -- with nInputDims=2 the join dimension shifts by one for batch input.
+   local tensor = torch.rand(3,4,5)
+   local input = {tensor, tensor}
+   local module
+   for d = 1,tensor:dim() do
+      module = nn.JoinTable(d)
+      mytester:asserteq(module:forward(input):size(d), tensor:size(d)*2, "dimension " .. d)
+   end
+
+   -- Minibatch
+   local tensor = torch.rand(3,4,5)
+   local input = {tensor, tensor}
+   local module
+   for d = 1,tensor:dim()-1 do
+      module = nn.JoinTable(d, 2)
+      mytester:asserteq(module:forward(input):size(d+1), tensor:size(d+1)*2, "dimension " .. d)
+   end
+end
+
+function nntest.SplitTable()
+   -- Splitting along dimension d must yield size(d) tensors; nInputDims
+   -- shifts the split dimension for batch input, and negative dimensions
+   -- count from the end.
+   local input = torch.randn(3,4,5)
+   local module
+   for d = 1,input:dim() do
+      module = nn.SplitTable(d)
+      mytester:asserteq(#module:forward(input), input:size(d), "dimension " .. d)
+   end
+
+   -- Minibatch
+   local input = torch.randn(3,4,5)
+   local module
+   for d = 1,input:dim()-1 do
+      module = nn.SplitTable(d, 2)
+      mytester:asserteq(#module:forward(input), input:size(d+1), "dimension " .. d)
+   end
+
+   -- Negative indices
+   local module = nn.SplitTable(-3)
+   local input = torch.randn(3,4,5)
+   mytester:asserteq(#module:forward(input), 3, "negative index")
+   -- with a 4-D input, -3 still resolves to the same per-sample dimension
+   local input = torch.randn(2,3,4,5)
+   mytester:asserteq(#module:forward(input), 3, "negative index (minibatch)")
+end
+
+function nntest.Select()
+  -- Test negative Select: nn.Select(1,-1) picks the last row, -2 the
+  -- second-to-last.
+  local input = torch.Tensor{{4,6,7}, {8,0,1}}
+  mytester:asserteq(nn.Select(1,-1):forward(input)[1], 8, "negative index")
+  mytester:asserteq(nn.Select(1,-1):forward(input)[2], 0, "negative index")
+  mytester:asserteq(nn.Select(1,-2):forward(input)[2], 6, "negative index")
+end
+
+function nntest.SelectTable()
+   -- nn.SelectTable forwards element 'idx' of an input table (nested tables
+   -- included) and must zero the gradients of all unselected entries.
+   local input = {
+      torch.rand(3,4,5), torch.rand(3,4,5),
+      {torch.rand(3,4,5)},
+      {torch.rand(3,4,5), {torch.rand(3,4,5)}}
+   }
+   local gradOutputs = {
+      torch.rand(3,4,5), torch.rand(3,4,5),
+      {torch.rand(3,4,5)},
+      {torch.rand(3,4,5), {torch.rand(3,4,5)}}
+   }
+   local zeros = {
+      torch.Tensor(3,4,5):zero(), torch.Tensor(3,4,5):zero(),
+      {torch.Tensor(3,4,5):zero()},
+      {torch.Tensor(3,4,5):zero(), {torch.Tensor(3,4,5):zero()}}
+   }
+   -- for each selected idx, one representative non-selected index to check
+   local nonIdx = {2,3,4,1}
+   local module
+   for idx = 1,#input do
+      module = nn.SelectTable(idx)
+      local output = module:forward(input)
+      equal(output, input[idx], "output dimension " .. idx)
+      local gradInput = module:backward(input, gradOutputs[idx])
+      equal(gradInput[idx], gradOutputs[idx], "gradInput[idx] dimension " .. idx)
+      equal(gradInput[nonIdx[idx]], zeros[nonIdx[idx]], "gradInput[nonIdx] dimension " .. idx)
+   end
+
+   -- test negative index
+   local idx = -2
+   module = nn.SelectTable(idx)
+   local output = module:forward(input)
+   equal(output, input[#input+idx+1], "output dimension " .. idx)
+   local gradInput = module:backward(input, gradOutputs[#input+idx+1])
+   equal(gradInput[#input+idx+1], gradOutputs[#input+idx+1], "gradInput[idx] dimension " .. idx)
+   equal(gradInput[nonIdx[#input+idx+1]], zeros[nonIdx[#input+idx+1]], "gradInput[nonIdx] dimension " .. idx)
+
+   -- test typecast
+   local idx = #input
+   module = nn.SelectTable(idx)
+   module:float()
+   local output = module:forward(input)
+   equal(output, input[idx], "type output")
+   local gradInput = module:backward(input, gradOutputs[idx])
+   equal(gradInput[idx], gradOutputs[idx], "gradInput[idx] dimension " .. idx)
+   equal(gradInput[nonIdx[idx]], zeros[nonIdx[idx]], "gradInput[nonIdx] dimension " .. idx)
+
+   -- test on differently sized sub-input tables given consecutively
+   local input1 = {
+      torch.rand(3,4,5),
+      {torch.rand(3,4,5), torch.rand(3,4,5), torch.rand(3,4,5)}
+   }
+   local input2 = {
+      torch.rand(3,4,5),
+      {torch.rand(3,4,5), torch.rand(3,4,5)}
+   }
+
+   module = nn.SelectTable(1)
+   local output = module:forward(input1)
+   equal(output, input1[1], "output dimension 1")
+   local gradInput = module:backward(input1, output)
+   mytester:assert(#gradInput == #input1, "Table lengths")
+   mytester:assert(#gradInput[2] == #input1[2], "Sub-Table lengths")
+   output = module:forward(input2)
+   equal(output, input2[1], "output dimension 1")
+   gradInput = module:backward(input2, output)
+   mytester:assert(#gradInput == #input2, "Table lengths")
+   mytester:assert(#gradInput[2] == #input2[2], "Sub-Table lengths")
+
+   -- test on tables of increasing size: a reused module must behave the
+   -- same as two fresh modules
+   local input1 = {torch.rand(3,4,5), torch.rand(3,4,5)}
+   local input2 = {torch.rand(3,4,5), torch.rand(3,4,5), torch.rand(3,4,5)}
+   local gradOutput1 = torch.randn(3,4,5)
+   local gradOutput2 = torch.randn(3,4,5)
+
+   local module1 = nn.SelectTable(-1)
+   local output1 = module1:forward(input1):clone()
+   local output2 = module1:forward(input2)
+   local gradInput_ = module1:backward(input1, gradOutput1)
+   local gradInput1 = {}
+   for k,v in ipairs(gradInput_) do gradInput1[k] = v:clone() end
+   local gradInput2 = module1:backward(input2, gradOutput2)
+
+   local module3 = nn.SelectTable(-1)
+   local module4 = nn.SelectTable(-1)
+   local output3 = module3:forward(input1)
+   local output4 = module4:forward(input2)
+   local gradInput3 = module3:backward(input1, gradOutput1)
+   local gradInput4 = module4:backward(input2, gradOutput2)
+
+   equal(output1, output3, "output 1 and 3")
+   equal(output2, output4, "output 2 and 4")
+   equal(gradInput1, gradInput3, "gradInput 1 and 3")
+   equal(gradInput2, gradInput4, "gradInput 2 and 4")
+end
+
+-- Tests nn.MixtureTable: output = sum_i gater[i] * expert_i.
+-- Covers 2D, 3D and 1D expert inputs, experts given both as a table of
+-- tensors and as a single tensor, float type-casts, and a 2D-gater /
+-- 1D-expert combination.  All outputs and gradients are checked against
+-- references computed directly with view/expand/cmul/sum.
+function nntest.MixtureTable()
+   -- 2D
+   -- expertInput is a Table:
+   local expertInput = torch.randn(5,3,6)
+   local gradOutput = torch.randn(5,6)
+   local input = {
+      torch.rand(5,3),
+      {expertInput:select(2,1), expertInput:select(2,2), expertInput:select(2,3)}
+   }
+   local module = nn.MixtureTable()
+   local output = module:forward(input)
+   -- reference: weight each expert by its gater value, then sum over experts
+   local output2 = torch.cmul(input[1]:view(5,3,1):expand(5,3,6), expertInput):sum(2):squeeze(2)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture output")
+   local gradInput = module:backward(input, gradOutput)
+   -- gater gradient: <gradOutput, expert_i> per example
+   local gradOutput2 = torch.view(gradOutput, 5, 1, 6):expandAs(expertInput)
+   local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(3):select(3,1)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture gater gradInput")
+   -- expert gradient: gater value times gradOutput
+   local expertGradInput2 = torch.cmul(input[1]:view(5,3,1):expand(5,3,6), gradOutput:view(5,1,6):expand(5,3,6))
+   for i, expertGradInput in ipairs(gradInput[2]) do
+      mytester:assertTensorEq(expertGradInput, expertGradInput2:select(2,i), 0.000001, "mixture expert "..i.." gradInput")
+   end
+   -- expertInput is a Tensor:
+   local input = {input[1], expertInput}
+   local module = nn.MixtureTable(2)
+   local output = module:forward(input)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture2 output")
+   local gradInput = module:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture2 gater gradInput")
+   mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture2 expert gradInput")
+
+   -- 3D
+   local expertInput = torch.randn(5,6,3,2)
+   local gradOutput = torch.randn(5,6,2)
+   -- expertInput is a Table:
+   local input = {
+      torch.rand(5,3),
+      {expertInput:select(3,1), expertInput:select(3,2), expertInput:select(3,3)}
+   }
+   local module = nn.MixtureTable()
+   local output = module:forward(input)
+   local output2 = torch.cmul(input[1]:view(5,1,3,1):expand(5,6,3,2), expertInput):sum(3):squeeze(3)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture3 output")
+   local gradInput = module:backward(input, gradOutput)
+   local gradOutput2 = torch.view(gradOutput,5,6,1,2):expandAs(expertInput)
+   local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(4):select(4,1):sum(2):select(2,1)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture3 gater gradInput")
+   local expertGradInput2 = torch.cmul(input[1]:view(5,1,3,1):expand(5,6,3,2), gradOutput2)
+   for i, expertGradInput in ipairs(gradInput[2]) do
+      mytester:assertTensorEq(expertGradInput, expertGradInput2:select(3,i), 0.000001, "mixture3 expert "..i.." gradInput")
+   end
+   -- expertInput is a Tensor
+   local input = {input[1], expertInput}
+   local module = nn.MixtureTable(3)
+   local output = module:forward(input)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture4 output")
+   local gradInput = module:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture4 gater gradInput")
+   mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture4 expert gradInput")
+
+   -- 1D
+   -- expertInput is a Table:
+   local expertInput = torch.randn(3,6)
+   local gradOutput = torch.randn(6)
+   local input = {
+      torch.rand(3),
+      {expertInput:select(1,1), expertInput:select(1,2), expertInput:select(1,3)}
+   }
+   local module = nn.MixtureTable()
+   local output = module:forward(input)
+   local output2 = torch.cmul(input[1]:view(3,1):expand(3,6), expertInput):sum(1):squeeze(1)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture5 output")
+   local gradInput = module:backward(input, gradOutput)
+   local gradOutput2 = torch.view(gradOutput, 1, 6):expandAs(expertInput)
+   local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(2):select(2,1)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture5 gater gradInput")
+   local expertGradInput2 = torch.cmul(input[1]:view(3,1):expand(3,6), gradOutput:view(1,6):expand(3,6))
+   for i, expertGradInput in ipairs(gradInput[2]) do
+      mytester:assertTensorEq(expertGradInput, expertGradInput2:select(1,i), 0.000001, "mixture5 expert "..i.." gradInput")
+   end
+   -- test type-cast
+   module:float()
+   local input2 = {
+      input[1]:float(),
+      {input[2][1]:float(), input[2][2]:float(), input[2][3]:float()}
+   }
+   local output = module:forward(input2)
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "mixture5B output")
+   local gradInput = module:backward(input2, gradOutput:float())
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2:float(), 0.000001, "mixture5B gater gradInput")
+   for i, expertGradInput in ipairs(gradInput[2]) do
+      mytester:assertTensorEq(expertGradInput, expertGradInput2:select(1,i):float(), 0.000001, "mixture5B expert "..i.." gradInput")
+   end
+   -- expertInput is a Tensor:
+   local input = {input[1], expertInput}
+   local module = nn.MixtureTable(1)
+   local output = module:forward(input)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture6 output")
+   local gradInput = module:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture6 gater gradInput")
+   mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture6 expert gradInput")
+   -- test type-cast:
+   module:float()
+   local input2 = {input[1]:float(), expertInput:float()}
+   local output = module:forward(input2)
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "mixture6B output")
+   local gradInput = module:backward(input2, gradOutput:float())
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2:float(), 0.000001, "mixture6B gater gradInput")
+   mytester:assertTensorEq(gradInput[2], expertGradInput2:float(), 0.000001, "mixture6B expert gradInput")
+
+   --2D gater, 1D expert
+   -- expertInput is a Table:
+   local expertInput = torch.randn(5,3)
+   local gradOutput = torch.randn(5)
+   local input = {
+      torch.rand(5,3),
+      {expertInput:select(2,1), expertInput:select(2,2), expertInput:select(2,3)}
+   }
+   local module = nn.MixtureTable()
+   local output = module:forward(input)
+   local output2 = torch.cmul(input[1], expertInput):sum(2):squeeze(2)
+   mytester:assertTensorEq(output, output2, 0.000001, "mixture7 output")
+   local gradInput = module:backward(input, gradOutput)
+   local gradOutput2 = torch.view(gradOutput, 5, 1):expandAs(expertInput)
+   local gaterGradInput2 = torch.cmul(gradOutput2, expertInput)
+   mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture7 gater gradInput")
+   local expertGradInput2 = torch.cmul(input[1], gradOutput:view(5,1):expand(5,3))
+   for i, expertGradInput in ipairs(gradInput[2]) do
+      mytester:assertTensorEq(expertGradInput, expertGradInput2:select(2,i), 0.000001, "mixture7 expert "..i.." gradInput")
+   end
+end
+
+-- Checks nn.Narrow against Tensor:narrow() for three dim/offset/length
+-- combinations; each case is also exercised with the equivalent negative
+-- (counted-from-the-end) length argument.
+function nntest.Narrow()
+   local cases = {
+      {size = {9, 4, 14}, dim = 1, offset = 3, len = 5, negLen = -3},
+      {size = {3, 10, 4}, dim = 2, offset = 5, len = 3, negLen = -4},
+      {size = {6, 11, 7}, dim = 3, offset = 1, len = 1, negLen = -7},
+   }
+   for i, case in ipairs(cases) do
+      local input = torch.rand(table.unpack(case.size))
+      -- reference output and gradInput built directly with Tensor:narrow
+      local output = input:narrow(case.dim, case.offset, case.len)
+      local gradSize = {table.unpack(case.size)}
+      gradSize[case.dim] = case.len
+      local gradOutput = torch.rand(table.unpack(gradSize))
+      local gradInput = torch.zeros(table.unpack(case.size))
+      gradInput:narrow(case.dim, case.offset, case.len):copy(gradOutput)
+      -- module constructed with an explicit positive length
+      local module1 = nn.Narrow(case.dim, case.offset, case.len)
+      local output1 = module1:forward(input)
+      local gradInput1 = module1:backward(input, gradOutput)
+      -- same narrow expressed with a negative length
+      local module2 = nn.Narrow(case.dim, case.offset, case.negLen)
+      local output2 = module2:forward(input)
+      local gradInput2 = module2:backward(input, gradOutput)
+      mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #" .. i .. " output err")
+      mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #" .. i .. " gradInput err")
+      mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #" .. i .. " negative output err")
+      mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #" .. i .. " negative gradInput err")
+   end
+end
+
+-- nn.NarrowTable(5,3) applied to a SplitTable of dim 2 followed by a
+-- JoinTable/Reshape must behave exactly like nn.Narrow(2,5,3), including
+-- after resizing the input and after a float type-cast.
+function nntest.NarrowTable()
+   local input = torch.randn(3,10,4)
+   local gradOutput = torch.randn(3,3,4)
+   local nt = nn.NarrowTable(5,3)
+   local seq = nn.Sequential()
+   seq:add(nn.SplitTable(1,2))
+   seq:add(nt)
+   seq:add(nn.JoinTable(1,1))
+   seq:add(nn.Reshape(3,3,4))
+   -- reference module operating directly on the tensor
+   local seq2 = nn.Narrow(2,5,3)
+   local output = seq:forward(input)
+   local gradInput = seq:backward(input, gradOutput)
+   local output2 = seq2:forward(input)
+   local gradInput2 = seq2:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable output err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable gradInput err")
+
+   -- now try it with a smaller input
+   local input = input:narrow(2, 1, 8)
+   local output = seq:forward(input)
+   local gradInput = seq:backward(input, gradOutput)
+   local output2 = seq2:forward(input)
+   local gradInput2 = seq2:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable small output err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable small gradInput err")
+
+   -- test type-cast
+   local input = input:float()
+   local gradOutput = gradOutput:float()
+   seq:float()
+   seq2:float()
+   local output = seq:forward(input)
+   local gradInput = seq:backward(input, gradOutput)
+   local output2 = seq2:forward(input)
+   local gradInput2 = seq2:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable output float err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable gradInput float err")
+end
+
+-- Tests nn.View: construction from a LongStorage and from an unpacked
+-- size list, automatic minibatch handling, setNumInputDims with -1
+-- (inferred) sizes, and batch-dim generalization (collapsing leading dims).
+function nntest.View()
+   local input = torch.rand(10)
+   local template = torch.rand(5,2)
+   local target = template:size():totable()
+   local module = nn.View(template:size())
+   mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (1)")
+   local module = nn.View(table.unpack(target))
+   mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (2)")
+
+   -- Minibatch
+   local minibatch = torch.rand(5,10)
+   mytester:asserteq(module:forward(minibatch):size(1),
+      minibatch:size(1),
+      "Error in minibatch dimension")
+   mytester:asserteq(module:forward(minibatch):nElement(),
+      minibatch:nElement(),
+      "Error in minibatch nElement")
+   -- -1 with one declared input dim: batch dimension must be preserved
+   local module = nn.View(-1):setNumInputDims(1)
+   mytester:asserteq(module:forward(minibatch):size(1),
+      minibatch:size(1),
+      "Error in minibatch dimension with size -1")
+   mytester:asserteq(module:forward(minibatch):nElement(),
+      minibatch:nElement(),
+      "Error in minibatch nElement with size -1")
+
+   -- another setNumInputDims case
+   local minibatch = torch.rand(5,4,10)
+   local module = nn.View(-1):setNumInputDims(2)
+   mytester:asserteq(module:forward(minibatch):size(1),
+      minibatch:size(1),
+      "Error in minibatch dimension with size -1")
+
+   -- another setNumInputDims case: two leading dims collapse into one batch
+   local minibatch = torch.rand(2,5,4,10)
+   local module = nn.View(4,-1):setNumInputDims(2)
+   local out = module:forward(minibatch)
+   mytester:asserteq(out:size(1), minibatch:size(1)*minibatch:size(2),
+                          "Error in minibatch dimension with size -1")
+   mytester:asserteq(out:size(2), minibatch:size(3),
+                          "Error in minibatch dimension with size -1")
+   mytester:asserteq(out:size(3), minibatch:size(4),
+                          "Error in minibatch dimension with size -1")
+
+   -- Minibatch Generalization: leading dims folded to match the view size
+   local minibatch = torch.rand(5,2,6)
+   local module = nn.View(6)
+   mytester:asserteq(
+      module:forward(minibatch):size(1),
+      minibatch:size(1)*minibatch:size(2),
+      "Error in minibatch generalization dimension")
+   mytester:asserteq(
+      module:forward(minibatch):nElement(),
+      minibatch:nElement(),
+      "Error in minibatch generalization nElement")
+end
+
+-- Tests nn.Reshape: construction from a LongStorage size and from an
+-- unpacked size list, plus automatic minibatch handling (batch dimension
+-- and element count preserved).
+function nntest.Reshape()
+   local input = torch.rand(10)
+   local template = torch.rand(5,2)
+   local target = template:size():totable()
+   local module = nn.Reshape(template:size())
+   mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (1)")
+   -- bugfix: this previously constructed nn.View (copy-paste from
+   -- nntest.View), so "forward (2)" and the minibatch checks below were
+   -- exercising the wrong module; nntest.Reshape must test nn.Reshape.
+   local module = nn.Reshape(table.unpack(target))
+   mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (2)")
+
+   -- Minibatch
+   local minibatch = torch.rand(5,10)
+   mytester:asserteq(module:forward(minibatch):size(1),
+      minibatch:size(1),
+      "Error in minibatch dimension")
+   mytester:asserteq(module:forward(minibatch):nElement(),
+      minibatch:nElement(),
+      "Error in minibatch nElement")
+end
+
+-- Tests nn.SpatialUpSamplingNearest with a random integer scale factor,
+-- for both 3D and 4D inputs, via the Jacobian and I/O testers.
+function nntest.SpatialUpSamplingNearest()
+  local scale = torch.random(2,4)
+  for dim = 3,4 do
+    local m = nn.SpatialUpSamplingNearest(scale)
+
+    -- Create a randomly sized dim-D input shape
+    local shape = {}
+    for i = 1, dim do
+      table.insert(shape, torch.random(2, 2+dim-1))
+    end
+
+    -- Check that the gradient is correct by using finite differences
+    local input = torch.Tensor(table.unpack(shape)):zero()
+
+    local err = jac.testJacobian(m, input)
+    mytester:assertlt(err, precision, ' error on state ')
+
+    local ferr, berr = jac.testIO(m, input)
+    mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+    mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+  end
+end
+
+-- Tests nn.Concat(1) over several Linear(2,5) modules whose weights are
+-- all 1 and biases all 0, so each module's output row equals the sum of
+-- the corresponding input row; forward, gradInput and gradWeight are
+-- checked against closed-form references.
+function nntest.Concat()
+   local input = torch.randn(4, 2)
+   local num_modules = math.random(2, 5)
+   local linears = {}
+   for i = 1,num_modules do
+       linears[i] = nn.Linear(2,5)
+   end
+
+   local m = nn.Concat(1)
+   for _,module in ipairs(linears) do
+      m:add(module)
+      module:zeroGradParameters()
+      -- weight=1, bias=0 makes every output unit the row-sum of the input
+      module.weight:fill(1)
+      module.bias:fill(0)
+   end
+
+   local output = m:forward(input)
+   local output2 = input:sum(2):expand(4, 5):repeatTensor(num_modules, 1)
+   mytester:assertTensorEq(output2, output, 0.000001, 'Concat forward err')
+
+   -- with an all-ones gradOutput, each input element receives weight 1
+   -- from each of the 5 outputs of each of the num_modules modules
+   local gradInput = m:backward(input, torch.ones(output2:size()))
+   local gradInput2 = torch.ones(4, 2):fill(num_modules * 5)
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'Concat backward err (gradInput)')
+
+   -- gradWeight for every module is the input summed over the batch
+   local gradWeight = input:sum(1):expand(5, 2)
+   for _,module in ipairs(linears) do
+      mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Concat backward err (gradWeight)')
+   end
+end
+
+-- nn.Parallel(1,3) over three nn.View(4,5,1) slices is equivalent to
+-- permuting the input so dim 1 becomes dim 3; backward through these
+-- identity-like views must reproduce the input.
+function nntest.Parallel()
+   local x = torch.randn(3, 4, 5)
+   local net = nn.Parallel(1,3)
+   for _ = 1, 3 do
+      net:add(nn.View(4,5,1))
+   end
+
+   local expected = x:transpose(1,3):transpose(1,2)
+   local actual = net:forward(x)
+   mytester:assertTensorEq(expected, actual, 0.000001, 'Parallel forward err')
+
+   local gin = net:backward(x, expected)
+   mytester:assertTensorEq(gin, x, 0.000001, 'Parallel backward err')
+end
+
+-- SplitTable(1) -> ParallelTable of three View(4,5,1) -> JoinTable(3)
+-- amounts to permuting dim 1 of the input to the last position; backward
+-- must reconstruct the original input.
+function nntest.ParallelTable()
+   local x = torch.randn(3, 4, 5)
+   local branches = nn.ParallelTable()
+   for _ = 1, 3 do
+      branches:add(nn.View(4,5,1))
+   end
+   local net = nn.Sequential()
+      :add(nn.SplitTable(1))
+      :add(branches)
+      :add(nn.JoinTable(3))
+
+   local expected = x:transpose(1,3):transpose(1,2)
+   local actual = net:forward(x)
+   mytester:assertTensorEq(expected, actual, 0.000001, 'ParallelTable forward err')
+
+   local gin = net:backward(x, expected)
+   mytester:assertTensorEq(gin, x, 0.000001, 'ParallelTable backward err')
+end
+
+-- Tests nn.ConcatTable with a tensor input (via the Jacobian tester),
+-- with a nested-table input (gradients of duplicated branches must sum),
+-- and with inputs of varying table length across successive calls.
+function nntest.ConcatTable()
+   -- Test tensor input
+   local input = torch.rand(5, 5, 5)
+   local m = nn.Sequential()
+
+   local concat = nn.ConcatTable()
+   concat:add(nn.Identity())
+
+   m:add(concat)  -- Output of concat is a table of length 1
+   m:add(nn.JoinTable(1))  -- jac needs a tensor output
+
+   local err = jac.testJacobian(m, input)
+   mytester:assertlt(err, precision, ' error on state ')
+
+   local ferr, berr = jac.testIO(m, input)
+   mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+
+   -- Now test a table input
+   local input = {
+      torch.randn(3,4):float(), torch.randn(3,4):float(), {torch.randn(3,4):float()}
+   }
+   local _gradOutput = {
+      torch.randn(3,3,4):float(), torch.randn(3,3,4):float(), torch.randn(3,3,4):float()
+   }
+   -- one gradOutput table per Identity branch, mirroring input's nesting
+   local gradOutput = {
+      {_gradOutput[1][1], _gradOutput[2][1], {_gradOutput[3][1]}},
+      {_gradOutput[1][2], _gradOutput[2][2], {_gradOutput[3][2]}},
+      {_gradOutput[1][3], _gradOutput[2][3], {_gradOutput[3][3]}}
+   }
+   local module = nn.ConcatTable()
+   module:add(nn.Identity())
+   module:add(nn.Identity())
+   module:add(nn.Identity())
+   module:float()
+
+   local output = module:forward(input)
+   local output2 = {input, input, input}
+   equal(output2, output, "ConcatTable table output")
+   local gradInput = module:backward(input, gradOutput)
+   -- gradInput is the per-branch gradients summed element-wise
+   local gradInput2 = {_gradOutput[1]:sum(1):squeeze(1), _gradOutput[2]:sum(1):squeeze(1), {_gradOutput[3]:sum(1):squeeze(1)}}
+   equal(gradInput, gradInput2, "ConcatTable table gradInput")
+
+   -- test outputs for variable length inputs
+   local test = nn.ConcatTable()
+   test:add(nn.Identity())
+   test:add(nn.Identity())
+
+   local x = {torch.randn(5), torch.randn(5)}
+   local y = {torch.randn(5)}
+
+   local o1 = #(test:forward(x))
+   local go1 = #(test:backward(x, {x, x}))
+   local o2 = #(test:forward(y))
+   local go2 = #(test:backward(y, {y, y}))
+   mytester:assert(o1 == 2, "ConcatTable table variable length")
+   mytester:assert(go1 == 2, "ConcatTable table variable length")
+   mytester:assert(o2 == 2, "ConcatTable table variable length")
+   mytester:assert(go2 == 1, "ConcatTable table variable length")
+end
+
+-- Tests nn.FlattenTable: flattening a hand-built nested table, the
+-- identity mapping of gradients in backward, cache invalidation when the
+-- input table grows or shrinks, and a Jacobian test through a
+-- Split/Parallel/Flatten/Join pipeline.
+function nntest.FlattenTable()
+   -- Create a nested table.  Obviously we can't even stochastically test
+   -- the space of all possible nested tables (it's infinite), but here is a
+   -- hand-coded one that covers all the cases we need:
+   local input = {
+     torch.rand(1),
+     {
+       torch.rand(2),
+       {
+         torch.rand(3)
+       },
+     },
+     torch.rand(4)
+   }
+   local gradOutput = {
+     torch.rand(1),
+     torch.rand(2),
+     torch.rand(3),
+     torch.rand(4)
+   }
+
+   -- Check the FPROP
+   local m = nn.FlattenTable()
+   local output = m:forward(input)
+   mytester:assert(#output == 4, torch.typename(m)..' - fprop err ')
+   -- This is ugly, but check that the mapping from input to output is correct
+   mytester:assert(output[1] == input[1])
+   mytester:assert(output[2] == input[2][1])
+   mytester:assert(output[3] == input[2][2][1])
+   mytester:assert(output[4] == input[3])
+
+   -- Check the BPROP
+   local gradInput = m:backward(input, gradOutput)
+   -- Again, check that the mapping is correct
+   mytester:assert(gradOutput[1] == gradInput[1])
+   mytester:assert(gradOutput[2] == gradInput[2][1])
+   mytester:assert(gradOutput[3] == gradInput[2][2][1])
+   mytester:assert(gradOutput[4] == gradInput[3])
+
+   -- More ugliness: FlattenTable doesn't rebuild the table every updateOutput
+   -- call, so we need to make sure that modifications to the input are
+   -- detected correctly (and that the table is correctly rebuilt).
+   -- CASE 1: Nothing changes so the output table shouldn't be redefined
+   local old_input_map = m.input_map
+   local old_output = m.output
+   local _ = m:forward(input)
+   mytester:assert(old_input_map == m.input_map and old_output == m.output)
+
+   -- CASE 2: An element is added to the input table
+   old_input_map = m.input_map
+   old_output = m.output
+   input[2][#(input[2])+1] = torch.rand(5)
+   m:forward(input)
+   mytester:assert(old_input_map ~= m.input_map and old_output ~= m.output)
+
+   -- CASE 3: An element is removed from the input table
+   old_input_map = m.input_map
+   old_output = m.output
+   input[#input] = nil
+   m:forward(input)
+   mytester:assert(old_input_map ~= m.input_map and old_output ~= m.output)
+
+   -- At this point further testing is not necessary I think, but just to be
+   -- consistent: perform a jacobian test by using SplitTable and JoinTable
+   -- elements
+   m = nn.Sequential()
+   local par = nn.ParallelTable()
+   par:add(nn.SplitTable(1))
+   par:add(nn.SplitTable(1))
+   m:add(nn.SplitTable(1))
+   m:add(par)  -- this will create a nested table
+   m:add(nn.FlattenTable())  -- This will flatten the nested table
+   m:add(nn.JoinTable(1))  -- Finally, this will create a 1D tensor
+
+   input = torch.Tensor(2,2,2)
+   local err = jac.testJacobian(m, input)
+   mytester:assertlt(err, precision, 'error on bprop ')
+end
+
+-- Tests nn.L1Penalty: the accumulated loss equals weight * ||input||_1
+-- and the backward gradient equals weight * sign(input).
+function nntest.L1Penalty()
+   local weight = 1
+   local sizeAverage = false
+   local m = nn.L1Penalty(weight, sizeAverage, false)
+
+   local input = torch.rand(2,10):add(-0.5)
+   -- include an exact zero so the sign(0) = 0 case is covered
+   input[1][1] = 0
+
+   local _ = m:forward(input)
+   local grad = m:backward(input, torch.ones(input:size()))
+
+   local err = input:clone():abs():sum()*weight - m.loss
+   mytester:assertlt(math.abs(err), precision, 'error on fprop ')
+
+   -- reference gradient: +weight where input > 0, -weight where input < 0
+   local true_grad = (input:gt(0):typeAs(grad) +
+      input:lt(0):typeAs(grad):mul(-1)):mul(weight)
+   mytester:assertlt((true_grad - grad):abs():max(), precision,
+      'error on bprop ')
+
+   -- Note: We cannot use the Jacobian test for this Module since the backward
+   -- gradient cannot be estimated using finite differences (ie, the loss
+   -- during BPROP is not included in the FPROP output)
+end
+
+-- nn.L1Cost forward must equal the sum of absolute input values.
+function nntest.L1Cost()
+   local x = torch.rand(10) * 2 - 1
+   local criterion = nn.L1Cost()
+   local expected = torch.abs(x):sum()
+   local diff = criterion:forward(x) - expected
+   mytester:assertalmosteq(diff, 0, 1e-15, 'L1Cost forward')
+end
+
+-- Tests nn.DepthConcat(2): convolutions with different kernel sizes are
+-- concatenated along the filter dimension, smaller spatial outputs being
+-- zero-padded (centered) up to the largest; output, gradInput, and
+-- gradient accumulation are checked against a manual per-module reference.
+-- NOTE(review): the assertion messages below say "SpatialConcat"; the
+-- module under test is DepthConcat (messages left as-is, they are runtime
+-- strings).
+function nntest.DepthConcat()
+   local outputSize = torch.IntTensor{5,6,7,8}
+   local input = torch.randn(2,3,12,12)
+   local gradOutput = torch.randn(2, outputSize:sum(), 12, 12)
+   local concat = nn.DepthConcat(2)
+   concat:add(nn.SpatialConvolutionMM(3, outputSize[1], 1, 1, 1, 1)) --> 2, 5, 12, 12
+   concat:add(nn.SpatialConvolutionMM(3, outputSize[2], 3, 3, 1, 1)) --> 2, 6, 10, 10
+   concat:add(nn.SpatialConvolutionMM(3, outputSize[3], 4, 4, 1, 1)) --> 2, 7, 9, 9
+   concat:add(nn.SpatialConvolutionMM(3, outputSize[4], 5, 5, 1, 1)) --> 2, 8, 8, 8
+   concat:zeroGradParameters()
+   -- forward/backward
+   local outputConcat = concat:forward(input)
+   local gradInputConcat = concat:backward(input, gradOutput)
+   -- the spatial dims are the largest, the nFilters is the sum
+   local output = torch.Tensor(2, outputSize:sum(), 12, 12):zero() -- zero for padding
+   -- index ranges of each module's slice in the concatenated output
+   local narrows = { {{},{1,5},{},{}}, {{},{6,11},{2,11},{2,11}}, {{},{12,18},{2,10},{2,10}}, {{},{19,26},{3,10},{3,10}} }
+   local gradInput = input:clone():zero()
+   for i=1,4 do
+      local conv = concat:get(i)
+      local gradWeight = conv.gradWeight:clone()
+      conv:zeroGradParameters()
+      output[narrows[i]]:copy(conv:forward(input))
+      gradInput:add(conv:backward(input, gradOutput[narrows[i]]))
+      mytester:assertTensorEq(gradWeight, conv.gradWeight, 0.000001, "Error in SpatialConcat:accGradParameters for conv "..i)
+   end
+   mytester:assertTensorEq(output, outputConcat, 0.000001, "Error in SpatialConcat:updateOutput")
+   mytester:assertTensorEq(gradInput, gradInputConcat, 0.000001, "Error in SpatialConcat:updateGradInput")
+end
+
+-- Tests nn.MV (matrix-vector product, no transpose) on a single
+-- non-batched pair: forward output M*V, gradM = gradOutput (outer) V,
+-- gradV = M^T * gradOutput.
+function nntest.MV()
+  local mv = nn.MV(false)
+  local outdim = torch.random(10,20)
+  local indim = torch.random(10,20)
+  local M = torch.randn(outdim, indim)
+  local V = torch.randn(indim)
+
+  -- Test forward pass.
+  local output = mv:forward({M, V})
+  mytester:assertTableEq(output:size():totable(), {outdim},
+  'Output has wrong dimensionality')
+  mytester:assertTensorEq(output, M * V, 1e-10,
+  'Wrong output')
+
+  -- Test backward pass.
+  local gradOutput = torch.randn(outdim)
+  local gradInput = mv:backward({M, V}, gradOutput)
+  mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+  local gradM, gradV = table.unpack(gradInput)
+  mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+  'Gradient for input M has wrong size')
+  mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+  'Gradient for input V has wrong size')
+  -- d/dM(i,j) of gradOutput(i)*M(i,j)*V(j) is the outer product
+  mytester:assertTensorEq(gradM, torch.ger(gradOutput, V), 1e-10,
+  'Wrong gradient for input M')
+  -- d/dV(j) of sum_i gradOutput(i)*M(i,j)*V(j) gives M^T * gradOutput
+  mytester:assertTensorEq(gradV, M:t() * gradOutput, 1e-10,
+  'Wrong gradient for input V')
+end
+
+-- Tests nn.MV on batched inputs (no transpose) for batch sizes 1, 6, 11:
+-- per-example output M[i]*V[i] and gradients checked against references.
+function nntest.BatchMVNoTranspose()
+  local mv = nn.MV()
+  local outdim = torch.random(10,20)
+  local indim = torch.random(10,20)
+  for bSize = 1, 11, 5 do
+    local M = torch.randn(bSize, outdim, indim)
+    local V = torch.randn(bSize, indim)
+
+    -- Test forward pass.
+    local output = mv:forward({M, V})
+    mytester:assertTableEq(output:size():totable(), {bSize, outdim},
+    'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], M[i] * V[i], 1e-10,
+      'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, outdim)
+    local gradInput = mv:backward({M, V}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradM, gradV = table.unpack(gradInput)
+    mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+    'Gradient for input M has wrong size')
+    mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+    'Gradient for input V has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradM[i], torch.ger(gradOutput[i], V[i]), 1e-10,
+      'Gradient for input M wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradV[i], M[i]:t() * gradOutput[i], 1e-10,
+      'Gradient for input V wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Tests nn.MV(true) (transposed matrix) on batched inputs: per-example
+-- output M[i]^T * V[i]; gradM is the outer product V (outer) gradOutput
+-- and gradV = M[i] * gradOutput[i].
+function nntest.BatchMVTranspose()
+  local mv = nn.MV(true)
+  local outdim = torch.random(10,20)
+  local indim = torch.random(10,20)
+  for bSize = 1, 11, 5 do
+    local M = torch.randn(bSize, indim, outdim)
+    local V = torch.randn(bSize, indim)
+
+    -- Test forward pass.
+    local output = mv:forward({M, V})
+    mytester:assertTableEq(output:size():totable(), {bSize, outdim},
+    'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], M[i]:t() * V[i], 1e-10,
+      'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, outdim)
+    local gradInput = mv:backward({M, V}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradM, gradV = table.unpack(gradInput)
+    mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+    'Gradient for input M has wrong size')
+    mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+    'Gradient for input V has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradM[i], torch.ger(V[i], gradOutput[i]), 1e-10,
+      'Gradient for input M wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradV[i], M[i] * gradOutput[i], 1e-10,
+      'Gradient for input V wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Draws three independent random matrix dimensions in [10, 20] for the
+-- MM/MV batch tests below.
+local function createMatrixInputSizes()
+  local dims = {}
+  for i = 1, 3 do
+    dims[i] = torch.random(10, 20)
+  end
+  return dims[1], dims[2], dims[3]
+end
+
+-- Tests nn.MM(false, true) (second operand transposed) on a single
+-- non-batched pair: output A * B^T, gradA = gradOutput * B,
+-- gradB = gradOutput^T * A.
+function nntest.MM()
+  local mm = nn.MM(false, true)
+  local M, N, P = createMatrixInputSizes()
+  local A = torch.randn(M, N)
+  local B = torch.randn(P, N)
+
+  -- Test forward pass.
+  local output = mm:forward({A, B})
+  mytester:assertTableEq(output:size():totable(), {M, P},
+                         'Output has wrong dimensionality')
+  mytester:assertTensorEq(output, A * B:t(), 1e-10,
+                          'Wrong output')
+
+  -- Test backward pass.
+  local gradOutput = torch.randn(M, P)
+  local gradInput = mm:backward({A, B}, gradOutput)
+  mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+  local gradA, gradB = table.unpack(gradInput)
+  mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+                         'Gradient for input A has wrong size')
+  mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+                         'Gradient for input B has wrong size')
+  mytester:assertTensorEq(gradA, gradOutput * B, 1e-10,
+                          'Wrong gradient for input A')
+  mytester:assertTensorEq(gradB, gradOutput:t() * A, 1e-10,
+                          'Wrong gradient for input B')
+end
+
+-- Tests nn.MM() (no transposes) on batched inputs for batch sizes
+-- 1, 6, 11: per-example output A[i]*B[i], gradA = gradOutput * B^T,
+-- gradB = A^T * gradOutput.
+function nntest.BatchMMNoTranspose()
+  local mm = nn.MM()
+  local M, N, P = createMatrixInputSizes()
+  for bSize = 1, 11, 5 do
+    local A = torch.randn(bSize, M, N)
+    local B = torch.randn(bSize, N, P)
+
+    -- Test forward pass.
+    local output = mm:forward({A, B})
+    mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+                           'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], A[i] * B[i], 1e-10,
+                              'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, M, P)
+    local gradInput = mm:backward({A, B}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradA, gradB = table.unpack(gradInput)
+    mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+                           'Gradient for input A has wrong size')
+    mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+                           'Gradient for input B has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradA[i], gradOutput[i] * B[i]:t(), 1e-10,
+                              'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradB[i], A[i]:t() * gradOutput[i], 1e-10,
+                              'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Batched nn.MM(true, false): A is stored transposed as (bSize, N, M),
+-- so output[i] = A[i]^T * B[i]; gradients follow accordingly.
+function nntest.BatchMMTransposeA()
+  local mm = nn.MM(true, false)
+  local M, N, P = createMatrixInputSizes()
+  for bSize = 1, 11, 5 do
+    local A = torch.randn(bSize, N, M)
+    local B = torch.randn(bSize, N, P)
+
+    -- Test forward pass.
+    local output = mm:forward({A, B})
+    mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+                           'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], A[i]:t() * B[i], 1e-10,
+                              'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, M, P)
+    local gradInput = mm:backward({A, B}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradA, gradB = table.unpack(gradInput)
+    mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+                           'Gradient for input A has wrong size')
+    mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+                           'Gradient for input B has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradA[i], B[i] * gradOutput[i]:t(), 1e-10,
+                              'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradB[i], A[i] * gradOutput[i], 1e-10,
+                              'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Batched nn.MM(false, true): B is stored transposed as (bSize, P, N),
+-- so output[i] = A[i] * B[i]^T; gradients follow accordingly.
+function nntest.BatchMMTransposeB()
+  local mm = nn.MM(false, true)
+  local M, N, P = createMatrixInputSizes()
+  for bSize = 1, 11, 5 do
+    local A = torch.randn(bSize, M, N)
+    local B = torch.randn(bSize, P, N)
+
+    -- Test forward pass.
+    local output = mm:forward({A, B})
+    mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+                           'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], A[i] * B[i]:t(), 1e-10,
+                              'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, M, P)
+    local gradInput = mm:backward({A, B}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradA, gradB = table.unpack(gradInput)
+    mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+                           'Gradient for input A has wrong size')
+    mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+                           'Gradient for input B has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradA[i], gradOutput[i] * B[i], 1e-10,
+                              'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradB[i], gradOutput[i]:t() * A[i], 1e-10,
+                              'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Batched nn.MM(true, true): both operands stored transposed, so
+-- output[i] = A[i]^T * B[i]^T; gradients follow accordingly.
+function nntest.BatchMMTransposeBoth()
+  local mm = nn.MM(true, true)
+  local M, N, P = createMatrixInputSizes()
+  for bSize = 1, 11, 5 do
+    local A = torch.randn(bSize, N, M)
+    local B = torch.randn(bSize, P, N)
+
+    -- Test forward pass.
+    local output = mm:forward({A, B})
+    mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+                           'Output has wrong dimensionality')
+    for i = 1, bSize do
+      mytester:assertTensorEq(output[i], A[i]:t() * B[i]:t(), 1e-10,
+                              'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+
+    -- Test backward pass.
+    local gradOutput = torch.randn(bSize, M, P)
+    local gradInput = mm:backward({A, B}, gradOutput)
+    mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+    local gradA, gradB = table.unpack(gradInput)
+    mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+                           'Gradient for input A has wrong size')
+    mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+                           'Gradient for input B has wrong size')
+    for i = 1, bSize do
+      mytester:assertTensorEq(gradA[i], B[i]:t() * gradOutput[i]:t(), 1e-10,
+                              'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+      mytester:assertTensorEq(gradB[i], gradOutput[i]:t() * A[i]:t(), 1e-10,
+                              'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+    end
+  end
+end
+
+-- Tests nn.DotProduct: exact 1D forward value, Jacobian and IO via a
+-- SplitTable+DotProduct Sequential (testJacobian cannot take table
+-- inputs directly), then a batched Jacobian on fresh sizes.
+function nntest.DotProduct()
+  local indim = math.random(1,10)
+
+  -- test 1D forward
+  local input = {torch.rand(indim),torch.rand(indim)}
+  local module = nn.DotProduct()
+  local expected = input[1]:dot(input[2])
+  local output = module:forward(input)
+  mytester:assertlt(math.abs(expected-output[1]), precision, 'error on forward ')
+
+  -- check gradients
+  -- Note: testJacobian doesn't support table inputs, and rather than re-write
+  -- it so that it does, I'll just use a split table module on the input.
+  -- I assume both SplitTable and Sequential do not have bugs, otherwise this
+  -- test will break.
+  local input = torch.rand(2,indim)
+  local module = nn.Sequential()
+  module:add(nn.SplitTable(1))
+  module:add(nn.DotProduct())
+
+  local err = jac.testJacobian(module,input)
+  mytester:assertlt(err,precision, 'error on state ')
+
+  -- IO (serialization round-trip)
+  local ferr,berr = jac.testIO(module,input)
+  mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+  mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+  -- batch
+  -- rebuild module to avoid correlated tests
+  local module = nn.Sequential()
+  module:add(nn.SplitTable(1))
+  module:add(nn.DotProduct())
+
+  local nframes = math.random(1,10)
+  local indim = math.random(1,10)
+  local input = torch.rand(2,nframes,indim)
+
+  local err = jac.testJacobian(module,input)
+  mytester:assertlt(err,precision, 'batch error on state ')
+end
+
+-- Tests nn.CosineDistance: forward against a hand-computed cosine
+-- similarity, Jacobian and IO via a SplitTable wrapper (testJacobian
+-- cannot take table inputs), then a batch-mode Jacobian on fresh sizes.
+function nntest.CosineDistance()
+  local indim = math.random(1,10)
+  local input = {torch.rand(indim),torch.rand(indim)}
+
+  -- check forward against previous implementation
+  local module = nn.CosineDistance()
+
+  -- cosine = <a,b> / (|a| * |b|)
+  local w1 = input[1]:dot(input[2])
+  local w2 = math.sqrt(input[1]:dot(input[1]))
+  local w3 = math.sqrt(input[2]:dot(input[2]))
+  local output_old = w1/w2/w3
+
+  local output = module:forward(input)
+
+  mytester:assertlt(math.abs(output_old-output[1]),precision,'error on forward ')
+
+
+  -- check gradients
+  -- Note: testJacobian doesn't support table inputs, and rather than re-write
+  -- it so that it does, I'll just use a split table module on the input.
+  -- I assume both SplitTable and Sequential do not have bugs, otherwise this
+  -- test will break.
+  local input = torch.rand(2,indim)
+  local module = nn.Sequential()
+  module:add(nn.SplitTable(1))
+  module:add(nn.CosineDistance())
+
+  local err = jac.testJacobian(module,input)
+  mytester:assertlt(err,precision, 'error on state ')
+
+  -- IO (serialization round-trip)
+  local ferr,berr = jac.testIO(module,input)
+  mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+  mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+  -- batch
+  -- rebuild module to avoid correlated tests
+  local module = nn.Sequential()
+  module:add(nn.SplitTable(1))
+  module:add(nn.CosineDistance())
+
+  local nframes = math.random(1,10)
+  local indim = math.random(1,10)
+  local input = torch.rand(2,nframes,indim)
+
+  local err = jac.testJacobian(module,input)
+  mytester:assertlt(err,precision, 'batch error on state ')
+
+end
+
+-- Tests nn.CosineEmbeddingCriterion: hand-computed zero-gradient cases
+-- (cos(v1,v2) = 0.5 is below the 0.6 margin, so with target -1 the hinge
+-- is inactive and all gradients vanish), plus Jacobian checks for random
+-- margins in single-sample and batch mode with sizeAverage on and off.
+function nntest.CosineEmbeddingCriterion()
+  local v1 = torch.Tensor{1, 0}
+  local v2 = torch.Tensor{0.5, math.sqrt(3)*0.5}
+
+  local crit = nn.CosineEmbeddingCriterion(0.6)
+  local output = crit:forward({v1, v2}, -1) -- must be called before backward
+  local grads = crit:backward({v1, v2}, -1)
+
+  -- hinge inactive (0.5 < margin 0.6) => both gradients are zero
+  local zero = torch.Tensor(2):zero()
+  equal(grads[1], zero, 'gradient should be zero')
+  equal(grads[2], zero, 'gradient should be zero')
+
+  -- check jacobians
+  local margin = math.random()*2-1
+  local dim = 5
+  local batch_size = 1
+  local crit = nn.CosineEmbeddingCriterion(margin)
+  local v = torch.rand(2,dim)
+  criterionJacobianTest1DTable(crit,v,1)
+  criterionJacobianTest1DTable(crit,v,-1)
+
+  -- batch with hand-computed values
+  local v1 = torch.Tensor{{1, 0}, {0.5, math.sqrt(3)*0.5}}
+  local v2 = torch.Tensor{{0.5, math.sqrt(3)*0.5}, {1, 0}}
+
+  local t = torch.Tensor{-1,-1}
+  local crit = nn.CosineEmbeddingCriterion(0.6)
+  local output = crit:forward({v1, v2}, t) -- must be called before backward
+  local grads = crit:backward({v1, v2}, t)
+
+  local zero = torch.Tensor(2,2):zero()
+  equal(grads[1], zero, 'gradient should be zero')
+  equal(grads[2], zero, 'gradient should be zero')
+
+  -- batch, sizeAverage true, jacobian
+  local margin = math.random()*2-1
+  local dim = 5
+  local batch_size = 2
+  local crit = nn.CosineEmbeddingCriterion(margin)
+  crit.sizeAverage = true
+  local v = torch.rand(2,batch_size,dim)
+  local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+  criterionJacobianTest1DTable(crit,v,t)
+
+  -- batch, sizeAverage false, jacobian
+  local margin = math.random()*2-1
+  local crit = nn.CosineEmbeddingCriterion(margin)
+  crit.sizeAverage = false
+  local v = torch.rand(2,batch_size,dim)
+  local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+  criterionJacobianTest1DTable(crit,v,t)
+end
+
+-- Hand-computed check of HingeEmbeddingCriterion with margin 2:
+-- positive pairs (y = 1) contribute x, negative pairs contribute
+-- max(0, 2 - x); the loss is averaged over the four elements.
+function nntest.HingeEmbeddingCriterion()
+  local input = torch.Tensor{0.3,2.1,1.8,0}
+  local target = torch.Tensor{1,-1,-1,1}
+  local expectedGrads = torch.Tensor{1,0,-1,1} / 4
+
+  local criterion = nn.HingeEmbeddingCriterion(2)
+  -- forward must run before backward so internal state is populated
+  local loss = criterion:forward(input, target)
+  local grads = criterion:backward(input, target)
+
+  -- loss = (0.3 + 0 + (2 - 1.8) + 0) / 4
+  mytester:assert(math.abs(loss - (0.3 + 0.2) / 4) < 1e-10)
+  equal(grads, expectedGrads)
+end
+
+-- nn.Replicate must tile a vector along the requested new dimension;
+-- checked against hand-built expected tensors, in both plain mode and
+-- batch mode (third constructor argument = number of input dims).
+function nntest.Replicate()
+   local vector = torch.rand(3)
+
+   local r1 = nn.Replicate(2, 1)
+   local r2 = nn.Replicate(2, 2)
+
+   local vOutput1 = r1:forward(vector):clone()
+   local vOutput2 = r2:forward(vector):clone()
+
+   local expected1 = torch.zeros(2, 3)
+   local expected2 = torch.zeros(3, 2)
+   expected1:select(1, 1):copy(vector)
+   expected1:select(1, 2):copy(vector)
+   expected2:select(2, 1):copy(vector)
+   expected2:select(2, 2):copy(vector)
+
+   mytester:assertTensorEq(vOutput1, expected1, precision, 'Wrong tiling of data when replicating vector.')
+   mytester:assertTensorEq(vOutput2, expected2, precision, 'Wrong tiling of data when replicating vector.')
+
+   -- batch mode: replication dim is offset past the leading batch dim
+   local vector = torch.rand(4,3)
+
+   local r1 = nn.Replicate(2, 1, 1)
+   local r2 = nn.Replicate(2, 2, 1)
+
+   local vOutput1 = r1:forward(vector):clone()
+   local vOutput2 = r2:forward(vector):clone()
+
+   local expected1 = torch.zeros(4, 2, 3)
+   local expected2 = torch.zeros(4, 3, 2)
+   expected1:select(2, 1):copy(vector)
+   expected1:select(2, 2):copy(vector)
+   expected2:select(3, 1):copy(vector)
+   expected2:select(3, 2):copy(vector)
+
+   mytester:assertTensorEq(vOutput1, expected1, precision, 'Wrong tiling of data when replicating batch vector.')
+   mytester:assertTensorEq(vOutput2, expected2, precision, 'Wrong tiling of data when replicating batch vector.')
+end
+
+-- Shared Jacobian/IO harness for the BatchNormalization family.
+-- `dim` is the number of feature dimensions beyond (batch, planes);
+-- `k` bounds the random sizes to keep the test fast. Runs the module
+-- in training and evaluation mode, with and without the affine transform.
+local function testBatchNormalization(moduleName, dim, k)
+   local planes = torch.random(1,k)
+   local size = { torch.random(2, k), planes }
+   for i=1,dim do
+      table.insert(size, torch.random(1,k))
+   end
+   local input = torch.zeros(table.unpack(size)):uniform()
+
+   local function jacTests(module, input, affine)
+      local err = jac.testJacobian(module,input)
+      mytester:assertlt(err,precision, 'error on state ')
+
+      if affine then
+         local err = jac.testJacobianParameters(module, input,
+                                            module.weight, module.gradWeight)
+         mytester:assertlt(err,precision, 'error on weight ')
+
+         local err = jac.testJacobianParameters(module, input,
+                                            module.bias, module.gradBias)
+         -- fixed: this message previously said 'error on weight '
+         mytester:assertlt(err,precision, 'error on bias ')
+
+         local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+         mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+         local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+         mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+         for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+            mytester:assertlt(err, precision, string.format(
+               'error on weight [%s]', t))
+         end
+
+         for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+            mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+         end
+      end
+
+      -- IO (serialization round-trip)
+      local ferr,berr = jac.testIO(module,input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+   end
+
+   -- with affine transform, in both training and evaluation mode
+   local module = nn[moduleName](planes)
+   module:training()
+   jacTests(module, input, true)
+   module:evaluate()
+   jacTests(module, input, true)
+
+   -- batch norm without affine transform
+   module = nn[moduleName](planes, 1e-5, 0.1, false)
+   module:training()
+   jacTests(module, input, false)
+   module:evaluate()
+   jacTests(module, input, false)
+end
+
+-- Instantiations of the shared batch-norm harness for the
+-- 0-D (feature), 2-D (spatial) and 3-D (volumetric) variants.
+function nntest.BatchNormalization()
+   testBatchNormalization('BatchNormalization', 0, 20)
+end
+
+function nntest.SpatialBatchNormalization()
+   testBatchNormalization('SpatialBatchNormalization', 2, 6)
+end
+
+function nntest.VolumetricBatchNormalization()
+   testBatchNormalization('VolumetricBatchNormalization', 3, 4)
+end
+
+-- Two stacked GradientReversal layers negate the gradient twice, so the
+-- composite must pass the Jacobian test like an identity module.
+function nntest.GradientReversal()
+   local d1 = math.random(3,5)
+   local d2 = math.random(3,5)
+   local d3 = math.random(3,5)
+   local input = torch.Tensor(d1,d2,d3):zero()
+   -- Two GradientReversal layers should cancel each other out
+   local net = nn.Sequential()
+   net:add(nn.GradientReversal())
+   net:add(nn.GradientReversal())
+
+   local err = jac.testJacobian(net, input, 0.1, 10)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   local ferr,berr = jac.testIO(net, input, 0.1, 10)
+   mytester:asserteq(ferr, 0, torch.typename(net) .. ' - i/o forward err ')
+   mytester:asserteq(berr, 0, torch.typename(net) .. ' - i/o backward err ')
+end
+
+-- nn.Padding forward should grow the padded dimension by |pad|; feeding
+-- the padded output back through backward must strip the padding and
+-- recover the original input exactly.
+function nntest.Padding()
+   local channels = math.random(1,3)
+   local width = math.random(4,16)
+   local height = math.random(4,16)
+   local pad = math.random(-3,3)
+   local index = math.random(1, channels)
+   local padValue = torch.randn(1):squeeze()
+   local module = nn.Padding(1, pad, 3, padValue, index)
+   local input = torch.rand(channels,height,width)
+
+   local expectedSize = input:size():totable()
+   expectedSize[1] = expectedSize[1] + math.abs(pad)
+
+   local output = module:forward(input)
+   mytester:assertTableEq(expectedSize, output:size():totable(), 0.00001, "Padding size error")
+
+   -- backward on the (padded) output should return the original gradient
+   local gradInput = module:backward(input, output)
+   mytester:assertTensorEq(gradInput, input, 0.00001, "Padding backward error")
+end
+
+-- Tests nn.utils.addSingletonDimension on a randomly permuted
+-- (non-contiguous) tensor: default dim 1, a random valid dim, rejection
+-- of an out-of-range dim, and the result-tensor-as-argument overload.
+function nntest.addSingletonDimension()
+   local dims = torch.random(5)
+   local size = torch.LongTensor(dims):random(10)
+   local perm = torch.randperm(dims):totable()
+   -- permute so strides are non-trivial; keep `size` in permuted order
+   local tensor = torch.Tensor(table.unpack(size:totable())):uniform():permute(table.unpack(perm))
+   size = torch.gather(size, 1, torch.LongTensor(perm))
+
+   local firstDim = nn.utils.addSingletonDimension(tensor)
+   mytester:assertTableEq(firstDim:size():totable(), {1, table.unpack(size:totable())},
+                          "wrong size for singleton dimension 1")
+   mytester:assertTensorEq(firstDim[1], tensor, 0,
+                           "wrong content for singleton dimension 1")
+
+   local dim = torch.random(dims + 1)
+   local result = nn.utils.addSingletonDimension(tensor, dim)
+   local resultSize = size:totable()
+   table.insert(resultSize, dim, 1)
+   mytester:assertTableEq(result:size():totable(), resultSize,
+                          "wrong size for random singleton dimension")
+   mytester:assertTensorEq(result:select(dim, 1), tensor, 0,
+                           "wrong content for random singleton dimension")
+
+   mytester:assertError(function() nn.utils.addSingletonDimension(tensor, dims + 2) end,
+                        "invalid dimension not detected")
+
+   -- passing output tensor as argument
+   -- fixed: messages previously misspelled "dimension" as "dimention"
+   local resultArg = torch.Tensor()
+   local resultR = nn.utils.addSingletonDimension(resultArg, tensor, dim)
+   mytester:eq(resultArg:size():totable(), resultSize,
+               'wrong content for random singleton dimension '..
+               'when the result is passed as argument')
+   mytester:eq(resultArg, result, 'wrong content for random singleton dimension '..
+               'when the result is passed as argument')
+
+   mytester:eq(resultR == resultArg, true,
+               'new tensor is created when it should use the provided tensor')
+end
+
+-- Jacobian check for SpatialReflectionPadding with random (possibly
+-- negative, i.e. cropping) pad amounts on a random 4D batch.
+function nntest.SpatialReflectionPadding()
+   local batch, plane = math.random(1,3), math.random(1,3)
+   local sizeY, sizeX = math.random(7,16), math.random(7,16)
+   local padL, padR = math.random(-3,3), math.random(-3,3)
+   local padT, padB = math.random(-3,3), math.random(-3,3)
+   local jac = nn.Jacobian
+   local module = nn.SpatialReflectionPadding(padL, padR, padT, padB)
+   local input = torch.rand(batch, plane, sizeY, sizeX)
+   mytester:assertalmosteq(jac.testJacobian(module, input), 0.0, 1e-7)
+end
+
+-- Jacobian check for SpatialReplicationPadding with random (possibly
+-- negative, i.e. cropping) pad amounts on a random 4D batch.
+function nntest.SpatialReplicationPadding()
+   local batch, plane = math.random(1,3), math.random(1,3)
+   local sizeY, sizeX = math.random(7,16), math.random(7,16)
+   local padL, padR = math.random(-3,3), math.random(-3,3)
+   local padT, padB = math.random(-3,3), math.random(-3,3)
+   local jac = nn.Jacobian
+   local module = nn.SpatialReplicationPadding(padL, padR, padT, padB)
+   local input = torch.rand(batch, plane, sizeY, sizeX)
+   mytester:assertalmosteq(jac.testJacobian(module, input), 0.0, 1e-7)
+end
+
+-- Tests Module:type()/:float() sharing semantics: empty tensors are
+-- converted; tensor and storage sharing is preserved (including
+-- overlapping narrow views of one storage); a shared tensorCache keeps
+-- sharing across separate :type() calls and across networks passed to
+-- recursiveType as a table; and modules overriding :type() survive a cast.
+function nntest.Typecast()
+  local function make_network()
+    local seq = nn.Sequential()
+    seq:add(nn.Linear(15, 10))
+    seq:add(nn.Linear(15, 10))
+    seq.modules[1].bias:fill(1)
+    seq.modules[2].bias:fill(2)
+    return seq
+  end
+
+  -- make sure that the typecasts aren't nops
+  assert(torch.getdefaulttensortype() == 'torch.DoubleTensor')
+
+  -- basic net
+  local net = make_network()
+  net.modules[1].empty_tensor = torch.Tensor()
+  net:float()
+  assert(net.modules[1].bias:type() == 'torch.FloatTensor',
+      net.modules[1].bias:type())
+  assert(net.modules[1].empty_tensor:type() == 'torch.FloatTensor')
+  assert(net.modules[1].bias ~= net.modules[2].bias)
+  net.modules[1].bias:fill(3)
+  assert(net.modules[1].bias[1] == 3)
+  assert(net.modules[2].bias[1] == 2)
+
+  -- shared tensors remain shared
+  local net = make_network()
+  net.modules[2].bias = net.modules[1].bias
+  net:float()
+  assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+  assert(net.modules[1].bias == net.modules[2].bias)
+  assert(net.modules[1].bias[1] == 1)
+
+  -- shared storages remain shared
+  local net = make_network()
+  net.modules[2].bias:set(net.modules[1].bias)
+  local net = net:float()
+  assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+  assert(net.modules[1].bias ~= net.modules[2].bias)
+  net.modules[1].bias:fill(3)
+  assert(net.modules[1].bias[1] == 3)
+  assert(net.modules[2].bias[1] == 3)
+
+  -- tricky: overlapping views on the same storage are preserved
+  local net = make_network()
+  local overlap_storage = torch.Tensor(15):fill(1)
+  net.modules[1].bias = overlap_storage:narrow(1, 1, 10)
+  net.modules[2].bias = overlap_storage:narrow(1, 6, 10)
+  net:float()
+  assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+  assert(net.modules[1].bias ~= net.modules[2].bias)
+  net.modules[1].bias:fill(3)
+  assert(net.modules[1].bias[1] == 3)
+  assert(net.modules[2].bias[1] == 3)
+  assert(net.modules[2].bias[6] == 1) -- only the first 5 elements overlapped
+
+  -- check recursiveType on a table
+  -- without a shared tensorCache, sharing across nets is NOT preserved
+  local net1 = make_network()
+  local net2 = make_network()
+  net2.modules[1].bias:set(net1.modules[1].bias)
+  net1:float()
+  net2:float()
+  net1.modules[1].bias:fill(3)
+  assert(net2.modules[1].bias[1] == 1)
+
+  -- with a shared tensorCache, sharing across nets IS preserved
+  local net1 = make_network()
+  local net2 = make_network()
+  net2.modules[1].bias:set(net1.modules[1].bias)
+
+  local tensorCache = {}
+  net1:type('torch.FloatTensor', tensorCache)
+  net2:type('torch.FloatTensor', tensorCache)
+  net1.modules[1].bias:fill(3)
+  assert(net2.modules[1].bias[1] == 3)
+
+  local net1 = make_network()
+  local net2 = make_network()
+  net2.modules[1].bias:set(net1.modules[1].bias)
+
+  nn.utils.recursiveType({net1, net2}, 'torch.FloatTensor')
+  net1.modules[1].bias:fill(3)
+  assert(net2.modules[1].bias[1] == 3)
+
+  -- smoke test some modules with custom type methods
+  local custom_type_modules = {
+    nn.MixtureTable(3),
+    nn.ConcatTable(),
+    nn.Copy(),
+    nn.Copy(nil, nil, nil, true),
+    nn.SpatialContrastiveNormalization(),
+    nn.DotProduct(),
+    nn.PairwiseDistance(1),
+    nn.SpatialDivisiveNormalization(),
+    nn.SpatialSubtractiveNormalization()
+  }
+  for _, module in ipairs(custom_type_modules) do
+    module:float()
+  end
+end
+
+-- apply() must recurse into nested containers and visit every module,
+-- including those inside an inner Sequential.
+function nntest.Module_apply()
+  local outer = nn.Sequential()
+  outer:add(nn.Linear(10,10))
+  local inner = nn.Sequential()
+  inner:add(nn.Linear(10,5))
+  outer:add(inner)
+  outer:add(nn.Tanh())
+
+  local linearsVisited = 0
+  outer:apply(function(m)
+    if torch.type(m) == 'nn.Linear' then
+      m.bias:resize(20)
+      linearsVisited = linearsVisited + 1
+    end
+  end)
+  mytester:asserteq(linearsVisited, 2)
+  mytester:asserteq(outer.modules[1].bias:size(1), 20)
+  mytester:asserteq(inner.modules[1].bias:size(1), 20)
+end
+
+-- replace() must swap modules both inside a container and when invoked
+-- directly on a standalone module.
+function nntest.Module_replace()
+   -- test replace in container
+   local net = nn.Sequential()
+   net:add(nn.Linear(10,10))
+   net:add(nn.Sigmoid())
+   net:replace(function(m)
+      if torch.type(m) == 'nn.Sigmoid' then
+         return nn.Tanh()
+      end
+      return m
+   end)
+   -- test replace of a single module
+   local standalone = nn.Tanh()
+   local swapped = standalone:replace(function(m)
+      if torch.type(m) == 'nn.Tanh' then
+         return nn.Sigmoid()
+      end
+      return m
+   end)
+   mytester:asserteq(torch.type(net:get(2)), 'nn.Tanh', 'replace in container')
+   mytester:asserteq(torch.type(swapped), 'nn.Sigmoid', 'replace in single module')
+end
+
+-- Tests nn.Cosine against hand-computed cosine-similarity outputs and
+-- elementwise gradient formulas, in 1D (single sample) mode, then checks
+-- that 2D (mini-batch) mode matches per-sample evaluation on a clone.
+function nntest.Cosine()
+   local inputSize = 4
+   local outputSize = 5
+
+   -- test 1D
+   local input = torch.randn(inputSize)
+   local gradOutput = torch.randn(outputSize)
+   local cosine = nn.Cosine(inputSize,outputSize)
+   local output = cosine:forward(input)
+   local inputNorm = input:norm()+1e-12
+   local weight2 = cosine.weight[2]
+   local output2 = torch.dot(weight2, input)/((weight2:norm()+1e-12)*inputNorm)
+   mytester:assert(math.abs(output2 - output[2]) < 0.000001,"Cosine output 1D err weight[2]")
+   local output2 = torch.mv(cosine.weight, input)
+   output2:cdiv(cosine.weight:norm(2,2)+1e-12):div(inputNorm)
+   mytester:assertTensorEq(output, output2, 0.000001, "Cosine output 1D err")
+   local gradInput = cosine:updateGradInput(input, gradOutput)
+   local gradInput2 = gradInput:clone():zero()
+   for j=1,outputSize do
+      local w_j = cosine.weight[j]
+      local nw_j = w_j:norm()+1e-12
+      for i=1,inputSize do
+         local w_ij = w_j[i]
+         -- d(cos_j)/d(x_i) = w_ij/(|x||w_j|) - cos_j * x_i / |x|^2
+         local grad_i = (w_ij/(inputNorm*nw_j))
+         grad_i = grad_i - (output[j]*input[i]/(inputNorm*inputNorm))
+         grad_i = grad_i * gradOutput[j]
+         gradInput2[i] = gradInput2[i] + grad_i
+      end
+   end
+   mytester:assertTensorEq(gradInput2, gradInput, 0.000001, "Cosine gradInput 1D err")
+   cosine:zeroGradParameters()
+   cosine:accGradParameters(input, gradOutput, 1)
+   local gradWeight2 = cosine.weight:clone():zero()
+   for j=1,outputSize do
+      local w_j = cosine.weight[j]
+      local nw_j = w_j:norm()+1e-12
+      for i=1,inputSize do
+         local w_ij = w_j[i]
+         local gW_ij = (gradOutput[j]/nw_j)  * ( ( input[i] / inputNorm ) - (output[j] * w_ij / nw_j) )
+         gradWeight2[{j,i}] = gW_ij
+      end
+   end
+   -- fixed: this message previously said "2D" inside the 1D section
+   mytester:assertTensorEq(cosine.gradWeight, gradWeight2, 0.000001, "Cosine gradWeight 1D err")
+
+   -- test 2D: a clone evaluated per-sample must reproduce the batch result
+   local batchSize = 3
+   local input = torch.randn(batchSize, inputSize)
+   local gradOutput = torch.randn(batchSize, outputSize)
+   cosine:zeroGradParameters()
+   local cosine2 = cosine:clone()
+   local output = cosine:forward(input)
+   local output2 = cosine2:forward(input[2])
+   mytester:assertTensorEq(output[2], output2, 0.000001, "Cosine output 2D err")
+   local gradInput = cosine:backward(input, gradOutput)
+
+   local gradInput2 = gradInput:clone():zero()
+   for i=1,batchSize do
+      -- fixed: forward takes only the input (gradOutput argument was spurious)
+      cosine2:forward(input[i])
+      gradInput2[i]:copy(cosine2:backward(input[i], gradOutput[i]))
+   end
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "Cosine gradInput 2D err")
+   mytester:assertTensorEq(cosine.gradWeight, cosine2.gradWeight, 0.000001, "Cosine gradWeight 2D err")
+end
+
+-- Forwarding a wrongly sized input through nested containers must raise
+-- an error whose message names every enclosing container level (module
+-- index + container type) plus the underlying 'size mismatch' reason.
+function nntest.ErrorHandling()
+   local l = nn.Linear(1, 1)
+   local p = nn.Parallel(1, 1):add(l)
+   local c = nn.Concat(1):add(p)
+   local model = nn.Sequential():add(nn.Identity()):add(c):add(nn.Identity())
+   local function errmsg(module, i)
+       return 'In ' .. i .. ' module of ' .. torch.type(module) .. ':\n'
+   end
+   local expected_err = errmsg(model, 2) .. errmsg(c, 1) .. errmsg(p, 1)
+   mytester:assertErrorObj(
+       function()
+           model:forward(torch.randn(1,2,2))
+       end,
+       function(err)
+           return err:find(expected_err) and err:find('size mismatch')
+       end,
+       "Failure expected or bad error message (missing information or reason)"
+   )
+end
+
+mytester:add(nntest)
+
+-- Deliberately global: the nntest functions above reference these names.
+jac = nn.Jacobian
+sjac = nn.SparseJacobian
+-- Entry point: nn.test([tests[, seed]]) runs the suite single-threaded
+-- with a reproducible (printed) seed, then restores the thread count.
+function nn.test(tests, seed)
+   -- Limit number of threads since everything is small
+   local nThreads = torch.getnumthreads()
+   torch.setnumthreads(1)
+   -- randomize stuff
+   local seed = seed or (1e5 * torch.tic())
+   print('Seed: ', seed)
+   math.randomseed(seed)
+   torch.manualSeed(seed)
+   mytester:run(tests)
+   torch.setnumthreads(nThreads)
+   return mytester
+end
diff --git a/utils.lua b/utils.lua
new file mode 100644
index 0000000..8f9c203
--- /dev/null
+++ b/utils.lua
@@ -0,0 +1,218 @@
+nn.utils = {}
+
+-- oops; someone forgot to add torch.Storage.type
+-- TODO replace with torch.Storage.type when implemented
+-- With no `type` argument this is a getter (returns the current typename);
+-- otherwise it returns `self` unchanged or a freshly converted storage.
+local function torch_Storage_type(self, type)
+   local current = torch.typename(self)
+   if not type then return current end
+   if type == current then
+      return self
+   end
+   -- different type requested: allocate a storage of that type and copy
+   local converted = torch.getmetatable(type).new()
+   if self:size() > 0 then
+      converted:resize(self:size()):copy(self)
+   end
+   return converted
+end
+
+-- tensorCache maintains a list of all tensors and storages that have been
+-- converted (recursively) by calls to recursiveType() and type().
+-- It caches conversions in order to preserve sharing semantics
+-- i.e. if two tensors share a common storage, then type conversion
+-- should preserve that.
+--
+-- You can preserve sharing semantics across multiple networks by
+-- passing tensorCache between the calls to type, e.g.
+--
+-- > tensorCache = {}
+-- > net1:type('torch.CudaTensor', tensorCache)
+-- > net2:type('torch.CudaTensor', tensorCache)
+-- > nn.utils.recursiveType(anotherTensor, 'torch.CudaTensor', tensorCache)
+--
+-- Implementation note: to make Lua table lookup behave correctly,
+-- tensor keys are stored as actual tensor objects, while storage
+-- keys are stored as the pointers themselves (as numbers).
+function nn.utils.recursiveType(param, type, tensorCache)
+   tensorCache = tensorCache or {}
+
+   if torch.type(param) == 'table' then
+      -- convert every entry in place, recursing into nested tables
+      for k, v in pairs(param) do
+         param[k] = nn.utils.recursiveType(v, type, tensorCache)
+      end
+   elseif torch.isTypeOf(param, 'nn.Module') or
+          torch.isTypeOf(param, 'nn.Criterion') then
+      -- modules/criterions convert themselves; pass the cache through so
+      -- sharing is preserved across the whole network
+      param:type(type, tensorCache)
+   elseif torch.isTensor(param) then
+      if torch.typename(param) ~= type then
+         local newparam
+         if tensorCache[param] then
+            -- this exact tensor was converted earlier: reuse that result
+            newparam = tensorCache[param]
+         else
+            newparam = torch.Tensor():type(type)
+            local storageType = type:gsub('Tensor','Storage')
+            if param:storage() then
+               -- storages are cached by raw pointer (see header comment), so
+               -- tensors that shared a storage keep sharing after conversion
+               local storage_key = torch.pointer(param:storage())
+               if not tensorCache[storage_key] then
+                  tensorCache[storage_key] = torch_Storage_type(
+                        param:storage(), storageType)
+               end
+               assert(torch.type(tensorCache[storage_key]) == storageType)
+               -- view the converted storage with the original geometry
+               newparam:set(
+                  tensorCache[storage_key],
+                  param:storageOffset(),
+                  param:size(),
+                  param:stride()
+               )
+            end
+            tensorCache[param] = newparam
+         end
+         assert(torch.type(newparam) == type)
+         param = newparam
+      end
+   end
+   return param
+end
+
+-- Resize t1 (a tensor or nested table of tensors) so its structure and
+-- shapes match t2's. Keys of t1 with no counterpart in t2 are dropped.
+-- Returns the possibly re-created t1 along with t2.
+function nn.utils.recursiveResizeAs(t1,t2)
+   if torch.type(t2) == 'table' then
+      if torch.type(t1) ~= 'table' then
+         t1 = {t1}
+      end
+      for key,_ in pairs(t2) do
+         t1[key], t2[key] = nn.utils.recursiveResizeAs(t1[key], t2[key])
+      end
+      -- prune entries of t1 absent from t2
+      for key,_ in pairs(t1) do
+         if not t2[key] then
+            t1[key] = nil
+         end
+      end
+   elseif torch.isTensor(t2) then
+      if not torch.isTensor(t1) then
+         t1 = t2.new()
+      end
+      t1:resizeAs(t2)
+   else
+      error("expecting nested tensors or tables. Got "..
+            torch.type(t1).." and "..torch.type(t2).." instead")
+   end
+   return t1, t2
+end
+
+-- Fill every tensor inside t2 (a tensor or nested table of tensors)
+-- with the scalar val; returns t2.
+function nn.utils.recursiveFill(t2, val)
+   if torch.isTensor(t2) then
+      t2:fill(val)
+   elseif torch.type(t2) == 'table' then
+      for key,_ in pairs(t2) do
+         t2[key] = nn.utils.recursiveFill(t2[key], val)
+      end
+   else
+      error("expecting tensor or table thereof. Got "
+           ..torch.type(t2).." instead")
+   end
+   return t2
+end
+
+-- recursiveAdd(t1, [val], t2): in-place t1 = t1 + val * t2, recursing into
+-- nested tables of tensors. The scalar multiplier val is optional and
+-- defaults to 1; the two-argument form is recursiveAdd(t1, t2).
+function nn.utils.recursiveAdd(t1, val, t2)
+   if not t2 then
+      -- two-argument call: the second argument is really t2
+      assert(val, "expecting at least two arguments")
+      t2 = val
+      val = 1
+   end
+   -- covers the explicit three-argument call with a nil multiplier
+   val = val or 1
+   if torch.type(t2) == 'table' then
+      t1 = (torch.type(t1) == 'table') and t1 or {t1}
+      for key,_ in pairs(t2) do
+         t1[key], t2[key] = nn.utils.recursiveAdd(t1[key], val, t2[key])
+      end
+   elseif torch.isTensor(t1) and torch.isTensor(t2) then
+      t1:add(val, t2)
+   else
+      error("expecting nested tensors or tables. Got "..
+            torch.type(t1).." and "..torch.type(t2).." instead")
+   end
+   return t1, t2
+end
+
+-- Deep-copy t2 (a tensor or nested table of tensors) into t1, creating or
+-- resizing pieces of t1 as needed. Returns t1, t2.
+function nn.utils.recursiveCopy(t1,t2)
+   if torch.type(t2) == 'table' then
+      if torch.type(t1) ~= 'table' then
+         t1 = {t1}
+      end
+      for key,_ in pairs(t2) do
+         t1[key], t2[key] = nn.utils.recursiveCopy(t1[key], t2[key])
+      end
+   elseif torch.isTensor(t2) then
+      if not torch.isTensor(t1) then
+         t1 = t2.new()
+      end
+      t1:resizeAs(t2):copy(t2)
+   else
+      error("expecting nested tensors or tables. Got "..
+            torch.type(t1).." and "..torch.type(t2).." instead")
+   end
+   return t1, t2
+end
+
+-- addSingletonDimension([view,] t, dim): return a view of tensor t with an
+-- extra size-1 dimension inserted at position dim (default 1). When `view`
+-- is supplied it is reused as the output tensor. No data is copied: the
+-- result shares t's storage.
+function nn.utils.addSingletonDimension(...)
+  local view, t, dim
+  if select('#',...) < 3 then
+    -- called as (t [, dim]); the output tensor is allocated lazily below
+    t, dim = select(1,...)
+  else
+    view, t, dim = select(1,...)
+    assert(torch.isTensor(view),
+           "output tensor expected, got " .. type(view))
+  end
+
+  assert(torch.isTensor(t), "input tensor expected")
+  dim = dim or 1
+  assert(dim > 0 and dim <= (t:dim() + 1), "invalid dimension: " .. dim
+             .. '. Tensor is of ' .. t:dim() .. ' dimensions.')
+
+  view = view or t.new()
+  local size = torch.LongStorage(t:dim() + 1)
+  local stride = torch.LongStorage(t:dim() + 1)
+
+  -- dimensions before the insertion point are unchanged
+  for d = 1, dim - 1 do
+    size[d] = t:size(d)
+    stride[d] = t:stride(d)
+  end
+  -- the inserted singleton (stride of a size-1 dim never advances)
+  size[dim] = 1
+  stride[dim] = 1
+  -- remaining dimensions shift up by one
+  for d = dim + 1, t:dim() + 1 do
+    size[d] = t:size(d - 1)
+    stride[d] = t:stride(d - 1)
+  end
+
+  -- install the shared storage with the new geometry
+  view:set(t:storage(), t:storageOffset(), size, stride)
+  return view
+end
+
+-- View `input` with the given sizes into `output` (allocated if nil),
+-- first materializing a contiguous copy when a direct view is impossible.
+function nn.utils.contiguousView(output, input, ...)
+  output = output or input.new()
+  if not input:isContiguous() then
+    -- non-contiguous: copy into output, then view the copy
+    output:resizeAs(input)
+    output:copy(input)
+    output:view(output, ...)
+  else
+    output:view(input, ...)
+  end
+  return output
+end
+
+-- go over specified fields and clear them. accepts
+-- nn.utils.clearState(self, {'_buffer', '_buffer2'}) and
+-- nn.utils.clearState(self, '_buffer', '_buffer2')
+function nn.utils.clear(self, ...)
+   local fields = {...}
+   if #fields > 0 and type(fields[1]) == 'table' then
+      fields = fields[1]
+   end
+   for _, name in ipairs(fields) do
+      local value = self[name]
+      if value then
+         if torch.isTensor(value) then
+            -- detach the tensor's storage but keep the tensor object
+            value:set()
+         elseif type(value) == 'table' then
+            self[name] = {}
+         else
+            self[name] = nil
+         end
+      end
+   end
+   return self
+end
+
+-- Lua 5.1 compatibility: expose the global `unpack` as `table.unpack`
+table.unpack = table.unpack or unpack

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git



More information about the debian-science-commits mailing list