[lua-torch-nn] 02/04: New upstream version 0~20170521-g84b4350+dfsg

Zhou Mo cdluminate-guest at moszumanska.debian.org
Mon May 22 03:42:47 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 39b264b97438849c25b893bb36dc6ca397a46296
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Mon May 22 03:40:56 2017 +0000

    New upstream version 0~20170521-g84b4350+dfsg
---
 Bottle.lua                                      |  16 +-
 ClassNLLCriterion.lua                           |  15 +-
 Decorator.lua                                   |  47 ++
 DontCast.lua                                    | 124 ++++
 IndexLinear.lua                                 | 398 +++++++++++++
 Linear.lua                                      |   6 +-
 LinearWeightNorm.lua                            | 168 ++++++
 MapTable.lua                                    |  22 +-
 NaN.lua                                         |  72 +++
 Profile.lua                                     |  55 ++
 README.md                                       |   2 +-
 SelectTable.lua                                 |   4 +-
 SpatialDepthWiseConvolution.lua                 | 139 +++++
 Sum.lua                                         |  88 +--
 Transpose.lua                                   |  13 +-
 WeightNorm.lua                                  |  61 +-
 doc/containers.md                               | 150 ++++-
 doc/convolution.md                              |  46 ++
 doc/criterion.md                                |  26 +-
 doc/simple.md                                   |  92 ++-
 doc/table.md                                    |   2 +-
 init.lua                                        |   8 +
 lib/THNN/generic/ClassNLLCriterion.c            |  48 +-
 lib/THNN/generic/FusedRNNKernel.c               |  53 ++
 lib/THNN/generic/HardTanh.c                     |   4 +-
 lib/THNN/generic/IndexLinear.c                  | 742 ++++++++++++++++++++++++
 lib/THNN/generic/Linear.c                       |  23 +-
 lib/THNN/generic/PReLU.c                        |  94 +--
 lib/THNN/generic/Sigmoid.c                      |   8 +-
 lib/THNN/generic/SparseLinear.c                 |  10 +-
 lib/THNN/generic/SpatialConvolutionLocal.c      |  94 ++-
 lib/THNN/generic/SpatialConvolutionMM.c         |  75 +--
 lib/THNN/generic/SpatialConvolutionMap.c        |  30 +-
 lib/THNN/generic/SpatialDepthWiseConvolution.c  | 519 +++++++++++++++++
 lib/THNN/generic/SpatialDilatedConvolution.c    |   9 +
 lib/THNN/generic/SpatialDilatedMaxPooling.c     |   6 +-
 lib/THNN/generic/SpatialFullConvolution.c       |   9 +
 lib/THNN/generic/SpatialFullConvolutionMap.c    |   2 +
 lib/THNN/generic/SpatialSubSampling.c           |   2 +
 lib/THNN/generic/THNN.h                         | 131 ++++-
 lib/THNN/generic/Tanh.c                         |   5 +-
 lib/THNN/generic/TemporalConvolution.c          |  31 +-
 lib/THNN/generic/TemporalMaxPooling.c           |   6 +-
 lib/THNN/generic/TemporalRowConvolution.c       |  18 +-
 lib/THNN/generic/TemporalSubSampling.c          |   7 +-
 lib/THNN/generic/VolumetricConvolutionMM.c      |  17 +-
 lib/THNN/generic/VolumetricDilatedConvolution.c |   8 +
 lib/THNN/generic/VolumetricDilatedMaxPooling.c  |  17 +-
 lib/THNN/generic/VolumetricFullConvolution.c    |   9 +
 lib/THNN/generic/unfold.c                       | 112 ++--
 lib/THNN/init.c                                 |   9 +
 rocks/nn-scm-1.rockspec                         |   3 +-
 test.lua                                        | 533 ++++++++++++++++-
 test/benchmarks/IndexLinear.lua                 | 323 +++++++++++
 54 files changed, 4103 insertions(+), 408 deletions(-)

diff --git a/Bottle.lua b/Bottle.lua
index 683935c..6dee432 100644
--- a/Bottle.lua
+++ b/Bottle.lua
@@ -1,21 +1,19 @@
-local Bottle, parent = torch.class("nn.Bottle", "nn.Container")
+local Bottle, parent = torch.class("nn.Bottle", "nn.Decorator")
 local unpack = unpack or table.unpack
 
 function Bottle:__init(module, nInputDim, nOutputDim)
-   parent.__init(self)
+   parent.__init(self, module)
    self.nInputDim = nInputDim or 2
    self.nOutputDim = nOutputDim or self.nInputDim
    self.dimDelta = self.nInputDim - self.nOutputDim
    -- Used to reshape the gradients
    self.inShape = torch.Tensor(self.nInputDim)
    self.outShape = torch.Tensor(self.nOutputDim)
-   -- add module to modules
-   self.modules[1] = module
 end
 
 function Bottle:updateOutput(input)
    -- first batchDims dimensions will be fused
-   local batchDims = input:dim() - self.nInputDim + 1 
+   local batchDims = input:dim() - self.nInputDim + 1
    -- see if bottle is required
    if batchDims > 1 then
       -- bottle the first dims
@@ -28,7 +26,7 @@ function Bottle:updateOutput(input)
       local output = self.modules[1]:updateOutput(newInput)
       assert(output:dim() == self.nOutputDim,
 	     "Wrong number of output dims on module. Expected: " ..
-		self.nOutputDim .. ' but got ' .. 
+		self.nOutputDim .. ' but got ' ..
 		tostring(output and output:dim()))
       self.outShape:copy(torch.LongTensor(output:size()))
       if math.abs(self.dimDelta) > 0 then
@@ -56,7 +54,7 @@ function Bottle:updateGradInput(input, gradOutput)
       end
    else
       if self.modules[1].gradInput then
-         self.gradInput:set(self.modules[1]:updateGradInput(input))
+         self.gradInput:set(self.modules[1]:updateGradInput(input, gradOutput))
       else
          self.gradInput = nil
       end
@@ -67,7 +65,7 @@ end
 function Bottle:accGradParameters(input, gradOutput, scale)
    if input:dim() > self.nInputDim then
       input = input:view(unpack(self.inShape:totable()))
-      gradOutput = gradOutput:view(unpack(self.outShape:totable()))      
+      gradOutput = gradOutput:view(unpack(self.outShape:totable()))
    end
-   self.modules[1]:accGradParameters(input, gradOutput, scale)   
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
 end
diff --git a/ClassNLLCriterion.lua b/ClassNLLCriterion.lua
index d89f439..dae0e66 100644
--- a/ClassNLLCriterion.lua
+++ b/ClassNLLCriterion.lua
@@ -1,13 +1,10 @@
 local THNN = require 'nn.THNN'
 local ClassNLLCriterion, parent = torch.class('nn.ClassNLLCriterion', 'nn.Criterion')
 
-function ClassNLLCriterion:__init(weights, sizeAverage)
+function ClassNLLCriterion:__init(weights, sizeAverage, ignoreIndex)
     parent.__init(self)
-    if sizeAverage ~= nil then
-       self.sizeAverage = sizeAverage
-    else
-       self.sizeAverage = true
-    end
+    self.sizeAverage = (sizeAverage == nil) and true or sizeAverage
+    self.ignoreIndex = ignoreIndex or -100 -- this target index will be ignored
     if weights then
        assert(weights:dim() == 1, "weights input should be 1-D Tensor")
        self.weights = weights
@@ -47,7 +44,8 @@ function ClassNLLCriterion:updateOutput(input, target)
       self.output_tensor:cdata(),
       self.sizeAverage,
       THNN.optionalTensor(self.weights),
-      self.total_weight_tensor:cdata()
+      self.total_weight_tensor:cdata(),
+      self.ignoreIndex
    )
    self.output = self.output_tensor[1]
    return self.output, self.total_weight_tensor[1]
@@ -76,7 +74,8 @@ function ClassNLLCriterion:updateGradInput(input, target)
       self.gradInput:cdata(),
       self.sizeAverage,
       THNN.optionalTensor(self.weights),
-      self.total_weight_tensor:cdata()
+      self.total_weight_tensor:cdata(),
+      self.ignoreIndex
    )
 
    return self.gradInput
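
[A minimal usage sketch of the new `ignoreIndex` constructor argument (default -100); samples whose target equals it contribute neither to the loss nor to the gradient. Illustrative only, not part of the commit:]

```lua
local nn = require 'nn'
-- weights=nil, sizeAverage=true, ignoreIndex=0
local crit = nn.ClassNLLCriterion(nil, true, 0)
local logprobs = nn.LogSoftMax():forward(torch.randn(4, 10))
local target = torch.LongTensor{3, 0, 7, 1} -- the second sample is ignored
local loss = crit:forward(logprobs, target)
local gradInput = crit:backward(logprobs, target)
```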
diff --git a/Decorator.lua b/Decorator.lua
new file mode 100644
index 0000000..05fb4db
--- /dev/null
+++ b/Decorator.lua
@@ -0,0 +1,47 @@
+local Decorator, parent = torch.class("nn.Decorator", "nn.Container")
+
+function Decorator:__init(module)
+   parent.__init(self)
+   -- so that it can be handled like a Container
+   self.modules[1] = module
+end
+
+function Decorator:updateOutput(input)
+   self.output = self.modules[1]:updateOutput(input)
+   return self.output
+end
+
+function Decorator:updateGradInput(input, gradOutput)
+   self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   return self.gradInput
+end
+
+function Decorator:accGradParameters(input, gradOutput, scale)
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
+
+function Decorator:accUpdateGradParameters(input, gradOutput, lr)
+   self.modules[1]:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Decorator:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   self.modules[1]:sharedAccUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Decorator:__tostring__()
+   if self.modules[1].__tostring__ then
+      return torch.type(self) .. ' @ ' .. self.modules[1]:__tostring__()
+   else
+      return torch.type(self) .. ' @ ' .. torch.type(self.modules[1])
+   end
+end
+
+-- useful for multiple-inheritance
+function Decorator.decorate(class)
+   class.updateOutput = nn.Decorator.updateOutput
+   class.updateGradInput = nn.Decorator.updateGradInput
+   class.accGradParameters = nn.Decorator.accGradParameters
+   class.accUpdateGradParameters = nn.Decorator.accUpdateGradParameters
+   class.sharedAccUpdateGradParameters = nn.Decorator.sharedAccUpdateGradParameters
+   class.__tostring__ =  nn.Decorator.__tostring__
+end
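
[`nn.Decorator` simply forwards every call to `self.modules[1]`; subclasses override only the methods they care about. A sketch of a custom decorator, using a hypothetical class name `nn.CountCalls`:]

```lua
-- Counts forward passes through the wrapped module.
local CountCalls, parent = torch.class('nn.CountCalls', 'nn.Decorator')

function CountCalls:__init(module)
   parent.__init(self, module)
   self.nCalls = 0
end

function CountCalls:updateOutput(input)
   self.nCalls = self.nCalls + 1
   return parent.updateOutput(self, input) -- delegate to the decorated module
end
```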
diff --git a/DontCast.lua b/DontCast.lua
new file mode 100644
index 0000000..b89f543
--- /dev/null
+++ b/DontCast.lua
@@ -0,0 +1,124 @@
+local DontCast, parent = torch.class("nn.DontCast", "nn.Decorator")
+
+-- utility functions
+
+local function recursiveTypeCopy(dst, src, type_str)
+   if torch.type(src) == 'table' then
+      dst = (torch.type(dst) == 'table') and dst or {}
+      for k, v in pairs(src) do
+         dst[k] = recursiveTypeCopy(dst[k], v, type_str)
+      end
+   elseif torch.isTensor(src) then
+      dst = (torch.type(dst) == type_str) and dst or torch.getmetatable(type_str).new()
+      dst:resize(src:size())
+      if src:nElement() > 0 then
+         dst:copy(src)
+      end
+   end
+   return dst
+end
+
+local function tableTensorType(src)
+   if type(src) == 'table' then
+      local type_str, found
+      for k,v in pairs(src) do
+         type_str, found = tableTensorType(v)
+         if found then
+            return type_str, true
+         end
+      end
+      return type_str, found
+   else
+      return torch.type(src), torch.isTensor(src)
+   end
+end
+
+-- DontCast methods and constructor
+
+function DontCast:__init(module, castin, castout, moduleType)
+   parent.__init(self, module)
+   self.castin = castin
+   self.castout = (castout == nil) and castin or castout
+   self.moduleType = moduleType
+   if (self.castin or self.castout) and not self.moduleType then
+      local moduleType, found = tableTensorType(module.output)
+      if found then
+         self.moduleType = moduleType
+      else
+         moduleType, found = tableTensorType(module:parameters())
+         if found then
+            self.moduleType = moduleType
+         else
+            error"Cannot extrapolate moduleType. Provide constructor argument 4"
+         end
+      end
+   end
+end
+
+function DontCast:updateOutput(input)
+   if self.castin and tableTensorType(input) ~= self.moduleType then
+      self._input = recursiveTypeCopy(self._input, input, self.moduleType)
+      input = self._input
+   end
+
+   local output = self.modules[1]:updateOutput(input)
+
+   if self.castout then
+      self.output = recursiveTypeCopy(self.output, output, tableTensorType(self.output))
+   else
+      self.output = output
+   end
+   return self.output
+end
+
+function DontCast:updateGradInput(input, gradOutput)
+   if self.castin and tableTensorType(input) ~= self.moduleType then
+      input = self._input
+   end
+   if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+      self._gradOutput = recursiveTypeCopy(self._gradOutput, gradOutput, self.moduleType)
+      gradOutput = self._gradOutput
+   end
+
+   local gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+
+   if self.castin then
+      self.gradInput = recursiveTypeCopy(self.gradInput, gradInput, tableTensorType(self.gradInput))
+   else
+      self.gradInput = gradInput
+   end
+   return self.gradInput
+end
+
+function DontCast:accGradParameters(input, gradOutput, scale)
+   if self.castin and tableTensorType(input) ~= self.moduleType then
+      input = self._input
+   end
+   if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+      gradOutput = self._gradOutput
+   end
+
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
+
+function DontCast:accUpdateGradParameters(input, gradOutput, lr)
+   if self.castin and tableTensorType(input) ~= self.moduleType then
+      input = self._input
+   end
+   if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+      gradOutput = self._gradOutput
+   end
+
+   self.modules[1]:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- dont cast (the essence thereof)
+function DontCast:type(type)
+   if self.castout and tableTensorType(self.output) ~= type then
+      self.output = recursiveTypeCopy(nil, self.output, type)
+   end
+   if self.castin and tableTensorType(self.gradInput) ~= type then
+      self.gradInput = recursiveTypeCopy(nil, self.gradInput, type)
+   end
+   return self
+end
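
[With `castin`/`castout` enabled, the decorator converts tensors at its boundary while the wrapped module keeps its own type. A sketch under that assumption:]

```lua
-- Keep an inner float module inside a double-typed network.
local inner = nn.Linear(3, 4):float()
local wrapped = nn.DontCast(inner, true, true, 'torch.FloatTensor')
wrapped:double()                             -- the inner Linear stays float
local out = wrapped:forward(torch.randn(3))  -- double in, double out
```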
diff --git a/IndexLinear.lua b/IndexLinear.lua
new file mode 100644
index 0000000..2ddbcbd
--- /dev/null
+++ b/IndexLinear.lua
@@ -0,0 +1,398 @@
+local ffi  = require 'ffi'
+local IndexLinear, parent = torch.class('nn.IndexLinear', 'nn.Module')
+
+
+
+function IndexLinear:__init(inputSize, outputSize, doGradInput, keysOffset, weight, bias, normalize)
+   parent.__init(self)
+
+   -- We need 3 extra parameters per feature
+   -- if we normalize:
+   -- * The max-abs value
+   -- * The inverse of the max-abs value
+   -- * The per-feature bias
+   -- We keep an extra placeholder for further per learning rate feature manipulation.
+   -- So it's 4 total.
+   self.normalize = normalize and 4 or 0
+
+   -- This is important to keep the possibility of sharing a weight
+   -- directly, without having to allocate it first.
+   -- The reason is these weights can be very large.
+   self.weight = weight or torch.Tensor(inputSize, outputSize + self.normalize):zero()
+   self.bias = bias or torch.Tensor(outputSize):zero()
+   self.inputSize = self.weight and self.weight:size(1) or inputSize
+   self.outputSize = self.weight and (self.weight:size(2)-self.normalize) or outputSize
+
+   -- gradWeight is not initialized as we're doing dense gradient accumulation
+   -- This is more efficient and avoids allocating a giant useless gradWeight
+   self.gradWeight = torch.Tensor()
+
+   -- gradBias still works the same as it's already dense
+   self.gradBias = torch.Tensor(self.outputSize):zero()
+
+   -- Buffers
+   self.gradWeightBuffer = torch.Tensor()
+   self.valuesBuffer = torch.Tensor()
+   self.normalizedValues = torch.Tensor()
+
+   -- This is used to accumulate keys and gradWeight
+   -- when accumulating gradients
+   self.running = {
+      cumSumSizes = {},
+      keys = {},
+      gradWeight = {},
+      counter = 1,
+   }
+
+   -- self.sizes, self.cumSumSizes are calculated on the CPU even when using CUDA.
+   -- These two tables make it easier to resize these buffers instead of re-allocating them.
+   -- self.*Cache[1] always contains values on CPU.
+   -- If CUDA is being used, self.*Cache[2] contains values on GPU.
+   self.sizesCache = {}
+   self.cumSumSizesCache = {}
+
+   -- A few options
+   self.weightDecay = 0
+   self.doGradInput = doGradInput or false
+   self.offset = keysOffset and keysOffset-1 or -1 -- self.offset is added to the incoming indices
+end
+
+-- Reset all the parameters needed
+-- for normalization to 0
+function IndexLinear:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1./math.sqrt(self.weight:size(2))
+   end
+   self.weight:uniform(-stdv, stdv)
+   self.bias:uniform(-stdv, stdv):mul(0.000001)
+   if self.normalize and self.normalize > 0 then
+      self.weight[{{}, {1,self.normalize}}]:zero()
+   end
+end
+
+function IndexLinear:reshapeInput(input)
+   assert(type(input) == 'table')
+
+   local ninputs = 0
+   for _, v in ipairs(input) do
+      ninputs = ninputs + 1
+   end
+
+   assert(ninputs == 2 or ninputs == 3)
+
+   -- If format is:
+   -- {
+   --   torch.LongTensor(size1+size2+...+sizeN), -- concatenated batch of keys
+   --   torch.Tensor(size1+size2+...+sizeN), -- concatenated batch of values
+   --   torch.LongTensor(N), -- keys/values sizes (values are {size1, ..., sizeN})
+   -- }
+   if ninputs == 3 then
+      local fkeys = input[1]
+      local fvals = input[2]
+      local fsizes = torch.isTensor(input[3]) and input[3] or fkeys.new{input[3]}
+      assert(fkeys:nElement() == fvals:nElement(), 'Keys and values should be of same size')
+      assert(fkeys:dim() == 1, 'Keys and values should be 1D')
+      self.isFlat = true
+      self.noBatch = false
+      return fkeys, fvals, fsizes
+   end
+
+   local keys = input[1]
+   local values = input[2]
+   local lkeys, lvalues
+
+   -- If format is:
+   -- {
+   --   { torch.LongTensor(size1), torch.LongTensor(size2), ..., torch.LongTensor(sizeN) }, -- batch of keys
+   --   { torch.Tensor(size1), torch.Tensor(size2), ..., torch.Tensor(sizeN) }, -- batch of values,
+   -- }
+   if type(keys) == 'table' and type(values) == 'table' then
+      lkeys, lvalues = keys, values
+      self.isFlat = false
+      self.noBatch = false
+
+   -- If format is not a batch:
+   -- {
+   --   torch.LongTensor(size1), -- keys
+   --   torch.Tensor(size1), -- values,
+   -- }
+   elseif torch.isTensor(keys) and torch.isTensor(values) then
+      lkeys, lvalues = {keys}, {values}
+      self.isFlat = false
+      self.noBatch = true
+   else
+      error('Wrong input format.')
+   end
+
+   for i=1,#lkeys do
+      assert(lvalues[i]:dim() == 1 and lkeys[i]:dim() == 1, "keys and values should be 1D")
+   end
+
+   return lkeys, lvalues
+end
+
+function IndexLinear:longTensor(...)
+   if (self:type() == 'torch.CudaTensor') then
+      return torch.CudaLongTensor(...)
+   else
+      return torch.LongTensor(...)
+   end
+end
+
+function IndexLinear:flattenInputs(input)
+   local lkeys, lvalues, sizes = self:reshapeInput(input)
+
+   local counter = self.running.counter
+
+   -- Ensure everything is of the right type
+   local isCuda = (self:type() == 'torch.CudaTensor')
+   self.running.keys[counter] = self.running.keys[counter] or self:longTensor()
+   self.keys = self.running.keys[counter]
+
+   if self.isFlat then
+      self.values = self.values or lvalues.new()
+      self.sizes = self.sizes or self:longTensor()
+
+      self.keys:resize(lkeys:size()):copy(lkeys)
+      self.values:resize(lvalues:size()):copy(lvalues)
+      self.sizes = sizes
+      self.cumSumSizes = self.cumSumSizes or self.sizes.new()
+      self.cumSumSizes:cumsum(self.sizes)
+   else
+      self.values = self.values or lvalues[1].new()
+
+      self.lkeys = lkeys
+      self.lvalues = lvalues
+      local batchSize = #self.lkeys
+
+      self.sizesCache[1] = self.sizesCache[1] or torch.LongTensor(batchSize)
+      self.cumSumSizesCache[1] = self.cumSumSizesCache[1] or torch.LongTensor(batchSize)
+
+      self.sizes = self.sizesCache[1]
+      self.cumSumSizes = self.cumSumSizesCache[1]
+
+      self.sizes:resize(batchSize)
+      self.cumSumSizes:resize(batchSize)
+
+      for i = 1,batchSize do
+         self.sizes[i] = self.lkeys[i]:size(1)
+      end
+      self.cumSumSizes:cumsum(self.sizes)
+
+      self.keys:cat(self.lkeys, 1)
+      self.values:cat(self.lvalues, 1)
+
+      if isCuda then
+         -- Get the GPU cache
+         self.sizesCache[2] = self.sizesCache[2] or torch.CudaLongTensor()
+         self.cumSumSizesCache[2] = self.cumSumSizesCache[2] or torch.CudaLongTensor()
+
+         self.sizes = self.sizesCache[2]
+         self.cumSumSizes = self.cumSumSizesCache[2]
+
+         -- Resize and copy to GPU
+         self.sizes:resize(batchSize):copy(self.sizesCache[1])
+         self.cumSumSizes:resize(batchSize):copy(self.cumSumSizesCache[1])
+      end
+   end
+   self.running.cumSumSizes[counter] = self.cumSumSizes
+end
+
+function IndexLinear:updateOutput(input)
+
+   self:flattenInputs(input)
+
+   self.values.THNN.IndexLinear_updateOutput(
+      self.keys:cdata(),
+      self.offset,
+      self.values:cdata(),
+      self.sizes:cdata(),
+      self.cumSumSizes:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.normalizedValues:cdata(),
+      self.train and 1 or 0
+      )
+
+   if self.noBatch then
+      self.output:resize(self.output:size(2))
+   end
+   return self.output
+end
+
+function IndexLinear:accUpdateGradParameters(input, gradOutput, scale)
+   self.values.THNN.IndexLinear_accUpdateGradParameters(
+      self.keys:cdata(),
+      self.offset,
+      self.normalize > 0 and self.normalizedValues:cdata() or self.values:cdata(),
+      self.sizes:cdata(),
+      self.cumSumSizes:cdata(),
+      gradOutput:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.weightDecay or 0,
+      scale or 1
+   )
+end
+
+function IndexLinear:accGradParameters(input, gradOutput, scale)
+
+   local counter = self.running.counter
+
+   -- Same as the running.keys in the updateOutput function,
+   -- get a table of dense running.gradWeight
+   self.running.gradWeight[counter] = self.running.gradWeight[counter] or self.values.new()
+   self.values.THNN.IndexLinear_accGradParameters(
+      self.keys:cdata(),
+      self.offset,
+      self.normalize > 0 and self.normalizedValues:cdata() or self.values:cdata(),
+      self.sizes:cdata(),
+      self.cumSumSizes:cdata(),
+      gradOutput:cdata(),
+      self.running.gradWeight[counter]:cdata(),
+      self.gradBias:cdata(),
+      self.weight:cdata(),
+      self.bias:cdata(),
+      self.valuesBuffer:cdata(),
+      self.weightDecay or 0,
+      scale or 1
+   )
+
+   -- Increment the running counter to create a new buffer
+   -- if we don't flush them in zeroGradParameters
+   self.running.counter = self.running.counter + 1
+end
+
+function IndexLinear:updateGradInput(input, gradOutput)
+   self.gradInput = {}
+   -- Revamped from nn.SparseLinear.updateGradInput
+   if self.doGradInput and self.normalize > 0 then
+      error('updateGradInput is not implemented in max-normalize mode')
+   end
+
+   local ini = self.weight:size(1)
+
+   if self.doGradInput then
+      local gi = gradOutput.new()
+      if gradOutput:dim() == 1 then
+         gi:resize(self.weight:size(1))
+         gi:mv(self.weight,gradOutput)
+         gi:resize(1, self.weight:size(1))
+      elseif gradOutput:dim() == 2 then
+         gi:resize(gradOutput:size(1), self.weight:size(1))
+         gi:mm(gradOutput, self.weight:t())
+      end
+
+      local indices = self.running.keys[1].new(ini):range(1, ini)
+
+      if self.isFlat then
+         self.gradInput[1] = torch.repeatTensor(indices, gi:size(1), 1)
+         self.gradInput[2] = gi
+      else
+         self.gradInput[1] = {}
+         self.gradInput[2] = {}
+         for i = 1,gi:size(1) do
+            self.gradInput[1][i] = self.running.keys[1].new(ini)
+            self.gradInput[1][i]:copy(indices)
+            self.gradInput[2][i] = gradOutput.new(ini)
+            self.gradInput[2][i]:copy(gi[i])
+         end
+      end
+   end
+
+   if self.noBatch then
+      if self.isFlat then
+         self.gradInput = {self.gradInput[1]:resize(ini), self.gradInput[2]:resize(ini)}
+      else
+         self.gradInput = {self.gradInput[1][1], self.gradInput[2][1]}
+      end
+   end
+   return self.gradInput
+end
+
+function IndexLinear:updateParameters(lr)
+   local counter = self.running.counter
+   if counter > 1 then
+      if counter == 2 then
+         self.updateKeys = self.running.keys[1]
+         self.gradWeight = self.running.gradWeight[1]
+      else
+         self.updateKeysBuffer = self.updateKeysBuffer or self:longTensor()
+         local lkeys = {}
+         local lgweights = {}
+         local totalSize = 0
+         local lCumSumSizes = {}
+         for i=1,counter-1 do
+            lkeys[i] = self.running.keys[i]
+            -- Change layout to take advantage of the 1-D contiguous torch.cat
+            lgweights[i] = self.running.gradWeight[i]:contiguous()
+            lgweights[i]:resize(lgweights[i]:nElement())
+            lCumSumSizes[i] = totalSize + self.running.cumSumSizes[i]
+            totalSize = totalSize + lkeys[i]:size(1)
+         end
+
+         self.updateKeysBuffer:cat(lkeys, 1)
+         self.gradWeightBuffer:cat(lgweights, 1)
+         self.cumSumSizes:cat(lCumSumSizes, 1)
+         self.gradWeightBuffer:resize(totalSize, self.outputSize)
+         self.gradWeight = self.gradWeightBuffer
+         self.updateKeys = self.updateKeysBuffer
+      end
+      self.values.THNN.IndexLinear_updateParameters(
+            self.gradWeight:cdata(),
+            self.gradBias:cdata(),
+            self.weight:cdata(),
+            self.bias:cdata(),
+            self.updateKeys:cdata(),
+            self.cumSumSizes:cdata(),
+            self.offset,
+            self.weightDecay or 0,
+            lr or error('You must specify a learning rate')
+         )
+   end
+end
+
+function IndexLinear:zeroGradParameters()
+   -- No need to do anything here as gradWeight is dense
+   self.gradBias:zero()
+
+   -- The piece of code below would reset
+   -- the smart scaling parameters for each feature
+   -- each time we call zeroGradParameters.
+   -- TODO: decide what to do with that piece of code.
+   -- NB: it should be commented out together with the corresponding
+   -- piece of code in lib/THNN/generic/IndexLinear.c, in the accUpdateGradParameters function.
+
+   --[[
+   local w = self.weight:select(2, 3)
+   if self.updateKeys and self.updateKeys:nElement() > 0 then
+      self.updateKeysBuffer:resizeAs(self.updateKeys):copy(self.updateKeys):add(self.offset+1)
+      w:indexFill(1, self.updateKeysBuffer, 0)
+   end
+   ]]--
+   self.running.counter = 1
+end
+
+function IndexLinear:parameters()
+   return {self.weight, self.bias}, {self.running, self.gradBias}
+end
+
+function IndexLinear:clearState()
+   self.running.keys = {}
+   self.running.gradWeight = {}
+   self.keys = nil
+   self.zerokeys = nil
+   self.updateKeys = nil
+   self.values = nil
+   self.sizes = nil
+   self.lkeys = {}
+   self.lvalues = {}
+   self.gradWeightBuffer = self.gradWeightBuffer.new()
+   self.valuesBuffer = self.valuesBuffer.new()
+   self.updateKeysBuffer = nil
+   self.values = nil
+   return parent.clearState(self)
+end
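
[A minimal sketch of the batched table input format handled by `reshapeInput` above (a table of per-sample key tensors plus a table of matching value tensors); sizes are illustrative:]

```lua
local il = nn.IndexLinear(100, 5) -- 100 sparse feature ids -> 5 outputs
local input = {
   { torch.LongTensor{1, 4, 20}, torch.LongTensor{2, 30} }, -- keys per sample
   { torch.Tensor{0.1, 0.3, 0.5}, torch.Tensor{1.0, 0.2} }, -- values per sample
}
local output = il:forward(input) -- 2x5 output, one row per sample
```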
diff --git a/Linear.lua b/Linear.lua
index 3221227..09b5979 100644
--- a/Linear.lua
+++ b/Linear.lua
@@ -42,7 +42,7 @@ function Linear:reset(stdv)
    return self
 end
 
-local function updateAddBuffer(self, input)
+function Linear:updateAddBuffer(input)
    local nframe = input:size(1)
    self.addBuffer = self.addBuffer or input.new()
    if self.addBuffer:nElement() ~= nframe then
@@ -62,7 +62,7 @@ function Linear:updateOutput(input)
       if self.output:nElement() ~= nElement then
          self.output:zero()
       end
-      updateAddBuffer(self, input)
+      self:updateAddBuffer(input)
       self.output:addmm(0, self.output, 1, input, self.weight:t())
       if self.bias then self.output:addr(1, self.addBuffer, self.bias) end
    else
@@ -99,7 +99,7 @@ function Linear:accGradParameters(input, gradOutput, scale)
       self.gradWeight:addmm(scale, gradOutput:t(), input)
       if self.bias then
          -- update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
-         updateAddBuffer(self, input)
+         self:updateAddBuffer(input)
          self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer)
       end
    end
diff --git a/LinearWeightNorm.lua b/LinearWeightNorm.lua
new file mode 100755
index 0000000..a712f55
--- /dev/null
+++ b/LinearWeightNorm.lua
@@ -0,0 +1,168 @@
+local LinearWeightNorm, parent = torch.class('nn.LinearWeightNorm', 'nn.Linear')
+
+function LinearWeightNorm:__init(inputSize, outputSize, bias, eps)
+    nn.Module.__init(self) -- Skip nn.Linear constructor
+
+    local bias = ((bias == nil) and true) or bias
+
+    self.eps = eps or 1e-16
+
+    self.outputSize = outputSize
+    self.inputSize = inputSize
+
+    self.v = torch.Tensor(outputSize, inputSize)
+    self.gradV = torch.Tensor(outputSize, inputSize)
+
+    self.weight = torch.Tensor(outputSize, inputSize)
+
+    self.g = torch.Tensor(outputSize,1)
+    self.gradG = torch.Tensor(outputSize,1)
+
+    self.norm = torch.Tensor(outputSize,1)
+    self.scale = torch.Tensor(outputSize,1)
+
+    if bias then
+        self.bias = torch.Tensor(outputSize)
+        self.gradBias = torch.Tensor(outputSize)
+    end
+
+    self:reset()
+end
+
+function LinearWeightNorm:evaluate()
+    if self.train ~= false then
+        self:updateWeightMatrix()
+    end
+
+    parent.evaluate(self)
+end
+
+function LinearWeightNorm:initFromWeight(weight)
+    weight = weight or self.weight
+
+    self.g:norm(weight,2,2):clamp(self.eps,math.huge)
+    self.v:copy(weight)
+
+    return self
+end
+
+function LinearWeightNorm.fromLinear(linear)
+    local module = nn.LinearWeightNorm(linear.weight:size(2), linear.weight:size(1), torch.isTensor(linear.bias))
+    module.weight:copy(linear.weight)
+    module:initFromWeight()
+
+    if linear.bias then
+        module.bias:copy(linear.bias)
+    end
+
+    return module
+end
+
+function LinearWeightNorm:toLinear()
+    self:updateWeightMatrix()
+
+    local module = nn.Linear(self.inputSize, self.outputSize, torch.isTensor(self.bias))
+
+    module.weight:copy(self.weight)
+    if self.bias then
+        module.bias:copy(self.bias)
+    end
+
+    return module
+end
+
+function LinearWeightNorm:parameters()
+    if self.bias then
+        return {self.v, self.g, self.bias}, {self.gradV, self.gradG, self.gradBias}
+    else
+        return {self.v, self.g}, {self.gradV, self.gradG}
+    end
+end
+
+function LinearWeightNorm:reset(stdv)
+    if stdv then
+        stdv = stdv * math.sqrt(3)
+    else
+        stdv = 1 / math.sqrt(self.inputSize)
+    end
+
+    self.weight:uniform(-stdv,stdv)
+    self:initFromWeight()
+
+    if self.bias then
+        self.bias:uniform(-stdv,stdv)
+    end
+end
+
+function LinearWeightNorm:updateWeightMatrix()
+    if self.norm:dim() == 0 then self.norm:resizeAs(self.g) end
+    if self.scale:dim() == 0 then self.scale:resizeAs(self.g) end
+    if self.weight:dim() == 0 then self.weight:resizeAs(self.v) end
+
+    self.norm:norm(self.v,2,2):clamp(self.eps,math.huge)
+    self.scale:cdiv(self.g,self.norm)
+    self.weight:cmul(self.v,self.scale:expandAs(self.v))
+end
+
+function LinearWeightNorm:updateOutput(input)
+    if self.train ~= false then
+        self:updateWeightMatrix()
+    end
+
+    return parent.updateOutput(self, input)
+end
+
+function LinearWeightNorm:accGradParameters(input, gradOutput, scale)
+    scale = scale or 1
+    if input:dim() == 1 then
+        self.gradV:addr(scale, gradOutput, input)
+        if self.bias then self.gradBias:add(scale, gradOutput) end
+    elseif input:dim() == 2 then
+        self.gradV:addmm(scale, gradOutput:t(), input)
+        if self.bias then
+            -- update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
+            self:updateAddBuffer(input)
+            self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer)
+        end
+    end
+
+    local scale = self.scale:expandAs(self.v)
+    local norm = self.norm:expandAs(self.v)
+
+    self.weight:cmul(self.gradV,self.v):cdiv(norm)
+    self.gradG:sum(self.weight,2)
+
+    self.gradV:cmul(scale)
+
+    self.weight:cmul(self.v,scale):cdiv(norm)
+    self.weight:cmul(self.gradG:expandAs(self.weight))
+
+    self.gradV:add(-1,self.weight)
+end
+
+function LinearWeightNorm:defaultAccUpdateGradParameters(input, gradOutput, lr)
+    local gradV = self.gradV
+    local gradG = self.gradG
+    local gradBias = self.gradBias
+
+    self.gradV = self.v
+    self.gradG = self.g
+    self.gradBias = self.bias
+
+    self:accGradParameters(input, gradOutput, -lr)
+
+    self.gradV = gradV
+    self.gradG = gradG
+    self.gradBias = gradBias
+end
+
+function LinearWeightNorm:clearState()
+    nn.utils.clear(self, 'weight', 'norm', 'scale')
+    return parent.clearState(self)
+end
+
+function LinearWeightNorm:__tostring__()
+    return torch.type(self) ..
+        string.format('(%d -> %d)', self.inputSize, self.outputSize) ..
+        (self.bias == nil and ' without bias' or '')
+end
\ No newline at end of file
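
[A sketch of the conversion helpers introduced above, round-tripping between a plain `nn.Linear` and the weight-normalized variant:]

```lua
local lin = nn.Linear(10, 4)
local wn = nn.LinearWeightNorm.fromLinear(lin) -- reparametrize as g and v
local out = wn:forward(torch.randn(2, 10))
local back = wn:toLinear() -- collapse g, v back into a plain weight matrix
```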
diff --git a/MapTable.lua b/MapTable.lua
index 90b439c..c79f1ea 100644
--- a/MapTable.lua
+++ b/MapTable.lua
@@ -2,17 +2,23 @@ local MapTable, parent = torch.class('nn.MapTable', 'nn.Container')
 
 function MapTable:__init(module, shared)
    parent.__init(self)
-   self.shared = shared or {'weight', 'bias', 'gradWeight', 'gradBias'}
+   self.shared = (shared == nil) and true or shared
+   self.sharedparams = {'weight', 'bias', 'gradWeight', 'gradBias'}
    self.output = {}
    self.gradInput = {}
    self:add(module)
 end
 
 function MapTable:_extend(n)
+   self.sharedparams = self.sharedparams or {'weight', 'bias', 'gradWeight', 'gradBias'}
    self.modules[1] = self.module
    for i = 2, n do
       if not self.modules[i] then
-         self.modules[i] = self.module:clone(table.unpack(self.shared))
+         if self.shared then
+           self.modules[i] = self.module:clone(table.unpack(self.sharedparams))
+         else
+           self.modules[i] = self.module:clone()
+         end
       end
    end
 end
@@ -70,13 +76,21 @@ end
 
 function MapTable:zeroGradParameters()
     if self.module then
-        self.module:zeroGradParameters()
+        if self.shared then
+          self.module:zeroGradParameters()
+        else
+          parent.zeroGradParameters(self)
+        end
     end
 end
 
 function MapTable:updateParameters(learningRate)
     if self.module then
-        self.module:updateParameters(learningRate)
+        if self.shared then
+          self.module:updateParameters(learningRate)
+        else
+          parent.updateParameters(self, learningRate)
+        end
     end
 end
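
[The second constructor argument is now a boolean. A sketch of both modes, assuming the default remains parameter sharing:]

```lua
local shared = nn.MapTable(nn.Linear(5, 3))          -- clones share weight/bias/gradWeight/gradBias
local unshared = nn.MapTable(nn.Linear(5, 3), false) -- independent clones
local out = shared:forward{torch.randn(5), torch.randn(5)} -- table of two 3-vectors
```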
 
diff --git a/NaN.lua b/NaN.lua
new file mode 100644
index 0000000..b80f6a0
--- /dev/null
+++ b/NaN.lua
@@ -0,0 +1,72 @@
+------------------------------------------------------------------------
+--[[ NaN ]]--
+-- Asserts that outputs and gradInputs do not contain NaNs.
+-- Useful for locating the source of NaN errors.
+------------------------------------------------------------------------
+local NaN, parent = torch.class("nn.NaN", "nn.Decorator")
+
+local idseq = 0
+function NaN.newId()
+   idseq = idseq + 1
+   return idseq
+end
+
+function NaN:__init(module, id)
+   parent.__init(self, module)
+   self.id = id or NaN.newId()
+end
+
+function NaN:recursiveIsNaN(tensor)
+   local isNaN = false
+   if torch.type(tensor) == 'table' then
+      for k,v in pairs(tensor) do
+         isNaN = self:recursiveIsNaN(v)
+         if isNaN then break end
+      end
+   else
+      local _ = require 'moses'
+      isNaN = _.isNaN(tensor:sum())
+   end
+   return isNaN
+end
+
+function NaN:updateOutput(input)
+   self.output = self.modules[1]:updateOutput(input)
+   if self:recursiveIsNaN(self.output) then
+      if self:recursiveIsNaN(input) then
+         error(string.format("NaN found in input of module :\n%s", self:__tostring__()))
+      elseif self:recursiveIsNaN(self:parameters()) then
+         error(string.format("NaN found in parameters of module :\n%s", self:__tostring__()))
+      end
+      error(string.format("NaN found in output of module :\n%s", self:__tostring__()))
+   end
+   return self.output
+end
+
+function NaN:updateGradInput(input, gradOutput)
+   self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   if self:recursiveIsNaN(self.gradInput) then
+      if self:recursiveIsNaN(gradOutput) then
+         error(string.format("NaN found in gradOutput of module :\n%s", self:__tostring__()))
+      end
+      error(string.format("NaN found in gradInput of module :\n%s", self:__tostring__()))
+   end
+   return self.gradInput
+end
+
+function NaN:accGradParameters(input, gradOutput, scale)
+   self.modules[1]:accGradParameters(input, gradOutput, scale)
+   local params, gradParams = self:parameters()
+   if self:recursiveIsNaN(gradParams) then
+      error(string.format("NaN found in gradParameters of module :\n%s", self:__tostring__()))
+   end
+end
+
+function NaN:__tostring__()
+   local selfstring = torch.type(self) .. '(' .. self.id .. ')'
+   if self.modules[1].__tostring__ then
+      return selfstring .. ' @ ' .. self.modules[1]:__tostring__()
+   else
+      return selfstring .. ' @ ' .. torch.type(self.modules[1])
+   end
+end
diff --git a/Profile.lua b/Profile.lua
new file mode 100644
index 0000000..36cd909
--- /dev/null
+++ b/Profile.lua
@@ -0,0 +1,55 @@
+local ProfileModule, parent = torch.class("nn.Profile", "nn.Decorator")
+
+function ProfileModule:__init(module, print_interval, name)
+   parent.__init(self, module)
+   self.print_interval = print_interval or 100
+   self.name = name or torch.type(module)
+   self.module = module
+   self.numFwds = 0
+   self.numBwds = 0
+   self.summedFwdTime = 0
+   self.summedBwdTime = 0
+   self.timer = torch.Timer()
+end
+
+function ProfileModule:updateOutput(input)
+   self.timer:reset()
+   self.output = self.module:updateOutput(input)
+   self.summedFwdTime = self.summedFwdTime + self.timer:time().real
+   self.numFwds = self.numFwds + 1
+   if self.numFwds % self.print_interval == 0 then
+      print (string.format('%s took %.3f seconds for %d forward passes',
+         self.name, self.summedFwdTime, self.print_interval))
+      self.numFwds = 0
+      self.summedFwdTime = 0
+   end
+   return self.output
+end
+
+function ProfileModule:updateGradInput(input, gradOutput)
+   self.timer:reset()
+   self.gradInput = self.module:updateGradInput(input, gradOutput)
+   self.summedBwdTime = self.summedBwdTime + self.timer:time().real
+   self.numBwds = self.numBwds + 1
+   if self.numBwds % self.print_interval == 0 then
+      print (string.format('%s took %.3f seconds for %d backward passes',
+         self.name, self.summedBwdTime, self.print_interval))
+      self.numBwds = 0
+      self.summedBwdTime = 0
+   end
+   return self.gradInput
+end
+
+local function makeTorchTimerSerializable()
+   -- The Timer object part of this class needs to be serializable
+   -- so that the layer can be saved, cloned, etc. We add a dummy
+   -- serialization of torch.Timer that just creates a new instance at read time
+   local timerMetatable = getmetatable(torch.Timer())
+   timerMetatable['__factory'] = torch.Timer
+   timerMetatable['write'] = function(object, file) end
+   timerMetatable['read'] = function(object, file, versionNumber)
+      return object
+   end
+end
+
+makeTorchTimerSerializable()
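
[A usage sketch of the profiler decorator; it prints cumulative timings every `print_interval` passes:]

```lua
local prof = nn.Profile(nn.Linear(100, 100), 10, 'fc100')
for i = 1, 20 do -- prints twice: after passes 10 and 20
   local input = torch.randn(8, 100)
   local out = prof:forward(input)
   prof:backward(input, out:clone():fill(1))
end
```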
diff --git a/README.md b/README.md
index 378a440..6efd609 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 This package provides an easy and modular way to build and train simple or complex neural networks using [Torch](https://github.com/torch/torch7/blob/master/README.md):
  * Modules are the bricks used to build neural networks. Each are themselves neural networks, but can be combined with other networks using containers to create complex neural networks:
    * [Module](doc/module.md#nn.Module): abstract class inherited by all modules;
-   * [Containers](doc/containers.md#nn.Containers): container classes like [`Sequential`](doc/containers.md#nn.Sequential), [`Parallel`](doc/containers.md#nn.Parallel) and [`Concat`](doc/containers.md#nn.Concat);
+   * [Containers](doc/containers.md#nn.Containers): composite and decorator classes like [`Sequential`](doc/containers.md#nn.Sequential), [`Parallel`](doc/containers.md#nn.Parallel), [`Concat`](doc/containers.md#nn.Concat) and [`NaN`](doc/containers.md#nn.NaN);
    * [Transfer functions](doc/transfer.md#nn.transfer.dok): non-linear functions like [`Tanh`](doc/transfer.md#nn.Tanh) and [`Sigmoid`](doc/transfer.md#nn.Sigmoid);
    * [Simple layers](doc/simple.md#nn.simplelayers.dok): like [`Linear`](doc/simple.md#nn.Linear), [`Mean`](doc/simple.md#nn.Mean), [`Max`](doc/simple.md#nn.Max) and [`Reshape`](doc/simple.md#nn.Reshape);
    * [Table layers](doc/table.md#nn.TableLayers): layers for manipulating `table`s like [`SplitTable`](doc/table.md#nn.SplitTable), [`ConcatTable`](doc/table.md#nn.ConcatTable) and [`JoinTable`](doc/table.md#nn.JoinTable);
diff --git a/SelectTable.lua b/SelectTable.lua
index f383a10..ef26f35 100644
--- a/SelectTable.lua
+++ b/SelectTable.lua
@@ -24,13 +24,15 @@ local function zeroTableCopy(t1, t2)
    for k, v in pairs(t2) do
       if (torch.type(v) == "table") then
          t1[k] = zeroTableCopy(t1[k] or {}, t2[k])
-      else
+      elseif torch.isTensor(v) then
          if not t1[k] then
             t1[k] = v:clone():zero()
          else
             t1[k]:resizeAs(v)
             t1[k]:zero()
          end
+      else
+        t1[k] = nil
       end
    end
    for k, v in pairs(t1) do
diff --git a/SpatialDepthWiseConvolution.lua b/SpatialDepthWiseConvolution.lua
new file mode 100644
index 0000000..1132f04
--- /dev/null
+++ b/SpatialDepthWiseConvolution.lua
@@ -0,0 +1,139 @@
+local THNN = require 'nn.THNN'
+local SpatialDepthWiseConvolution, parent = torch.class('nn.SpatialDepthWiseConvolution', 'nn.Module')
+
+function SpatialDepthWiseConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+   parent.__init(self)
+
+   dW = dW or 1
+   dH = dH or 1
+
+   self.nInputPlane = nInputPlane
+   self.nOutputPlane = nOutputPlane
+   self.kW = kW
+   self.kH = kH
+
+   self.dW = dW
+   self.dH = dH
+   self.padW = padW or 0
+   self.padH = padH or self.padW
+
+   self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+   self.bias = torch.Tensor(nOutputPlane, nInputPlane)
+   self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+   self.gradBias = torch.Tensor(nOutputPlane, nInputPlane)
+
+   self:reset()
+end
+
+function SpatialDepthWiseConvolution:noBias()
+   self.bias = nil
+   self.gradBias = nil
+   return self
+end
+
+function SpatialDepthWiseConvolution:reset(stdv)
+   if stdv then
+      stdv = stdv * math.sqrt(3)
+   else
+      stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+   end
+   if nn.oldSeed then
+      self.weight:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+      self.bias:apply(function()
+         return torch.uniform(-stdv, stdv)
+      end)
+   else
+      self.weight:uniform(-stdv, stdv)
+      self.bias:uniform(-stdv, stdv)
+   end
+end
+
+function SpatialDepthWiseConvolution:updateOutput(input)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   self.finput = self.finput or input.new()
+   self.fgradInput = self.fgradInput or input.new()
+   -- backward compatibility
+   if self.padding then
+      self.padW = self.padding
+      self.padH = self.padding
+      self.padding = nil
+   end
+   input.THNN.SpatialDepthWiseConvolution_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      THNN.optionalTensor(self.bias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH
+   )
+   return self.output
+end
+
+function SpatialDepthWiseConvolution:updateGradInput(input, gradOutput)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   if self.gradInput then
+      input.THNN.SpatialDepthWiseConvolution_updateGradInput(
+         input:cdata(),
+         gradOutput:cdata(),
+         self.gradInput:cdata(),
+         self.weight:cdata(),
+         self.finput:cdata(),
+         self.fgradInput:cdata(),
+         self.kW, self.kH,
+         self.dW, self.dH,
+         self.padW, self.padH
+      )
+      return self.gradInput
+   end
+end
+
+function SpatialDepthWiseConvolution:accGradParameters(input, gradOutput, scale)
+   assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+   scale = scale or 1
+   assert((self.bias and self.gradBias) or (self.bias == nil and self.gradBias == nil))
+   input.THNN.SpatialDepthWiseConvolution_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      THNN.optionalTensor(self.gradBias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kW, self.kH,
+      self.dW, self.dH,
+      self.padW, self.padH,
+      scale
+   )
+end
+
+function SpatialDepthWiseConvolution:type(type,tensorCache)
+   self.finput = self.finput and torch.Tensor()
+   self.fgradInput = self.fgradInput and torch.Tensor()
+   return parent.type(self,type,tensorCache)
+end
+
+function SpatialDepthWiseConvolution:__tostring__()
+   local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+         self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+   if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+     s = s .. string.format(', %d,%d', self.dW, self.dH)
+   end
+   if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+     s = s .. ', ' .. self.padW .. ',' .. self.padH
+   end
+   if self.bias then
+      return s .. ')'
+   else
+      return s .. ') without bias'
+   end
+end
+
+function SpatialDepthWiseConvolution:clearState()
+   nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+   return parent.clearState(self)
+end
+
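[A minimal sketch; by the depth-wise semantics each of the `nOutputPlane` filters is applied to every input plane separately, so the output should carry `nOutputPlane*nInputPlane` planes (editor's reading of the THNN kernel, not stated in the commit):]

```lua
local conv = nn.SpatialDepthWiseConvolution(3, 2, 5, 5) -- 3 input planes, 2 filters per plane
local out = conv:forward(torch.randn(3, 32, 32))        -- expected 6x28x28
```
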
diff --git a/Sum.lua b/Sum.lua
index 9ff73f8..8dc8305 100644
--- a/Sum.lua
+++ b/Sum.lua
@@ -1,61 +1,67 @@
 local Sum, parent = torch.class('nn.Sum', 'nn.Module')
 
-function Sum:__init(dimension, nInputDims, sizeAverage)
+function Sum:__init(dimension, nInputDims, sizeAverage, squeeze)
    parent.__init(self)
    self.dimension   = dimension or 1
    -- do not assign default value to nInputDims or it will break backward compatibility
    self.nInputDims  = nInputDims
    self.sizeAverage = sizeAverage or false
+   if squeeze ~= nil then
+      assert(type(squeeze) == 'boolean', 'squeeze has to be true/false')
+      self.squeeze = squeeze
+   else
+      self.squeeze = true
+   end
 end
 
 function Sum:_getPositiveDimension(input)
-    local dimension = self.dimension
-    if dimension < 0 then
-        dimension = input:dim() + dimension + 1
-    elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
-        dimension = dimension + 1
-    end
-    assert(input:dim() >= dimension, "dimension exceeds input dimensions")
-    return dimension
+   local dimension = self.dimension
+   if dimension < 0 then
+      dimension = input:dim() + dimension + 1
+   elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+      dimension = dimension + 1
+   end
+   assert(input:dim() >= dimension, "dimension exceeds input dimensions")
+   return dimension
 end
 
 function Sum:updateOutput(input)
-    local dimension = self:_getPositiveDimension(input)
-    if type(self.output) == 'number' then
-        self.output = input.new()
-    end
-    self.output:sum(input, dimension)
-    if self.sizeAverage then
-        self.output:div(input:size(dimension))
-    end
-    if self.output:nDimension() > 1 then
-        self.output:set(self.output:select(dimension, 1))
-    end
-    return self.output
+   local dimension = self:_getPositiveDimension(input)
+   if type(self.output) == 'number' then
+      self.output = input.new()
+   end
+   self.output:sum(input, dimension)
+   if self.sizeAverage then
+      self.output:div(input:size(dimension))
+   end
+   if self.squeeze and self.output:nDimension() > 1 then
+      self.output:set(self.output:select(dimension, 1))
+   end
+   return self.output
 end
 
 function Sum:updateGradInput(input, gradOutput)
-    local dimension = self:_getPositiveDimension(input)
-    -- zero-strides don't work with MKL/BLAS, so
-    -- don't set self.gradInput to zero-stride tensor.
-    -- Instead, do a deepcopy
-    local size      = input:size()
-    size[dimension] = 1
-    if not gradOutput:isContiguous() then
-        self._gradOutput = self._gradOutput or gradOutput.new()
-        self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-        gradOutput = self._gradOutput
-    end
-    gradOutput      = gradOutput:view(size)
-    self.gradInput:resizeAs(input)
-    self.gradInput:copy(gradOutput:expandAs(input))
-    if self.sizeAverage then
-        self.gradInput:div(input:size(dimension))
-    end
-    return self.gradInput
+   local dimension = self:_getPositiveDimension(input)
+   -- zero-strides don't work with MKL/BLAS, so
+   -- don't set self.gradInput to zero-stride tensor.
+   -- Instead, do a deepcopy
+   local size      = input:size()
+   size[dimension] = 1
+   if not gradOutput:isContiguous() then
+      self._gradOutput = self._gradOutput or gradOutput.new()
+      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+      gradOutput = self._gradOutput
+   end
+   gradOutput      = gradOutput:view(size)
+   self.gradInput:resizeAs(input)
+   self.gradInput:copy(gradOutput:expandAs(input))
+   if self.sizeAverage then
+      self.gradInput:div(input:size(dimension))
+   end
+   return self.gradInput
 end
 
 function Sum:clearState()
-    nn.utils.clear(self, '_gradOutput')
-    return parent.clearState(self)
+   nn.utils.clear(self, '_gradOutput')
+   return parent.clearState(self)
 end
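
[A sketch of the new `squeeze` flag; the default `true` preserves the historical behaviour of dropping the summed dimension:]

```lua
local s1 = nn.Sum(2)                    -- squeeze defaults to true
local s2 = nn.Sum(2, nil, false, false) -- keep the singleton dimension
print(s1:forward(torch.randn(4, 3)):size()) -- 4
print(s2:forward(torch.randn(4, 3)):size()) -- 4x1
```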
diff --git a/Transpose.lua b/Transpose.lua
index 263db60..cceb2b6 100644
--- a/Transpose.lua
+++ b/Transpose.lua
@@ -7,11 +7,18 @@ local Transpose, parent = torch.class('nn.Transpose', 'nn.Module')
 function Transpose:__init(...)
    parent.__init(self)
    self.permutations = {...}
+   self.numInputDims = nil
+end
+
+function Transpose:setNumInputDims(numInputDims)
+   self.numInputDims = numInputDims
+   return self
 end
 
 function Transpose:updateOutput(input)
+   local offset = self.numInputDims and input:nDimension()-self.numInputDims or 0
    for _,perm in ipairs(self.permutations) do
-      input = input:transpose(perm[1],perm[2])
+      input = input:transpose(perm[1]+offset,perm[2]+offset)
    end
    self.output:resizeAs(input):copy(input)
    return self.output
@@ -20,9 +27,9 @@ end
 function Transpose:updateGradInput(input, gradOutput)
    for i = #self.permutations,1,-1 do
       local perm = self.permutations[i]
-      gradOutput = gradOutput:transpose(perm[1],perm[2])
+      local offset = self.numInputDims and input:nDimension()-self.numInputDims or 0
+      gradOutput = gradOutput:transpose(perm[1]+offset,perm[2]+offset)
    end
    self.gradInput:resizeAs(gradOutput):copy(gradOutput)
    return self.gradInput
 end
-
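
[A sketch of `setNumInputDims`, which offsets the stored permutations so the same module works with or without a leading batch dimension:]

```lua
local t = nn.Transpose({1, 2}):setNumInputDims(2)
print(t:forward(torch.randn(3, 4)):size())    -- 4x3 (no batch)
print(t:forward(torch.randn(5, 3, 4)):size()) -- 5x4x3 (batch of 5)
```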
diff --git a/WeightNorm.lua b/WeightNorm.lua
index ad832b7..3ffcd90 100644
--- a/WeightNorm.lua
+++ b/WeightNorm.lua
@@ -1,6 +1,6 @@
 -- Weight Normalization
 -- https://arxiv.org/pdf/1602.07868v3.pdf
-local WeightNorm, parent = torch.class("nn.WeightNorm", "nn.Container")
+local WeightNorm, parent = torch.class("nn.WeightNorm", "nn.Decorator")
 
 function WeightNorm:__init(module, outputDim)
     -- this container will apply Weight Normalization to any module it wraps
@@ -9,7 +9,7 @@ function WeightNorm:__init(module, outputDim)
     -- if the weight is not 2D, the container will view the weight into a 2D shape
     -- that is nOut x (nIn x kw x dw x ...)
 
-    parent.__init(self)
+    parent.__init(self, module)
     assert(module.weight)
 
     if module.bias then
@@ -34,6 +34,7 @@ function WeightNorm:__init(module, outputDim)
 
     -- view size back to original weight
     self.viewOut = self.weight:size()
+    self.weightSize = self.weight:size()
 
     -- bubble outputDim size up to the front
     for i = self.outputDim - 1, 1, -1 do
@@ -53,7 +54,6 @@ function WeightNorm:__init(module, outputDim)
     -- gradient of v
     self.gradV = torch.Tensor(self.viewIn)
 
-    self.modules[1] = module
     self:resetInit()
 end
 
@@ -81,7 +81,14 @@ function WeightNorm:resetInit(inputSize, outputSize)
     end
 end
 
-function WeightNorm:updateOutput(input)
+function WeightNorm:evaluate()
+    if not(self.train == false) then
+        self:updateWeight()
+        parent.evaluate(self)
+    end
+end
+
+function WeightNorm:updateWeight()
     -- view to 2D when weight norm container operates
     self.gradV:copy(self:permuteIn(self.weight))
     self.gradV = self.gradV:view(self.viewIn)
@@ -92,12 +99,18 @@ function WeightNorm:updateOutput(input)
     self.gradV:copy(self.v)
     self._scale:copy(self.g):cdiv(self._norm)
     self.gradV:cmul(self._scale:view(self.viewIn[1], 1)
-                                :expand(self.viewIn[1], self.viewIn[2]))
+                               :expand(self.viewIn[1], self.viewIn[2]))
 
     -- otherwise maintain size of original module weight
     self.gradV = self.gradV:view(self.viewOut)
 
     self.weight:copy(self:permuteOut(self.gradV))
+end
+
+function WeightNorm:updateOutput(input)
+    if not(self.train == false) then
+        self:updateWeight()
+    end
     self.output:set(self.modules[1]:updateOutput(input))
     return self.output
 end
@@ -124,7 +137,7 @@ function WeightNorm:accGradParameters(input, gradOutput, scale)
     -- dL / dg * (w * g / ||w||^2)
     self.weight:copy(self.v):cmul(scale):cdiv(norm)
     self.weight:cmul(self.gradG:view(self.viewIn[1], 1)
-                            :expand(self.viewIn[1], self.viewIn[2]))
+                               :expand(self.viewIn[1], self.viewIn[2]))
 
     -- dL / dv update
     self.gradV:add(-1, self.weight)
@@ -159,7 +172,37 @@ function WeightNorm:parameters()
     end
 end
 
-function WeightNorm:__tostring__()
-    local str = 'nn.WeightNorm [' .. tostring(self.modules[1]) .. ']'
-    return str
+function WeightNorm:write(file)
+    -- Don't save weight and gradWeight since we can easily re-compute it from v
+    -- and g.
+    local weight = self.modules[1].weight
+    local gradWeight = self.modules[1].gradWeight
+    self.weight = nil
+    self.gradWeight = nil
+    self.modules[1].weight = nil
+    self.modules[1].gradWeight = nil
+    if not self.weightSize then
+        self.weightSize = weight:size()
+    end
+
+    parent.write(self, file)
+
+    self.modules[1].weight = weight
+    self.modules[1].gradWeight = gradWeight
+    self.weight = weight
+    self.gradWeight = gradWeight
+end
+
+function WeightNorm:read(file)
+    parent.read(self, file)
+
+    -- Re-compute weight and gradWeight
+    if not self.weight then
+        self.modules[1].weight = self.v.new(self.weightSize)
+        self.modules[1].gradWeight = self.v.new(self.weightSize)
+        self.weight = self.modules[1].weight
+        self.gradWeight = self.modules[1].gradWeight
+        self:updateWeight()
+        self.gradWeight:copy(self:permuteOut(self.gradV))
+    end
 end
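
[The new `write`/`read` pair drops `weight` and `gradWeight` on save and rebuilds them from `g` and `v` on load, shrinking serialized models. A sketch:]

```lua
local wn = nn.WeightNorm(nn.Linear(10, 4))
torch.save('wn.t7', wn)            -- weight/gradWeight omitted from the file
local loaded = torch.load('wn.t7') -- weight re-computed inside :read()
```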
diff --git a/doc/containers.md b/doc/containers.md
old mode 100644
new mode 100755
index bff4fab..cecf782
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -7,7 +7,12 @@ Complex neural networks are easily built using container classes:
     * [Parallel](#nn.Parallel) : applies its `ith` child module to the  `ith` slice of the input Tensor ;
     * [Concat](#nn.Concat) : concatenates in one layer several modules along dimension `dim` ;
       * [DepthConcat](#nn.DepthConcat) : like Concat, but adds zero-padding when non-`dim` sizes don't match;
-    * [Bottle](#nn.Bottle) : allows any dimensionality input be forwarded through a module ;
+    * [Decorator](#nn.Decorator) : abstract class to change the behaviour of an encapsulated module ;
+      * [Bottle](#nn.Bottle) : allows any dimensionality input to be forwarded through a module ;
+      * [WeightNorm](#nn.WeightNorm) : implements the reparametrization presented in [Weight Normalization](https://arxiv.org/pdf/1602.07868v3.pdf) ;
+      * [DontCast](#nn.DontCast) : prevents the encapsulated module from being cast by `Module:type()` ;
+      * [NaN](#nn.NaN) : decorates a module to detect the source of NaN errors ;
+      * [Profile](#nn.Profile) : decorates a module to time its forward and backward passes ;
 
 See also the [Table Containers](#nn.TableContainers) for manipulating tables of [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md).
 
@@ -293,6 +298,17 @@ module output tensors non-`dim` sizes aren't all odd or even.
 Such that in order to keep the mappings aligned, one need
 only ensure that these be all odd (or even).
 
+<a name='nn.Decorator'></a>
+## Decorator ##
+
+```lua
+dmodule = nn.Decorator(module)
+```
+
+This module is an abstract class used to decorate a `module`. This means
+that method calls to `dmodule` will call the same method on the encapsulated
+`module`, and return its results.
+
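+As a minimal sketch (assuming the standard `torch.class` inheritance mechanism; the class name `nn.CountCalls` is hypothetical), a custom decorator can subclass `nn.Decorator` and override only the methods it cares about:
+
+```lua
+local CountCalls, parent = torch.class('nn.CountCalls', 'nn.Decorator')
+
+function CountCalls:updateOutput(input)
+   -- count forward calls, then delegate to the decorated module
+   self.count = (self.count or 0) + 1
+   self.output = self.modules[1]:updateOutput(input)
+   return self.output
+end
+```
+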
 <a name="nn.Bottle"></a>
 ## Bottle
 
@@ -330,9 +346,135 @@ mlp = nn.Bottle(nn.Linear(10, 2))
 module = nn.WeightNorm(module)
 ```
 
-WeightNorm implements the reparametrization presented in [Weight Normalization](https://arxiv.org/pdf/1602.07868v3.pdf), which decouples the length of neural network weight vectors from their direction. The weight vectors `w` is determined instead by parameters `g` and `v` such that `w = g * v / ||v||`, where `||v||` is the euclidean norm of vector v. This container can wrap nn layers with weights.
+WeightNorm implements the reparametrization presented in [Weight Normalization](https://arxiv.org/pdf/1602.07868v3.pdf), which decouples the length of neural network weight vectors from their direction. The weight vector `w` is determined instead by parameters `g` and `v` such that `w = g * v / ||v||`, where `||v||` is the euclidean norm of vector `v`. This container can wrap nn layers with weights.
+
+It accepts a parameter `outputDim` that represents the output dimension of the module weight it wraps, which defaults to 1. If `outputDim` is not 1, the container will transpose the weight appropriately. If the module weight is not 2D, e.g. in the case of convolutional layers, the container will view the weight into an appropriate 2D shape based on the `outputDim` specified by the user.
+
+An optimised version of `nn.WeightNorm(nn.Linear(inputDimension, outputDimension))` is available as `nn.LinearWeightNorm(inputDimension, outputDimension, [bias = true])`. This layer occupies less memory and is faster because it uses fewer tensor copy operations; it also stores and updates a dirty flag to avoid unnecessary recomputation of the weight matrix.
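+
+For example, a minimal sketch wrapping a convolution (the layer and sizes are illustrative; `outputDim` keeps its default of `1`):
+
+```lua
+conv = nn.WeightNorm(nn.SpatialConvolution(3, 16, 5, 5))
+output = conv:forward(torch.randn(3, 32, 32)) -- 16 x 28 x 28
+```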
+
+
+<a name='nn.DontCast'></a>
+## DontCast ##
+
+```lua
+dmodule = nn.DontCast(module)
+```
+
+This module is a decorator. Use it to decorate a module that you don't
+want to be cast when the `type()` method is called.
+
+```lua
+module = nn.DontCast(nn.Linear(3,4):float())
+module:double()
+th> print(module:forward(torch.FloatTensor{1,2,3}))
+ 1.0927
+-1.9380
+-1.8158
+-0.0805
+[torch.FloatTensor of size 4]
+```
+
+
+<a name='nn.NaN'></a>
+## NaN ##
+
+```lua
+dmodule = nn.NaN(module, [id])
+```
+
+The `NaN` module asserts that the `output` and `gradInput` of the decorated `module` do not contain NaNs.
+This is useful for locating the source of those pesky NaN errors.
+The `id` defaults to automatically incremented values of `1,2,3,...`.
+
+For example :
+
+```lua
+linear = nn.Linear(3,4)
+mlp = nn.Sequential()
+mlp:add(nn.NaN(nn.Identity()))
+mlp:add(nn.NaN(linear))
+mlp:add(nn.NaN(nn.Linear(4,2)))
+print(mlp)
+```
+
+As you can see, the `NaN` layers have unique ids:
+
+```lua
+nn.Sequential {
+  [input -> (1) -> (2) -> (3) -> output]
+  (1): nn.NaN(1) @ nn.Identity
+  (2): nn.NaN(2) @ nn.Linear(3 -> 4)
+  (3): nn.NaN(3) @ nn.Linear(4 -> 2)
+}
+```
 
-It accepts a parameter ``outputDim`` that represents the output dimension of the module weight it wraps, which defaults to 1. If the outputDim is not 1, the container will transpose the weight appropriately. If the module weight is not 2D, the container will view the weight into an appropriate 2D shape based on the outputDim specified by the user.
+And if we fill the `bias` of the linear module with NaNs and call `forward`:
+
+```lua
+nan = math.log(math.log(0)) -- this is a nan value
+linear.bias:fill(nan)
+mlp:forward(torch.randn(2,3))
+```
+
+We get a nice error message:
+```lua
+/usr/local/share/lua/5.1/nn/NaN.lua:39: NaN found in parameters of module :
+nn.NaN(2) @ nn.Linear(3 -> 4)
+```
+
+For a quick one-liner to catch NaNs anywhere inside a model (for example, a `nn.Sequential` or any other `nn.Container`), we can use this with the `nn.Module.replace` function:
+```lua
+model:replace(function(module) return nn.NaN(module) end)
+```
+
+<a name='nn.Profile'></a>
+## Profile ##
+
+```lua
+dmodule = nn.Profile(module, [print_interval, [name] ])
+```
+
+The `Profile` module times each forward and backward pass of the decorated `module`. It prints this information after `print_interval` passes, which is `100` by default. For timing multiple modules, the `name` argument allows this information to be printed accompanied by a name, which by default is the type of the decorated `module`.
+
+This is useful for profiling new modules you develop, and tracking down bottlenecks in the speed of a network.
+
+The timer and print statements add a small amount of overhead to the overall running time.
+
+As an example:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.Identity())
+mlp:add(nn.Linear(1000,1000))
+mlp:add(nn.Tanh())
+mlp:replace(function(module) return nn.Profile(module, 1000) end)
+inp = torch.randn(1000)
+gradOutput = torch.randn(1000)
+for i=1,1000 do
+   mlp:forward(inp)
+   mlp:backward(inp, gradOutput)
+end
+```
+
+results in the following profile information:
+
+```
+nn.Identity took 0.026 seconds for 1000 forward passes
+nn.Linear took 0.119 seconds for 1000 forward passes
+nn.Tanh took 0.061 seconds for 1000 forward passes
+nn.Tanh took 0.032 seconds for 1000 backward passes
+nn.Linear took 0.161 seconds for 1000 backward passes
+nn.Identity took 0.026 seconds for 1000 backward passes
+```
+
+It's good practice to profile modules only after a first forward and backward pass, since the initial pass often has to allocate memory. Thus, in the example above, you would run another 1000 forward and backward passes to time the modules in their normal mode of operation:
+
+```lua
+for i=1,1000 do
+   mlp:forward(inp)
+   mlp:backward(inp, gradOutput)
+end
+```
 
 <a name="nn.TableContainers"></a>
 ## Table Containers ##
@@ -341,3 +483,5 @@ While the above containers are used for manipulating input [Tensors](https://git
  * [ParallelTable](table.md#nn.ParallelTable)
 
 These, along with all other modules for manipulating tables can be found [here](table.md).
+
+
diff --git a/doc/convolution.md b/doc/convolution.md
index 3dde128..82d890e 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -14,6 +14,7 @@ A convolution is an integral that expresses the amount of overlap of one functio
     * [SpatialConvolution](#nn.SpatialConvolution) : a 2D convolution over an input image ;
     * [SpatialFullConvolution](#nn.SpatialFullConvolution) : a 2D full convolution over an input image ;
     * [SpatialDilatedConvolution](#nn.SpatialDilatedConvolution) : a 2D dilated convolution over an input image ;
+    * [SpatialDepthWiseConvolution](#nn.SpatialDepthWiseConvolution) : a 2D depth-wise convolution over an input image ;
     * [SpatialConvolutionLocal](#nn.SpatialConvolutionLocal) : a 2D locally-connected layer over an input image ;
     * [SpatialSubSampling](#nn.SpatialSubSampling) : a 2D sub-sampling over an input image ;
     * [SpatialMaxPooling](#nn.SpatialMaxPooling) : a 2D max-pooling operation over an input image ;
@@ -546,6 +547,51 @@ oheight = floor((height + 2 * padH - dilationH * (kH-1) - 1) / dH) + 1
 
 Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
 
+<a name="nn.SpatialDepthWiseConvolution"></a>
+### SpatialDepthWiseConvolution ###
+
+```lua
+module = nn.SpatialDepthWiseConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH])
+```
+
+Applies a 2D depth-wise convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D tensor (`nInputPlane x height x width`).
+
+It is similar to `SpatialConvolution`, but here a spatial convolution is performed independently over each channel of the input. The most noticeable difference is that the output dimension of `SpatialConvolution` is `nOutputPlane x oheight x owidth`, while for `SpatialDepthWiseConvolution` it is `(nOutputPlane x nInputPlane) x oheight x owidth`.
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padW`: Additional zeros added to the input plane data on both sides of the width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of the height axis. Default is `0`. `(kH-1)/2` is often used here.
+
+Note that depending on the size of your kernel, several (of the last)
+columns or rows of the input image might be lost. It is up to the user to
+add proper padding in images.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
+will be a 3D tensor `(nOutputPlane x nInputPlane) x oheight x owidth` where
+```lua
+owidth  = floor((width  + 2*padW - kW) / dW + 1)
+oheight = floor((height + 2*padH - kH) / dH + 1)
+```
+
+The parameters of the convolution can be found in `self.weight` (Tensor of
+size `nOutputPlane x nInputPlane x kH x kW`) and `self.bias` (Tensor of
+size `nOutputPlane x nInputPlane`). The corresponding gradients can be found in
+`self.gradWeight` and `self.gradBias`.
+
+The output value of the layer can be described as:
+```
+output[i][j] = input[j] * weight[i][j] + b[i][j], i = 1, ..., nOutputPlane, j = 1, ..., nInputPlane
+```
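+
+As a quick sanity check of the output shape (sizes are illustrative):
+
+```lua
+conv = nn.SpatialDepthWiseConvolution(3, 2, 5, 5) -- nInputPlane = 3, nOutputPlane = 2
+input = torch.randn(3, 32, 32)
+print(conv:forward(input):size()) -- (2*3) x 28 x 28
+```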
+
+Further information about the depth-wise convolution can be found in the following paper: [Xception: Deep Learning with Depthwise Separable Convolutions](https://arxiv.org/abs/1610.02357).
+
 <a name="nn.SpatialConvolutionLocal"></a>
 ### SpatialConvolutionLocal ###
 
diff --git a/doc/criterion.md b/doc/criterion.md
index cb2bbd0..a3e1b2e 100644
--- a/doc/criterion.md
+++ b/doc/criterion.md
@@ -95,10 +95,10 @@ criterion.sizeAverage = false
 ## ClassNLLCriterion ##
 
 ```lua
-criterion = nn.ClassNLLCriterion([weights])
+criterion = nn.ClassNLLCriterion([weights, sizeAverage, ignoreIndex])
 ```
 
-The negative log likelihood criterion. It is useful to train a classification problem with `n` classes.
+The negative log likelihood (NLL) criterion. It is useful to train a classification problem with `n` classes.
 If provided, the optional argument `weights` should be a 1D `Tensor` assigning weight to each of the classes.
 This is particularly useful when you have an unbalanced training set.
 
@@ -113,11 +113,21 @@ The loss can be described as:
 loss(x, class) = -x[class]
 ```
 
-or in the case of the `weights` argument it is specified as follows:
+or in the case of the `weights` argument, it is specified as follows:
 ```lua
 loss(x, class) = -weights[class] * x[class]
 ```
-Due to the behaviour of the backend code, it is necessary to set sizeAverage to false when calculating losses *in non-batch mode*.
+
+or in the case of the `ignoreIndex` argument:
+```
+loss(x, class) = class != ignoreIndex ? -weights[class] * x[class] : 0
+```
+
+The `ignoreIndex` argument (default `-100`) specifies a target value that is ignored.
+The corresponding `gradInput` for that target will be zero.
+When `sizeAverage=true` (the default), the `gradInput` and `output` are averaged over the non-ignored targets.
+
+Due to the behaviour of the backend code, it is necessary to set `sizeAverage` to false when calculating losses *in non-batch mode*.
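+
+A minimal sketch of `ignoreIndex` in batch mode (the ignored value `3` is illustrative):
+
+```lua
+criterion = nn.ClassNLLCriterion(nil, true, 3) -- ignore targets equal to 3
+input = nn.LogSoftMax():forward(torch.randn(4, 5))
+target = torch.LongTensor{1, 3, 2, 3} -- the 2nd and 4th samples contribute nothing
+loss = criterion:forward(input, target)
+```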
 
 The following is a code fragment showing how to make a gradient step given an input `x`, a desired output `y` (an integer `1` to `n`, in this case `n = 2` classes), a network `mlp` and a learning rate `learningRate`:
 
@@ -133,7 +143,7 @@ function gradUpdate(mlp, x, y, learningRate)
 end
 ```
 
-By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed for each minibatch.
+By default, the losses are averaged over observations for each minibatch. However, if the argument `sizeAverage` is set to `false`, the losses are instead summed for each minibatch.
 
 
 <a name="nn.CrossEntropyCriterion"></a>
@@ -419,7 +429,7 @@ Optionally, you can give non-equal weighting on the classes by passing a 1D `wei
 The loss function then becomes:
 
 ```lua
-loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] - x[i]))^p) / x:size(1)
+loss(x, y) = sum_i(max(0, w[y] * (margin - x[y] + x[i]))^p) / x:size(1)
 ```
 
 This criterion is especially useful for classification when used in conjunction with a module ending in the following output layer:
@@ -460,7 +470,7 @@ target = torch.Tensor{{1, 3, 0, 0}, {4, 0, 0, 0}} -- zero-values are ignored
 criterion:forward(input, target)
 ```
 
-<a name="nn.MultiLabelSoftMarginCriterion"/>
+<a name="nn.MultiLabelSoftMarginCriterion"></a>
 ## MultiLabelSoftMarginCriterion ##
 
 ```lua
@@ -758,7 +768,7 @@ Sample example
 
    tripleModel = nn.ParallelTable()
    tripleModel:add(embeddingModel)
-   tripleModel:add(embeddingModel:clone('weight', 'bias', 
+   tripleModel:add(embeddingModel:clone('weight', 'bias',
                                         'gradWeight', 'gradBias'))
    tripleModel:add(embeddingModel:clone('weight', 'bias',
                                         'gradWeight', 'gradBias'))
diff --git a/doc/simple.md b/doc/simple.md
old mode 100644
new mode 100755
index 09c60ca..7d19fd4
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -4,7 +4,9 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
 
   * Parameterized Modules :
     * [Linear](#nn.Linear) : a linear transformation ;
+    * [LinearWeightNorm](#nn.LinearWeightNorm) : a weight normalized linear transformation ;
     * [SparseLinear](#nn.SparseLinear) : a linear transformation with sparse inputs ;
+    * [IndexLinear](#nn.IndexLinear) : an alternative linear transformation for sparse inputs with max normalization ;
     * [Bilinear](#nn.Bilinear) : a bilinear transformation with sparse inputs ;
     * [PartialLinear](#nn.PartialLinear) : a linear transformation with sparse inputs with the option of only computing a subset ;
     * [Add](#nn.Add) : adds a bias term to the incoming data ;
@@ -99,6 +101,19 @@ x = torch.Tensor(10) -- 10 inputs
 y = module:forward(x)
 ```
 
+<a name="nn.LinearWeightNorm"></a>
+## LinearWeightNorm ##
+
+```lua
+module = nn.LinearWeightNorm(inputDimension, outputDimension, [bias = true])
+```
+
+LinearWeightNorm implements the reparametrization presented in [Weight Normalization](https://arxiv.org/pdf/1602.07868v3.pdf), which decouples the length of neural network weight vectors from their direction. The weight vector `w` is determined instead by parameters `g` and `v` such that `w = g * v / ||v||`, where `||v||` is the euclidean norm of vector `v`. In all other respects this layer behaves like `nn.Linear`.
+
+To convert between `nn.Linear` and `nn.LinearWeightNorm` you can use the `nn.LinearWeightNorm.fromLinear(linearModule)` and `weightNormModule:toLinear()` functions.
+
+Other layer types can make use of weight normalization through the [nn.WeightNorm](https://github.com/torch/nn/blob/master/doc/containers.md#nn.WeightNorm) container.
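+
+A minimal sketch of the conversion round-trip:
+
+```lua
+linear = nn.Linear(10, 5)
+wnorm = nn.LinearWeightNorm.fromLinear(linear) -- same mapping, reparametrized as g and v
+linear2 = wnorm:toLinear() -- back to a plain nn.Linear
+```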
+
 <a name="nn.SparseLinear"></a>
 ## SparseLinear ##
 
@@ -133,6 +148,66 @@ x = torch.Tensor({ {1, 0.1}, {2, 0.3}, {10, 0.3}, {31, 0.2} })
 
 The first column contains indices, the second column contains values in a vector where all other elements are zeros. The indices should not exceed the stated dimensions of the input to the layer (10000 in the example).
 
+<a name="nn.IndexLinear"></a>
+## IndexLinear ##
+
+```lua
+module = nn.IndexLinear(inputSize, outputSize, doGradInput, keysOffset, weight, bias, normalize)
+```
+
+Applies the following transformation to the incoming (optionally) normalized sparse input data:
+`z = Weight * y + bias`, where
+- `y_i = normalize and (x_i *  (1 / x_i_max) + b_i) or x_i`
+- `x_i` is the `i'th` feature of the input,
+- `b_i` is a per-feature bias,
+- `x_i_max` is the maximum absolute value seen so far during training for feature `i`.
+
+The normalization of input features is very useful to avoid explosions during training if sparse input values are very large. It also helps distinguish between the presence and the absence of a given feature.
+
+#### Parameters ####
+- `inputSize` is the maximum number of features.
+- `outputSize` is the number of output neurons.
+- `doGradInput`: if `false` (the default), the `gradInput` will not be computed.
+- `keysOffset` lets you specify that input keys are in the `[1+keysOffset, N+keysOffset]` range. (defaults to `0`)
+- `weight` and `bias` allow you to create the module with existing weights without using additional memory.
+  When passing `weight` and `bias`, `inputSize` and `outputSize` are inferred from the weights.
+- `normalize` will activate the normalization of the input feature values. (`false` by default)
+
+You can create an `IndexLinear` layer the following way:
+
+```lua
+-- 10000 inputs, 2 outputs, no grad input, no offset, no input weight/bias, max-norm on
+module = nn.IndexLinear(10000, 2, nil, 0, nil, nil, true)
+```
+
+#### Differences from SparseLinear ####
+- The layout of `weight` is transposed compared to `SparseLinear`. This was done for performance reasons.
+- The `gradWeight` that is computed for in-place updates is a sparse representation of the whole gradWeight matrix.
+Its size changes from one backward pass to another. This was also done for performance reasons.
+- The input format differs from the [SparseLinear](#nn.SparseLinear) input format by accepting keys and values as a table of tensors. This enables `IndexLinear` to have a larger range for keys than `SparseLinear`.
+
+The `input` tensors must be in one of the following formats.
+
+- An array of size 2 containing a batch of `keys` followed by a batch of `values`.
+```lua
+x = {
+      { torch.LongTensor({ 1, 200 }), torch.LongTensor({ 100, 200, 1000 }) },
+      { torch.Tensor({ 1, 0.1 }), torch.Tensor({ 10, 0.5, -0.5 }) }
+}
+```
+
+- An array of size 3 containing a flattened (pre-concatenated) batch of `keys`, followed by `values`, and `sizes`.
+```lua
+-- Equivalent to the input shown above
+x = {
+      torch.LongTensor({ 1, 200, 100, 200, 1000 }),
+      torch.Tensor({ 1, 0.1, 10, 0.5, -0.5 }),
+      torch.LongTensor({ 2, 3 })
+}
+```
+
+Note: The tensors representing `keys` and `sizes` must always be of type `LongTensor` / `CudaLongTensor`. The values can be either `FloatTensor` or `DoubleTensor`, or their cutorch equivalents.
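+
+With either format, the forward pass is then (a minimal sketch):
+
+```lua
+module = nn.IndexLinear(10000, 2)
+output = module:forward(x) -- the batch above has 2 samples, so output is 2x2
+```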
+
 <a name="nn.Bilinear"></a>
 ## Bilinear ##
 
@@ -556,11 +631,11 @@ This module is based on [nn.Sum](#nn.Sum).
 ## Sum ##
 
 ```lua
-module = nn.Sum(dimension, nInputDim, sizeAverage)
+module = nn.Sum(dimension, nInputDim, sizeAverage, squeeze)
 ```
 
 Applies a sum operation over dimension `dimension`.
-Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output.
+Hence, if an `nxpxq` Tensor was given as input, and `dimension` = `2` then an `nxq` matrix would be output. If the argument `squeeze` is set to `false`, then the output would instead be of size `nx1xq`.
 When `nInputDim` is provided, inputs larger than that value will be considered batches where the actual `dimension` to apply the sum operation will be dimension `dimension + 1`.
 Negative indexing is allowed by providing a negative value to `nInputDim`.
 When `sizeAverage` is provided, the sum is divided by the size of the input in this `dimension`. This is equivalent to the mean operation performed by the [nn.Mean](#nn.Mean) module.
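+
+A minimal sketch of the `squeeze` flag (assuming the other arguments keep their defaults when `nil`):
+
+```lua
+module = nn.Sum(2, nil, nil, false) -- keep the summed dimension
+input = torch.randn(3, 4, 5)
+print(module:forward(input):size()) -- 3 x 1 x 5
+```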
@@ -1183,6 +1258,19 @@ t:transpose(dim1, dim2)
 t:transpose(dim3, dim4)
 ```
 
+The method `setNumInputDims()` allows you to specify the expected number of dimensions of the module's inputs. This makes it possible to use minibatch inputs. Example:
+```lua
+b = 5 -- batch size 5
+input = torch.Tensor(b, 2, 4, 3) -- input: b x 2 x 4 x 3
+
+m = nn.Transpose({1,3})
+m:forward(input) -- output: 4 x 2 x b x 3
+
+numInputDims = 3 -- input feature map should be the last 3 dims
+m = nn.Transpose({1,3}):setNumInputDims(numInputDims)
+m:forward(input) -- output: b x 3 x 4 x 2
+```
+
 <a name="nn.Exp"></a>
 ## Exp ##
 
diff --git a/doc/table.md b/doc/table.md
index d5174a7..b3e2e5f 100644
--- a/doc/table.md
+++ b/doc/table.md
@@ -177,7 +177,7 @@ module = nn.MapTable(m, share)
 
 `MapTable` is a container for a single module which will be applied to all input elements. The member module is cloned as necessary to process all input elements. Call `resize(n)` to set the number of clones manually or call `clearState()` to discard all clones.
 
-Optionally, the module can be initialized with the contained module and with a list of parameters that are shared across all clones. By default, these parameters are `weight`, `bias`, `gradWeight` and `gradBias`.
+Optionally, the module can be initialized with the contained module and a boolean `share`, which indicates whether parameters are shared across all clones. By default `share` is `true`. The shared parameters are `weight`, `bias`, `gradWeight` and `gradBias`.
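+
+A minimal sketch (clones share parameters by default):
+
+```lua
+map = nn.MapTable(nn.Linear(10, 3))
+outputs = map:forward{torch.randn(10), torch.randn(10)} -- table of two 3-dim tensors
+```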
 
 ```
 +----------+         +-----------+
diff --git a/init.lua b/init.lua
old mode 100644
new mode 100755
index 97ec910..18c3c8c
--- a/init.lua
+++ b/init.lua
@@ -15,13 +15,20 @@ require('nn.Concat')
 require('nn.Parallel')
 require('nn.Sequential')
 require('nn.DepthConcat')
+
+require('nn.Decorator')
 require('nn.Bottle')
 require('nn.WeightNorm')
+require('nn.DontCast')
+require('nn.NaN')
+require('nn.Profile')
 
 require('nn.Linear')
+require('nn.LinearWeightNorm')
 require('nn.Bilinear')
 require('nn.PartialLinear')
 require('nn.SparseLinear')
+require('nn.IndexLinear')
 require('nn.Reshape')
 require('nn.View')
 require('nn.Contiguous')
@@ -106,6 +113,7 @@ require('nn.SpatialConvolutionLocal')
 require('nn.SpatialFullConvolution')
 require('nn.SpatialFullConvolutionMap')
 require('nn.SpatialConvolutionMM')
+require('nn.SpatialDepthWiseConvolution')
 require('nn.SpatialConvolutionMap')
 require('nn.SpatialDilatedConvolution')
 require('nn.SpatialSubSampling')
diff --git a/lib/THNN/generic/ClassNLLCriterion.c b/lib/THNN/generic/ClassNLLCriterion.c
index 0db3a8a..4cf37ae 100644
--- a/lib/THNN/generic/ClassNLLCriterion.c
+++ b/lib/THNN/generic/ClassNLLCriterion.c
@@ -9,12 +9,14 @@ void THNN_(ClassNLLCriterion_updateOutput)(
           THTensor *output,
           bool sizeAverage,
           THTensor *weights,
-          THTensor *total_weight)
+          THTensor *total_weight,
+          long ignore_index)
 {
   THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
   THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1);
   int n_dims = THTensor_(nDimension)(input);
   int n_classes = THTensor_(size)(input, n_dims - 1);
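+  /* cur_target below is converted to 0-based indexing, so shift ignore_index to match */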
+  ignore_index -= TH_INDEX_BASE;
 
   if (THIndexTensor_(nDimension)(target) > 1) {
     THError("multi-target not supported");
@@ -42,9 +44,11 @@ void THNN_(ClassNLLCriterion_updateOutput)(
 
   if (THTensor_(nDimension)(input) == 1) {
     int cur_target = target_data[0] - TH_INDEX_BASE;
-    THAssert(cur_target >= 0 && cur_target < n_classes);
-    total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
-    output_data[0] = -input_data[cur_target] * total_weight_data[0];
+    if (cur_target != ignore_index) {
+      THAssert(cur_target >= 0 && cur_target < n_classes);
+      total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
+      output_data[0] = -input_data[cur_target] * total_weight_data[0];
+    }
   } else if (THTensor_(nDimension)(input) == 2) {
     int batch_size = THTensor_(size)(input, 0);
     THAssert(THIndexTensor_(size)(target, 0) == batch_size);
@@ -54,11 +58,13 @@ void THNN_(ClassNLLCriterion_updateOutput)(
     int i;
     for (i = 0; i < batch_size; i++) {
       int cur_target = target_data[i] - TH_INDEX_BASE;
-      THAssert(cur_target >= 0 && cur_target < n_classes);
+      if (cur_target != ignore_index) {
+        THAssert(cur_target >= 0 && cur_target < n_classes);
 
-      real cur_weight = weights ? weights_data[cur_target] : 1.0f;
-      total_weight_data[0] += cur_weight;
-      output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+        real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+        total_weight_data[0] += cur_weight;
+        output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+      }
     }
   }
 
@@ -80,10 +86,12 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
           THTensor *gradInput,
           bool sizeAverage,
           THTensor *weights,
-          THTensor *total_weight)
+          THTensor *total_weight,
+          long ignore_index)
 {
   int n_dims = THTensor_(nDimension)(input);
   int n_classes = THTensor_(size)(input, n_dims - 1);
+  ignore_index -= TH_INDEX_BASE;
 
   if (!THTensor_(isContiguous)(gradInput)) {
     THError("gradInput must be contiguous");
@@ -102,7 +110,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
   if (THTensor_(nDimension)(input) > 2) {
     THError("input tensor should be 1D or 2D");
   }
-  
+
   if (weights && THTensor_(nElement)(weights) != n_classes) {
     THError("weight tensor should be defined either for all or no classes");
   }
@@ -116,10 +124,12 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
 
   if (THTensor_(nDimension)(input) == 1) {
     int cur_target = target_data[0] - TH_INDEX_BASE;
-    THAssert(cur_target >= 0 && cur_target < n_classes);
+    if (cur_target != ignore_index) {
+      THAssert(cur_target >= 0 && cur_target < n_classes);
 
-    gradInput_data[cur_target] =
-      (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+      gradInput_data[cur_target] =
+        (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+    }
 
   } else if (THTensor_(nDimension)(input) == 2) {
     int batch_size = THTensor_(size)(input, 0);
@@ -131,13 +141,15 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
     for (i = 0; i < batch_size; i++){
       int cur_target = target_data[i] - TH_INDEX_BASE;
 
-      THAssert(cur_target >= 0 && cur_target < n_classes);
+      if (cur_target != ignore_index) {
+        THAssert(cur_target >= 0 && cur_target < n_classes);
 
-      gradInput_data[i * n_target + cur_target] =
-        -(weights ? weights_data[cur_target] : 1.0f);
+        gradInput_data[i * n_target + cur_target] =
+          -(weights ? weights_data[cur_target] : 1.0f);
 
-      if (sizeAverage && *total_weight_data) {
-        gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+        if (sizeAverage && *total_weight_data) {
+          gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+        }
       }
     }
   }
diff --git a/lib/THNN/generic/FusedRNNKernel.c b/lib/THNN/generic/FusedRNNKernel.c
new file mode 100644
index 0000000..6126e86
--- /dev/null
+++ b/lib/THNN/generic/FusedRNNKernel.c
@@ -0,0 +1,53 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/FusedRNNKernel.c"
+#else
+
+void THNN_(GRUFused_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *bias1,
+          THTensor *bias2,
+          THTensor *hx,
+          THTensor *hy)
+{
+  THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(GRUFused_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *gradOutput,
+          THTensor *gradInput)
+{
+  THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *bias1,
+          THTensor *bias2,
+          THTensor *cx,
+          THTensor *hy,
+          THTensor *cy)
+{
+  THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *prevC,
+          THTensor *cy,
+          THTensor *gradOutput,
+          THTensor *gradOutputCell,
+          THTensor *gradInput)
+{
+  THAssertMsg(false, "Not implemented for CPU");
+}
+
+#endif
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
index b38a946..589a66e 100644
--- a/lib/THNN/generic/HardTanh.c
+++ b/lib/THNN/generic/HardTanh.c
@@ -97,7 +97,7 @@ void THNN_(HardTanh_updateGradInput)(
     }
     else
       TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
-        if (*input_data < min_val || *input_data > max_val)
+        if (*input_data <= min_val || *input_data >= max_val)
           *gradInput_data = 0;
         else
           *gradInput_data = *gradOutput_data;
@@ -122,7 +122,7 @@ void THNN_(HardTanh_updateGradInput)(
 #pragma omp parallel for private(i)
       for (i = 0; i < n; i++)
       {
-        if (ptr_input[i] < min_val || ptr_input[i] > max_val)
+        if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
           ptr_gradInput[i] = 0;
         else
           ptr_gradInput[i] = ptr_gradOutput[i];
diff --git a/lib/THNN/generic/IndexLinear.c b/lib/THNN/generic/IndexLinear.c
new file mode 100644
index 0000000..42d8368
--- /dev/null
+++ b/lib/THNN/generic/IndexLinear.c
@@ -0,0 +1,742 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/IndexLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+/* Threshold used to trigger multithreading */
+#ifndef THNN_SPARSE_OMP_THRESHOLD
+#define THNN_SPARSE_OMP_THRESHOLD 100000
+#endif
+
+/* Threshold used to trigger BLAS axpy call */
+#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
+#define THNN_SPARSE_OUTDIM_THRESHOLD 49
+#endif
+
+/* sign MACRO */
+#ifndef THNN_INDEXLINEAR_SIGN
+#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 )  ?  -1   : ( (a) > 0 ) )
+#endif
+
+static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
+{
+  return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
+                && THTensor_(nDimension)(values) == 1
+                && THLongTensor_nDimension(keys) == 1;
+}
+
+void THNN_(IndexLinear_updateOutput)(
+          THNNState *state,
+          THLongTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THLongTensor *sizes,
+          THLongTensor *cumSumSizes,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *normalizedValues,
+          int  train)
+{
+  /* Retrieve all the dimensions of the problem */
+  long batchSize = THLongTensor_size(sizes, 0);
+  long keysSize = THLongTensor_size(keys, 0);
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  int maxNormalize = woutDim - outDim;
+  long* sizesData = THLongTensor_data(sizes);
+  long* cumSumSizesData = THLongTensor_data(cumSumSizes);
+
+  /* Define/resize the normalized values tensor if maxNormalize is > 0 */
+  real* normalizedValuesData = NULL;
+  if (maxNormalize)
+  {
+    THTensor_(resize1d)(normalizedValues, keysSize);
+    normalizedValuesData = THTensor_(data)(normalizedValues);
+  }
+
+  /* Resize the output */
+  THTensor_(resize2d)(output, batchSize, outDim);
+
+  /* Access the storage data/strides */
+  real* outputData = THTensor_(data)(output);
+  real* valuesData = THTensor_(data)(values);
+  real* weightData = THTensor_(data)(weight);
+  long weightStride0 = weight->stride[0];
+  real* biasData = THTensor_(data)(bias);
+  long* keysData = THLongTensor_data(keys);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");
+  long i,j,k;
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations. */
+  if (outDim == 1)
+  {
+    THVector_(fill)(outputData, *biasData, batchSize);
+    if (maxNormalize)
+    {
+      /* Parallelize on the batch itself */
+#pragma omp parallel                                                    \
+    for private(i,j)                                                    \
+    firstprivate(outDim, keysOffset,                                    \
+                 weightData, keysData,                                  \
+                 valuesData, outputData,                                \
+                 cumSumSizesData, sizesData)                            \
+    schedule(static)                                                    \
+    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+      for (j = 0; j < batchSize; j++)
+      {
+        real* loutputData = outputData + j;
+        real val = 0;
+        real absVal = 0;
+        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+
+        for (i = 0; i < sizesData[j]; i++)
+        {
+          long woffset = weightStride0*(keysData[offset] + keysOffset);
+          absVal = fabs(valuesData[offset]);
+          if (train)
+          {
+            if (absVal > weightData[woffset])
+            {
+              weightData[woffset] = absVal;
+              weightData[woffset+1] = 1/absVal;
+            }
+
+            /*
+             * The following can be used to scale the size of the updates
+             * depending on some rule, e.g. the frequency of a feature, ...
+             * This is used at update time.
+             * TODO: implement a smarter update scale.
+             */
+            weightData[woffset+2] = 1;
+          }
+          normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
+          val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
+          offset++;
+        }
+        *loutputData += val;
+      }
+    }
+    else
+    {
+      /* Parallelize on the batch itself */
+#pragma omp parallel                                                    \
+    for private(i,j)                                                    \
+    firstprivate(outDim, weightData,                                    \
+                 keysData, valuesData,                                  \
+                 outputData, cumSumSizesData,                           \
+                 sizesData)                                             \
+    schedule(static)                                                    \
+    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+      for (j = 0; j < batchSize; j++)
+      {
+        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+        real* loutputData = outputData + j;
+        real val = 0;
+
+        for (i = 0; i < sizesData[j]; i++)
+        {
+          val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
+          offset++;
+        }
+        *loutputData += val;
+      }
+    }
+  }
+  else {
+#pragma omp parallel                                                    \
+    for private(i,j,k)                                                  \
+    firstprivate(outDim, weightData,                                    \
+                 keysData, valuesData,                                  \
+                 biasData, outputData,                                  \
+                 cumSumSizesData, sizesData)                            \
+    schedule(static)                                                    \
+    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+    for (j = 0; j < batchSize; j++)
+    {
+      long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+      real val = 0;
+      real* loutputData = outputData + j*outDim;
+      real* lweightData = weightData;
+      memcpy(loutputData, biasData, outDim*sizeof(real));
+      for (i = 0; i < sizesData[j]; i++)
+      {
+        real val;
+        long woffset = weightStride0*(keysData[offset] + keysOffset);
+        if (maxNormalize)
+        {
+          val = valuesData[offset];
+          real absVal = fabs(val);
+          if (train)
+          {
+            if (absVal > weightData[woffset])
+            {
+              weightData[woffset] = absVal;
+              weightData[woffset+1] = 1/absVal;
+            }
+
+            /*
+             * The following can be used to scale the size of the updates
+             * depending on some rule, e.g. the frequency of a feature, ...
+             * The commented section thereafter is just an example of what can be done:
+             *
+             *```
+             * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
+             * real alpha = 1;
+             * real beta = 0.01;
+             * real gamma = 1 - 0.000001;
+             * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
+             * l = gamma*l;
+             * weightData[woffset+2] = (alpha-beta)*l + beta;
+             * ```
+             *
+             * TODO: implement a smarter update scale.
+             */
+            weightData[woffset+2] = 1;
+          }
+
+          /* Normalize + Clamp */
+          val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
+          normalizedValuesData[offset] = val;
+
+          lweightData = weightData + woffset + maxNormalize;
+        }
+        else
+        {
+          val = valuesData[offset];
+          lweightData = weightData + woffset;
+        }
+        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+        {
+          THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
+        }
+        else
+        {
+          for (k=0; k < outDim; k++)
+          {
+            loutputData[k] += lweightData[k] * val;
+          }
+        }
+        offset++;
+      }
+    }
+  }
+  return;
+}
+
+void THNN_(IndexLinear_updateParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THLongTensor *runningKeys,
+          THLongTensor *cumSumSizes,
+          long keysOffset,
+          accreal weightDecay_,
+          accreal learningRate_)
+{
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+  /* Retrieve all the dimensions of the problem */
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  int maxNormalize = woutDim - outDim;
+  long keysSize = THLongTensor_size(runningKeys, 0);
+
+  /* Access the storage data/strides */
+  real* gradWeightData = THTensor_(data)(gradWeight);
+  real* weightData = THTensor_(data)(weight);
+  long weightStride0 = weight->stride[0];
+  real* gradBiasData = THTensor_(data)(gradBias);
+  real* biasData = THTensor_(data)(bias);
+  long* keysData = THLongTensor_data(runningKeys);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
+  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");
+
+  int j,k;
+  long offset = 0;
+
+  /* Update the bias first */
+  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations.
+   * No multithreading here as this could
+   * corrupt the results (hogwild style) */
+  if (outDim == 1)
+  {
+    if (maxNormalize)
+    {
+      if (weightDecay)
+      {
+        for (j = 0; j < keysSize; j++)
+        {
+          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+          real lr = learningRate*weightData[woffset-2];
+          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
+        }
+      }
+      else
+      {
+        for (j = 0; j < keysSize; j++)
+        {
+          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+          real lr = learningRate*weightData[woffset-2];
+          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+          weightData[woffset] -= gradWeightData[2*j+1]*lr;
+        }
+      }
+    }
+    else
+    {
+      if (weightDecay)
+      {
+        for (j = 0; j < keysSize; j++)
+        {
+          long woffset = weightStride0*(keysData[j] + keysOffset);
+          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
+        }
+      }
+      else
+      {
+        for (j = 0; j < keysSize; j++)
+        {
+          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
+        }
+      }
+    }
+  }
+  else
+  {
+    for (j = 0; j < keysSize; j++)
+    {
+      real lr = learningRate;
+      real wd = weightDecay;
+      real* lweightData;
+      long woffset = weightStride0*(keysData[j] + keysOffset);
+      real* lgradWeightData = gradWeightData + j*outDim;
+      if (maxNormalize)
+      {
+        lgradWeightData += j*outDim;
+        /* weightData[woffset + 2] */
+        lweightData = weightData + woffset + maxNormalize - 2;
+        lr = lr*lweightData[0];
+        wd = weightDecay*lweightData[0];
+        /* weightData[woffset + 3] */
+        lweightData++;
+        for (k=0; k < outDim; k++)
+        {
+            lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
+        }
+        lweightData++;
+        lgradWeightData += outDim;
+      }
+      else
+      {
+        lweightData = weightData + woffset;
+      }
+
+      /* We do sparse weight decay.
+       * We think it makes more sense. */
+      if (weightDecay)
+      {
+        for (k=0; k < outDim; k++)
+        {
+            lweightData[k] -= lweightData[k]*wd;
+        }
+      }
+
+      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+      {
+        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
+      }
+      else
+      {
+        for (k=0; k < outDim; k++)
+        {
+          lweightData[k] -= lgradWeightData[k]*lr;
+        }
+      }
+    }
+  }
+}
+
+
+void THNN_(IndexLinear_accUpdateGradParameters)(
+          THNNState *state,
+          THLongTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THLongTensor *sizes,
+          THLongTensor *cumSumSizes,
+          THTensor *gradOutput,
+          THTensor *weight,
+          THTensor *bias,
+          accreal weightDecay_,
+          accreal scale_)
+{
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+  /* Retrieve all the dimensions of the problem */
+  long batchSize = THLongTensor_size(sizes, 0);
+  long keysSize = THLongTensor_size(keys, 0);
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  int maxNormalize = woutDim - outDim;
+  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+
+  /* Access the storage data/strides */
+  real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+  real* weightData = THTensor_(data)(weight);
+  real* biasData = THTensor_(data)(bias);
+  long weightStride0 = weight->stride[0];
+  long biasStride = bias->stride[0];
+  long* keysData = THLongTensor_data(keys);
+  long* sizesData = THLongTensor_data(sizes);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous");
+
+  int i,j,k;
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations.
+   * No multithreading here as this could
+   * corrupt the results (hogwild style) */
+  if (outDim == 1)
+  {
+    if (maxNormalize)
+    {
+        long offset = 0;
+        for (j = 0; j < batchSize; j++)
+        {
+          real* lgradOutputData = gradOutputData + j;
+          *biasData -= *lgradOutputData * scale;
+          real val = *lgradOutputData * scale;
+          real* lweightData = weightData;
+          for (i = 0; i < sizesData[j]; i++)
+          {
+            long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+            weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
+            weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
+            offset++;
+          }
+        }
+
+        offset = 0;
+        for (j = 0; j < batchSize; j++)
+        {
+          real* lweightData = weightData;
+          for (i = 0; i < sizesData[j]; i++)
+          {
+            long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+            weightData[idx-2] = 0;
+            offset++;
+          }
+        }
+    }
+    else
+    {
+      if (weightDecay)
+      {
+        long offset = 0;
+        for (j = 0; j < batchSize; j++)
+        {
+          real* lgradOutputData = gradOutputData + j;
+          *biasData -= *lgradOutputData * scale;
+          real val = *lgradOutputData * scale;
+          real* lweightData = weightData;
+          for (i = 0; i < sizesData[j]; i++)
+          {
+            long idx = weightStride0*(keysData[offset] + keysOffset);
+            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
+            offset++;
+          }
+        }
+      }
+      else
+      {
+        long offset = 0;
+        for (j = 0; j < batchSize; j++)
+        {
+          real val = gradOutputData[j] * scale;
+          for (i = 0; i < sizesData[j]; i++)
+          {
+            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
+            offset++;
+          }
+          *biasData -= val;
+        }
+      }
+    }
+  }
+  else {
+    long offset = 0;
+    for (j = 0; j < batchSize; j++)
+    {
+      real val = 0;
+      real* lgradOutputData = gradOutputData + j*outDim;
+      real* lweightData = weightData;
+      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
+      for (i = 0; i < sizesData[j]; i++)
+      {
+        real val = valuesData[offset] * scale;
+        real wd = weightDecay;
+
+        // Max normalize case
+        if (maxNormalize)
+        {
+          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+          val *= lweightData[0];
+          wd *= lweightData[0];
+          for (k=0; k < outDim; k++)
+          {
+            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
+          }
+          lweightData += 2;
+        }
+        else
+        {
+          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
+        }
+
+        /* We do sparse weight decay.
+         * We think it makes more sense. */
+        if (weightDecay)
+        {
+          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+          {
+            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
+          }
+          else
+          {
+            for (k=0; k < outDim; k++)
+            {
+              lweightData[k] -= wd * lweightData[k];
+            }
+          }
+        }
+
+        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+        {
+          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
+        }
+        else
+        {
+          for (k=0; k < outDim; k++)
+          {
+            lweightData[k] -= val * lgradOutputData[k];
+          }
+        }
+        offset++;
+      }
+    }
+
+    /* Max Normalize case:
+     * Reset the smart update scaling if
+     * one does it batch-wise.
+     * TODO: Decide what to do with that piece of code.
+     * NB: If the code below is uncommented, so should the commented
+     * code in IndexLinear:zeroGradParameters() */
+
+    /*
+    if (maxNormalize)
+    {
+      offset = 0;
+      for (j = 0; j < batchSize; j++)
+      {
+        real* lweightData = weightData;
+        for (i = 0; i < sizesData[j]; i++)
+        {
+          real val = valuesData[offset] * scale;
+          real wd = weightDecay;
+
+          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+          lweightData[0] = 0;
+          offset++;
+        }
+      }
+    }
+    */
+  }
+  return;
+}
+
+void THNN_(IndexLinear_accGradParameters)(
+          THNNState *state,
+          THLongTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THLongTensor *sizes,
+          THLongTensor *cumSumSizes,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *valuesBuffer,
+          accreal weightDecay_,
+          accreal scale_)
+{
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+  /* Retrieve all the dimensions of the problem */
+  long batchSize = THLongTensor_size(sizes, 0);
+  long keysSize = THLongTensor_size(keys, 0);
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
+  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+  long* sizesData = THLongTensor_data(sizes);
+
+  /* Compute the cumulative sizes */
+  THLongTensor* cumSizes = THLongTensor_new();
+  THLongTensor_cumsum(cumSizes, sizes, 0);
+  long* cumSizesData = THLongTensor_data(cumSizes);
+
+  /* Resize the gradWeight buffer to keep it dense.
+   * That speeds up updates A LOT assuming random mem access. */
+  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));
+
+  /* Access the storage data/strides */
+  real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+  real* gradWeightData = THTensor_(data)(gradWeight);
+  real* weightData = THTensor_(data)(weight);
+  real* gradBiasData = THTensor_(data)(gradBias);
+  long gradWeightStride0 = gradWeight->stride[0];
+  long weightStride0 = weight->stride[0];
+  long* keysData = THLongTensor_data(keys);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");
+
+  int i,j,k;
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations.
+   * No multithreading here as this could
+   * corrupt the results (hogwild style) */
+  if (outDim == 1)
+  {
+    for (j = 0; j < batchSize; j++)
+    {
+      long offset = j==0?0:cumSizesData[j-1];
+      real val = gradOutputData[j] * scale;
+      real* lgradWeightData = gradWeightData + offset;
+      real* lvaluesData = valuesData + offset;
+      long end = sizesData[j];
+
+      if (maxNormalize)
+      {
+        lgradWeightData += offset;
+        i = 0;
+        for(;i < end; i++)
+        {
+          lgradWeightData[2*i] = val;
+          lgradWeightData[2*i+1] = val * lvaluesData[i];
+        }
+      }
+      else
+      {
+        i = 0;
+        for(;i < end-4; i += 4)
+        {
+          lgradWeightData[i] = val * lvaluesData[i];
+          lgradWeightData[i+1] = val * lvaluesData[i+1];
+          lgradWeightData[i+2] = val * lvaluesData[i+2];
+          lgradWeightData[i+3] = val * lvaluesData[i+3];
+        }
+
+        for(; i < end; i++)
+        {
+          lgradWeightData[i] = val * lvaluesData[i];
+        }
+      }
+      *gradBiasData += val;
+      offset += end;
+    }
+  }
+  else {
+    for (j = 0; j < batchSize; j++)
+    {
+      long offset = j==0?0:cumSizesData[j-1];
+      real val = 0;
+      real* lgradOutputData = gradOutputData + j*outDim;
+      real* lgradWeightData = gradWeightData;
+      real* lweightData = weightData;
+      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
+      for (i = 0; i < sizesData[j]; i++)
+      {
+        real val = valuesData[offset] * scale;
+        lgradWeightData = gradWeightData + offset*outDim;
+        if (maxNormalize)
+        {
+          lgradWeightData += offset*outDim;
+          k = 0;
+          for(;k < outDim-4; k += 4)
+          {
+            lgradWeightData[k] = lgradOutputData[k]*scale;
+            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
+            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
+            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
+          }
+
+          for(; k < outDim; k++)
+          {
+            lgradWeightData[k] = lgradOutputData[k]*scale;
+          }
+          lgradWeightData += outDim;
+        }
+        k = 0;
+        for(;k < outDim-4; k += 4)
+        {
+          lgradWeightData[k] = val * lgradOutputData[k];
+          lgradWeightData[k+1] = val * lgradOutputData[k+1];
+          lgradWeightData[k+2] = val * lgradOutputData[k+2];
+          lgradWeightData[k+3] = val * lgradOutputData[k+3];
+        }
+
+        for(; k < outDim; k++)
+        {
+          lgradWeightData[k] = val * lgradOutputData[k];
+        }
+        offset++;
+      }
+    }
+  }
+  THLongTensor_free(cumSizes);
+  return;
+}
+#endif
diff --git a/lib/THNN/generic/Linear.c b/lib/THNN/generic/Linear.c
index faef421..8c5cd11 100644
--- a/lib/THNN/generic/Linear.c
+++ b/lib/THNN/generic/Linear.c
@@ -42,9 +42,10 @@ void THNN_(Linear_updateOutput)(
       THTensor_(zero)(output);
     }
     THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
-    THTensor_(transpose)(weight,weight,0,1);
-    THTensor_(addmm)(output,0,output,1,input,weight);
-    THTensor_(transpose)(weight,weight,0,1);
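+    /* transpose into a separate view so weight itself is left unmodified */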
+    THTensor *tweight = THTensor_(new)();
+    THTensor_(transpose)(tweight,weight,0,1);
+    THTensor_(addmm)(output,0,output,1,input,tweight);
+    THTensor_(free)(tweight);
     if (bias) {
       THTensor_(addr)(output,1,output,1,addBuffer,bias);
     }
@@ -67,9 +68,10 @@ void THNN_(Linear_updateGradInput)(
 
     long dim = THTensor_(nDimension)(input);
     if (dim == 1) {
-      THTensor_(transpose)(weight,weight,0,1);
-      THTensor_(addmv)(gradInput,0,gradInput,1,weight,gradOutput);
-      THTensor_(transpose)(weight,weight,0,1);
+      THTensor *tweight = THTensor_(new)();
+      THTensor_(transpose)(tweight,weight,0,1);
+      THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput);
+      THTensor_(free)(tweight);
     }
     else if (dim == 2) {
       THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight);
@@ -98,13 +100,14 @@ void THNN_(Linear_accGradParameters)(
     }
   }
   else if (dim == 2) {
-    THTensor_(transpose)(gradOutput,gradOutput,0,1);
-    THTensor_(addmm)(gradWeight,1,gradWeight,scale,gradOutput,input);
+    THTensor *tgradOutput = THTensor_(new)();
+    THTensor_(transpose)(tgradOutput,gradOutput,0,1);
+    THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input);
     if (bias) {
       THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
-      THTensor_(addmv)(gradBias,1,gradBias,scale,gradOutput,addBuffer);
+      THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer);
     }
-    THTensor_(transpose)(gradOutput,gradOutput,0,1);
+    THTensor_(free)(tgradOutput);
   }
 }
 
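This hunk sets the pattern the rest of the commit repeats: rather than transposing a parameter tensor in place, running the BLAS call, and transposing back, the code builds a throwaway transposed view and frees it, so the shared tensor is never mutated (the in-place version is visible to concurrent readers, and its restoring transpose is skipped if an error fires in between). A self-contained sketch of the new shape, written against the float instantiation of the generic API:

    #include <TH/TH.h>

    /* y = x * w^T without ever mutating w. The transpose call points a
     * fresh header at w's storage with sizes/strides swapped, so no
     * element data is copied. */
    static void mm_with_transposed(THFloatTensor *y, THFloatTensor *x,
                                   THFloatTensor *w)
    {
        THFloatTensor *t = THFloatTensor_new();
        THFloatTensor_transpose(t, w, 0, 1);
        THFloatTensor_addmm(y, 0, y, 1, x, t);   /* y = 0*y + 1*(x . t) */
        THFloatTensor_free(t);
    }
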
diff --git a/lib/THNN/generic/PReLU.c b/lib/THNN/generic/PReLU.c
index 174f514..488322f 100644
--- a/lib/THNN/generic/PReLU.c
+++ b/lib/THNN/generic/PReLU.c
@@ -22,31 +22,18 @@ void THNN_(PReLU_updateOutput)(
   else
   {
     input = THTensor_(newContiguous)(input);
-    long bs, ks;
+    long bs = 1, ks = 1;
     {
       long input_ndim = THTensor_(nDimension)(input);
-      switch (input_ndim)
-      {
-        case 1:
-          bs = 1;
-          ks = 1;
-          break;
-        case 2:
-          bs = input->size[0];
-          ks = 1;
-          break;
-        case 3:
-          bs = 1;
-          ks = input->size[1] * input->size[2];
-          break;
-        case 4:
+      if (input->size[input_ndim > 1] != nOutputPlane)
+        THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+      if (input_ndim > 1) {
           bs = input->size[0];
-          ks = input->size[2] * input->size[3];
-          break;
+          for (int d = 2; d < input_ndim; d++) {
+              ks *= input->size[d];
+          }
       }
-
-      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
-        THError("wrong number of input planes");
     }
 
     real *output_data = THTensor_(data)(output);
@@ -95,36 +82,24 @@ void THNN_(PReLU_updateGradInput)(
   {
     input = THTensor_(newContiguous)(input);
     gradOutput = THTensor_(newContiguous)(gradOutput);
+    weight = THTensor_(newContiguous)(weight);
     const real *input_data = THTensor_(data)(input);
     const real *gradOutput_data = THTensor_(data)(gradOutput);
     const real *weight_data = THTensor_(data)(weight);
     real *gradInput_data = THTensor_(data)(gradInput);
 
-    long bs, ks;
+    long bs = 1, ks = 1;
     {
       long input_ndim = THTensor_(nDimension)(input);
-      switch (input_ndim)
-      {
-        case 1:
-          bs = 1;
-          ks = 1;
-          break;
-        case 2:
-          bs = input->size[0];
-          ks = 1;
-          break;
-        case 3:
-          bs = 1;
-          ks = input->size[1] * input->size[2];
-          break;
-        case 4:
+      if (input->size[input_ndim > 1] != nOutputPlane)
+        THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+      if (input_ndim > 1) {
           bs = input->size[0];
-          ks = input->size[2] * input->size[3];
-          break;
+          for (int d = 2; d < input_ndim; d++) {
+              ks *= input->size[d];
+          }
       }
-
-      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
-        THError("wrong number of input planes");
     }
 
     THIndex_t i, j, k;
@@ -152,6 +127,7 @@ void THNN_(PReLU_updateGradInput)(
     }
     THTensor_(free)(input);
     THTensor_(free)(gradOutput);
+    THTensor_(free)(weight);
   }
 }
 
@@ -169,10 +145,10 @@ void THNN_(PReLU_accGradParameters)(
 {
   real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_CHECK_NELEMENT(input, gradOutput);
-  real *gradWeight_data = THTensor_(data)(gradWeight);
 
   if (nOutputPlane == 0)
   {
+    real *gradWeight_data = THTensor_(data)(gradWeight);
     real sum = 0;
     TH_TENSOR_APPLY2(real, input, real, gradOutput,
       if ((*input_data) <= 0)
@@ -182,33 +158,22 @@ void THNN_(PReLU_accGradParameters)(
   }
   else
   {
+    THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous");
     input = THTensor_(newContiguous)(input);
     gradOutput = THTensor_(newContiguous)(gradOutput);
-    long bs, ks;
+    weight = THTensor_(newContiguous)(weight);
+    long bs = 1, ks = 1;
     {
       long input_ndim = THTensor_(nDimension)(input);
-      switch (input_ndim)
-      {
-        case 1:
-          bs = 1;
-          ks = 1;
-          break;
-        case 2:
-          bs = input->size[0];
-          ks = 1;
-          break;
-        case 3:
-          bs = 1;
-          ks = input->size[1] * input->size[2];
-          break;
-        case 4:
+      if (input->size[input_ndim > 1] != nOutputPlane)
+        THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+      if (input_ndim > 1) {
           bs = input->size[0];
-          ks = input->size[2] * input->size[3];
-          break;
+          for (int d = 2; d < input_ndim; d++) {
+            ks *= input->size[d];
+          }
       }
-
-      if (input->size[(input_ndim + 1) % 2] != nOutputPlane)
-        THError("wrong number of input planes");
     }
 
     const real *input_data = THTensor_(data)(input);
@@ -235,6 +200,7 @@ void THNN_(PReLU_accGradParameters)(
     }
     THTensor_(free)(input);
     THTensor_(free)(gradOutput);
+    THTensor_(free)(weight);
   }
 }
 
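The three PReLU hunks replace a 1-4D switch with shape math valid for any rank: bs is the sample count, ks the element count per channel, and the channel axis is 0 for 1D input and 1 otherwise, which is what the terse `input->size[input_ndim > 1]` index encodes. An illustrative, unoptimized rendering of the layout the kernels assume for a contiguous (bs, C, ...) tensor:

    /* With ks = d2 * ... * dk for input shape (bs, C, d2, ..., dk),
     * element (i, j, k) of a contiguous tensor lives at (i*C + j)*ks + k;
     * PReLU applies the per-channel slope w[j] to negative values. */
    static void prelu_forward_ref(float *out, const float *in, const float *w,
                                  long bs, long C, long ks)
    {
        for (long i = 0; i < bs; i++)
            for (long j = 0; j < C; j++)
                for (long k = 0; k < ks; k++) {
                    long idx = (i * C + j) * ks + k;
                    out[idx] = in[idx] > 0 ? in[idx] : w[j] * in[idx];
                }
    }
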
diff --git a/lib/THNN/generic/Sigmoid.c b/lib/THNN/generic/Sigmoid.c
index f48cb0f..17fb2cb 100644
--- a/lib/THNN/generic/Sigmoid.c
+++ b/lib/THNN/generic/Sigmoid.c
@@ -7,11 +7,7 @@ void THNN_(Sigmoid_updateOutput)(
           THTensor *input,
           THTensor *output)
 {
-  THTensor_(resizeAs)(output, input);
-
-  TH_TENSOR_APPLY2(real, output, real, input,
-    *output_data = 1./(1.+ exp(- *input_data));
-  );
+  THTensor_(sigmoid)(output, input);
 }
 
 void THNN_(Sigmoid_updateGradInput)(
@@ -21,7 +17,7 @@ void THNN_(Sigmoid_updateGradInput)(
           THTensor *gradInput,
           THTensor *output)
 {
-  THNN_CHECK_NELEMENT(input, gradOutput);
+  THNN_CHECK_NELEMENT(output, gradOutput);
   THTensor_(resizeAs)(gradInput, output);
   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
     real z = *output_data;
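
Two small fixes in Sigmoid.c: the forward now defers to THTensor_(sigmoid), which handles the resize itself, and the element-count check moves to `output` because the backward formula never reads `input`: with z = sigmoid(x), d/dx sigmoid(x) = z(1 - z), so the saved output is all the gradient needs (the [OPTIONAL] marking of `input` in the THNN.h hunk further down records the same fact). Reference form of the math:

    /* gin[i] = gout[i] * z[i] * (1 - z[i]), z being the saved forward
     * output; no forward input required. */
    static void sigmoid_backward_ref(float *gin, const float *gout,
                                     const float *z, long n)
    {
        for (long i = 0; i < n; i++)
            gin[i] = gout[i] * z[i] * (1.f - z[i]);
    }
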
diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c
index 0c52541..1cf7122 100644
--- a/lib/THNN/generic/SparseLinear.c
+++ b/lib/THNN/generic/SparseLinear.c
@@ -62,6 +62,8 @@ void THNN_(SparseLinear_updateOutput)(
   THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
   THLongTensor_zero(csr);
 
+  weight = THTensor_(newContiguous)(weight);
+
 //#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
   for (i=0; i<nnz; i++) {
     hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
@@ -106,6 +108,7 @@ void THNN_(SparseLinear_updateOutput)(
   }
   THTensor_(free)(output_row);
   THLongTensor_free(csr);
+  THTensor_(free)(weight);
 }
 
 void THNN_(SparseLinear_legacyUpdateOutput)(
@@ -123,6 +126,8 @@ void THNN_(SparseLinear_legacyUpdateOutput)(
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
 
+  weight = THTensor_(newContiguous)(weight);
+
   long batchSize = THTensor_(size)(input, 0);
   long nnz = THTensor_(size)(input, 1);
   THTensor_(resize2d)(output, batchSize, outDim);
@@ -157,6 +162,7 @@ void THNN_(SparseLinear_legacyUpdateOutput)(
     THTensor_(cadd)(output_row, bias, 1.0, output_row);
   }
   THTensor_(free)(output_row);
+  THTensor_(free)(weight);
 }
 
 void THNN_(SparseLinear_accGradParameters)(
@@ -189,6 +195,7 @@ void THNN_(SparseLinear_accGradParameters)(
 
   THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
   THLongTensor_zero(csc);
+  weight = THTensor_(newContiguous)(weight);
 
 #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
   for (i = 0; i < nnz; i++) {
@@ -227,7 +234,7 @@ void THNN_(SparseLinear_accGradParameters)(
 
   // gradBias += gradOutput
   THTensor* buf = THTensor_(new)();
-  THTensor_(sum)(buf, gradOutput, 0);
+  THTensor_(sum)(buf, gradOutput, 0, 1);
   THTensor_(cadd)(gradBias, gradBias, scale, buf);
   THTensor_(free)(buf);
   THLongTensor_free(csc);
@@ -235,6 +242,7 @@ void THNN_(SparseLinear_accGradParameters)(
   if (weightDecay != 0) {
     THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
   }
+  THTensor_(free)(weight);
 }
 
 void THNN_(SparseLinear_legacyAccGradParameters)(
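
Besides bracketing `weight` with newContiguous/free, note the extra trailing argument to THTensor_(sum): judging by this hunk, the TH reduction API grew a keepdim-style flag, and passing 1 keeps the reduced batch dimension as size 1. Spelled out with comments:

    /* gradBias += scale * sum(gradOutput, dim 0). With the trailing 1
     * (keepdim) buf has shape (1, outDim), i.e. the same outDim elements
     * gradBias holds, so cadd accumulates it elementwise. */
    THTensor *buf = THTensor_(new)();
    THTensor_(sum)(buf, gradOutput, 0, 1);
    THTensor_(cadd)(gradBias, gradBias, scale, buf);
    THTensor_(free)(buf);
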
diff --git a/lib/THNN/generic/SpatialConvolutionLocal.c b/lib/THNN/generic/SpatialConvolutionLocal.c
index 06b57f3..6db5a5d 100644
--- a/lib/THNN/generic/SpatialConvolutionLocal.c
+++ b/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -3,17 +3,17 @@
 #else
 
 static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
-	THTensor *input, THTensor *gradOutput,
-	THTensor *weight, THTensor *bias,
-	int kH, int kW, int dH,
-	int dW, int padH, int padW,
-	long inputHeight, long inputWidth,
-	long outputHeight, long outputWidth) {
+    THTensor *input, THTensor *gradOutput,
+    THTensor *weight, THTensor *bias,
+    int kH, int kW, int dH,
+    int dW, int padH, int padW,
+    long inputHeight, long inputWidth,
+    long outputHeight, long outputWidth) {
 
   THArgCheck(kW > 0 && kH > 0, 9,
-	       "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+           "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
   THArgCheck(dW > 0 && dH > 0, 11,
-	     "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+         "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
 
   int ndim = input->nDimension;
   int dimf = 0;
@@ -27,7 +27,7 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
   }
 
   THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
-		"3D or 4D input tensor expected but got: %s");
+        "3D or 4D input tensor expected but got: %s");
 
   long nInputPlane = weight->size[2] / (kH * kW);
   long nOutputPlane = weight->size[1];
@@ -47,21 +47,22 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
   }
 }
 
-static int THNN_(view_weight_local)(THTensor **_weight)
+static THTensor* THNN_(view_weight_local)(THTensor *_weight)
 {
-  THTensor *weight = *_weight;
+  THTensor *weight = THTensor_(newContiguous)(_weight);
   THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4,
           "weight tensor should be 3D or 6D - got %dD", weight->nDimension);
   if (weight->nDimension == 6) {
     long s1 = weight->size[0] * weight->size[1];
     long s2 = weight->size[2];
     long s3 = weight->size[3] * weight->size[4] * weight->size[5];
-    *_weight = THTensor_(newWithStorage3d)(weight->storage,
-					   weight->storageOffset,
-					   s1, -1, s2, -1, s3, -1);
-    return 1;
+    THTensor *old_weight = weight;
+    weight = THTensor_(newWithStorage3d)(weight->storage,
+                       weight->storageOffset,
+                       s1, -1, s2, -1, s3, -1);
+    THTensor_(free)(old_weight);
   }
-  return 0;
+  return weight;
 }
 
 static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
@@ -76,8 +77,8 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
   THTensor *output3d, *finput3d;
 
   THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
-		       nInputPlane, inputWidth, inputHeight,
-		       outputWidth, outputHeight);
+               nInputPlane, inputWidth, inputHeight,
+               outputWidth, outputHeight);
 
   THTensor_(copy)(output, bias);
 
@@ -116,7 +117,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight)
 {
-  int freeWeight = THNN_(view_weight_local)(&weight);
+  weight = THNN_(view_weight_local)(weight);
 
   THNN_(SpatialConvolutionLocal_shapeCheck)
     (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
@@ -154,10 +155,10 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
       THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
 
       THNN_(SpatialConvolutionLocal_updateOutput_frame)
-	(input_t, output_t, weight, bias, finput_t,
-	 kW, kH, dW, dH, padW, padH,
-	 nInputPlane, inputWidth, inputHeight,
-	 nOutputPlane, outputWidth, outputHeight);
+    (input_t, output_t, weight, bias, finput_t,
+     kW, kH, dW, dH, padW, padH,
+     nInputPlane, inputWidth, inputHeight,
+     nOutputPlane, outputWidth, outputHeight);
 
       THTensor_(free)(input_t);
       THTensor_(free)(output_t);
@@ -166,8 +167,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
   }
 
   THTensor_(free)(input);
-  if (freeWeight)
-    THTensor_(free)(weight);
+  THTensor_(free)(weight);
 }
 
 
@@ -198,8 +198,8 @@ static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
   THTensor_(zero)(gradInput);
 
   THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
-		      nInputPlane, inputWidth, inputHeight,
-		      outputWidth, outputHeight);
+              nInputPlane, inputWidth, inputHeight,
+              outputWidth, outputHeight);
 
 }
 
@@ -217,7 +217,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight)
 {
-  int freeWeight = THNN_(view_weight_local)(&weight);
+  weight = THNN_(view_weight_local)(weight);
 
   THNN_(SpatialConvolutionLocal_shapeCheck)
     (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
@@ -230,12 +230,14 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
 
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
-  THTensor_(transpose)(weight, weight, 1, 2);
+
+  THTensor *tweight = THTensor_(new)();
+  THTensor_(transpose)(tweight, weight, 1, 2);
 
   if(input->nDimension == 3)
   {
     THNN_(SpatialConvolutionLocal_updateGradInput_frame)
-      (gradInput, gradOutput, weight,
+      (gradInput, gradOutput, tweight,
        fgradInput, kW, kH, dW, dH, padW, padH,
        nInputPlane, inputWidth, inputHeight,
        nOutputPlane, outputWidth, outputHeight);
@@ -253,10 +255,10 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
       THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
 
       THNN_(SpatialConvolutionLocal_updateGradInput_frame)
-	(gradInput_t, gradOutput_t, weight, fgradInput_t,
-	 kW, kH, dW, dH, padW, padH,
-	 nInputPlane, inputWidth, inputHeight,
-	 nOutputPlane, outputWidth, outputHeight);
+    (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+     kW, kH, dW, dH, padW, padH,
+     nInputPlane, inputWidth, inputHeight,
+     nOutputPlane, outputWidth, outputHeight);
 
       THTensor_(free)(gradInput_t);
       THTensor_(free)(gradOutput_t);
@@ -264,13 +266,10 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
     }
   }
 
-  THTensor_(transpose)(weight, weight, 1, 2);
-
+  THTensor_(free)(tweight);
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
-  if (freeWeight)
-    THTensor_(free)(weight);
-
+  THTensor_(free)(weight);
 }
 
 static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
@@ -316,8 +315,10 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
     long outputWidth, long outputHeight,
     accreal scale_)
 {
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
   real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
-  int freeWeight = THNN_(view_weight_local)(&gradWeight);
+  gradWeight = THNN_(view_weight_local)(gradWeight);
 
   THNN_(SpatialConvolutionLocal_shapeCheck)
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
@@ -348,10 +349,10 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
       THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
 
       THNN_(SpatialConvolutionLocal_accGradParameters_frame)
-	(gradOutput_t, gradWeight, gradBias, finput_t, scale,
-	 kW, kH, dW, dH, padW, padH,
-	 nInputPlane, inputWidth, inputHeight,
-	 nOutputPlane, outputWidth, outputHeight);
+    (gradOutput_t, gradWeight, gradBias, finput_t, scale,
+     kW, kH, dW, dH, padW, padH,
+     nInputPlane, inputWidth, inputHeight,
+     nOutputPlane, outputWidth, outputHeight);
 
       THTensor_(free)(gradOutput_t);
       THTensor_(free)(finput_t);
@@ -360,10 +361,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
-
-  if (freeWeight)
-    THTensor_(free)(gradWeight);
-
+  THTensor_(free)(gradWeight);
 }
 
 #endif
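
view_weight_local's contract changes from "maybe allocated, check the returned flag" to "always owned by the caller". What makes the unconditional free correct is THTensor_(newContiguous): for an already-contiguous tensor it returns the same tensor with an extra reference, otherwise a compact copy, so exactly one free is owed on either path. In isolation:

    #include <TH/TH.h>

    static void read_contiguous(THFloatTensor *w)
    {
        /* Refcount bump if w is contiguous, fresh copy if not. */
        THFloatTensor *wc = THFloatTensor_newContiguous(w);
        /* ... safe to walk THFloatTensor_data(wc) linearly here ... */
        THFloatTensor_free(wc);   /* no freeWeight flag to thread through */
    }
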
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index c9c22bc..28fea51 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -53,6 +53,19 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
   }
 }
 
+static THTensor* THNN_(view_weight_MM2d)(THTensor *weight) {
+  weight = THTensor_(newContiguous)(weight);
+  if (weight->nDimension == 4) {
+    long s1 = weight->size[0];
+    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+    THTensor *old_weight = weight;
+    weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+					 s1, -1, s2, -1);
+	THTensor_(free)(old_weight);
+  }
+  return weight;
+}
+
 static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
           THTensor *input,
           THTensor *output,
@@ -111,15 +124,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
           int padW,
           int padH)
 {
-  int freeWeight = 0;
-
-  if (weight->nDimension == 4) {
-    long s1 = weight->size[0];
-    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
-    weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
-					 s1, -1, s2, -1);
-    freeWeight = 1;
-  }
+  weight = THNN_(view_weight_MM2d)(weight);
 
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
@@ -182,8 +187,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
   }
 
   THTensor_(free)(input);
-  if (freeWeight)
-    THTensor_(free)(weight);
+  THTensor_(free)(weight);
 }
 
 static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
@@ -228,15 +232,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
           int padW,
           int padH)
 {
-  int freeWeight = 0;
-
-  if (weight->nDimension == 4) {
-    long s1 = weight->size[0];
-    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
-    weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
-					 s1, -1, s2, -1);
-    freeWeight = 1;
-  }
+  weight = THNN_(view_weight_MM2d)(weight);
 
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
@@ -251,13 +247,13 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
   // be left uninitialized on zero alpha, which might lead to weird behavior
   // hence, to be safe, zero it
   THTensor_(zero)(fgradInput);
-
-  THTensor_(transpose)(weight, weight, 0, 1);
+  THTensor *tweight = THTensor_(new)();
+  THTensor_(transpose)(tweight, weight, 0, 1);
 
   if(input->nDimension == 3)
   {
     THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
-						      weight, fgradInput,
+						      tweight, fgradInput,
 						      kW, kH, dW, dH, padW, padH);
   }
   else
@@ -273,7 +269,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
       THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
 
       THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
-							weight, fgradInput_t,
+							tweight, fgradInput_t,
 							kW, kH, dW, dH, padW, padH);
 
       THTensor_(free)(gradInput_t);
@@ -282,12 +278,10 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
     }
   }
 
-  THTensor_(transpose)(weight, weight, 0, 1);
-
+  THTensor_(free)(tweight);
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
-  if (freeWeight)
-    THTensor_(free)(weight);
+  THTensor_(free)(weight);
 }
 
 static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
@@ -303,9 +297,10 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
      gradOutput->size[0], -1,
      gradOutput->size[1]*gradOutput->size[2], -1);
 
-  THTensor_(transpose)(finput, finput, 0, 1);
-  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
-  THTensor_(transpose)(finput, finput, 0, 1);
+  THTensor *tfinput = THTensor_(new)();
+  THTensor_(transpose)(tfinput, finput, 0, 1);
+  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+  THTensor_(free)(tfinput);
 
   if (gradBias) {
     for(i = 0; i < gradBias->size[0]; i++)
@@ -338,17 +333,12 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
           int padH,
           accreal scale_)
 {
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  if (gradBias)
+    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
   real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
-  int freeWeight = 0;
-
-  if (gradWeight->nDimension == 4) {
-    long s1 = gradWeight->size[0];
-    long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
-    gradWeight = THTensor_(newWithStorage2d)(gradWeight->storage,
-					     gradWeight->storageOffset,
-					     s1, -1, s2, -1);
-    freeWeight = 1;
-  }
+  gradWeight = THNN_(view_weight_MM2d)(gradWeight);
 
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
@@ -381,8 +371,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
-  if (freeWeight)
-    THTensor_(free)(gradWeight);
+  THTensor_(free)(gradWeight);
 }
 
 #endif
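
The reshape that was duplicated across the three entry points is factored into view_weight_MM2d. That view is also the heart of the module: with the 4D filter bank (nOut, nIn, kH, kW) seen as a (nOut, nIn*kH*kW) matrix and the input unfolded im2col-style into finput of shape (nIn*kH*kW, outH*outW), the whole convolution is one GEMM. Schematically:

    /* output2d = weight2d x finput, shape (nOut, outH*outW).
     * A stride of -1 in newWithStorage2d asks TH to derive the default
     * contiguous stride, which is only sound because the helper calls
     * newContiguous before taking the view. */
    long s1 = weight->size[0];
    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
    THTensor *weight2d = THTensor_(newWithStorage2d)(
        weight->storage, weight->storageOffset, s1, -1, s2, -1);
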
diff --git a/lib/THNN/generic/SpatialConvolutionMap.c b/lib/THNN/generic/SpatialConvolutionMap.c
index 750b212..142a035 100644
--- a/lib/THNN/generic/SpatialConvolutionMap.c
+++ b/lib/THNN/generic/SpatialConvolutionMap.c
@@ -13,10 +13,6 @@ void THNN_(SpatialConvolutionMap_updateOutput)(
     "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
-  real *weight_data = THTensor_(data)(weight);
-  real *bias_data = THTensor_(data)(bias);
-  real *connTable_data = THTensor_(data)(connTable);
-
   int dimw = 2;
   int dimh = 1;
   int dimc = 0;
@@ -51,10 +47,16 @@ void THNN_(SpatialConvolutionMap_updateOutput)(
   /* contiguous */
   input = THTensor_(newContiguous)(input);
   output = THTensor_(newContiguous)(output);
+  weight = THTensor_(newContiguous)(weight);
+  bias = bias ? THTensor_(newContiguous)(bias) : bias;
+  connTable = THTensor_(newContiguous)(connTable);
 
   /* get raw pointers */
   real *input_data = THTensor_(data)(input);
   real *output_data = THTensor_(data)(output);
+  real *weight_data = THTensor_(data)(weight);
+  real *bias_data = THTensor_(data)(bias);
+  real *connTable_data = THTensor_(data)(connTable);
 
   long p;
 #pragma omp parallel for private(p)
@@ -96,6 +98,9 @@ void THNN_(SpatialConvolutionMap_updateOutput)(
   /* clean up */
   THTensor_(free)(input);
   THTensor_(free)(output);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
+  THTensor_(free)(connTable);
 }
 
 void THNN_(SpatialConvolutionMap_updateGradInput)(
@@ -109,9 +114,6 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
     "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
-  real *weight_data = THTensor_(data)(weight);
-  real *connTable_data = THTensor_(data)(connTable);
-
   /* and dims */
   int dimw = 2;
   int dimh = 1;
@@ -133,6 +135,8 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
   /* contiguous */
   gradInput = THTensor_(newContiguous)(gradInput);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  weight = THTensor_(newContiguous)(weight);
+  connTable = THTensor_(newContiguous)(connTable);
 
   /* Resize/Zero */
   THTensor_(resizeAs)(gradInput, input);
@@ -141,6 +145,8 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
   /* get raw pointers */
   real *gradInput_data = THTensor_(data)(gradInput);
   real *gradOutput_data = THTensor_(data)(gradOutput);
+  real *weight_data = THTensor_(data)(weight);
+  real *connTable_data = THTensor_(data)(connTable);
 
   long p;
 #pragma omp parallel for private(p)
@@ -172,6 +178,8 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
   /* clean up */
   THTensor_(free)(gradInput);
   THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
+  THTensor_(free)(connTable);
 }
 
 void THNN_(SpatialConvolutionMap_accGradParameters)(
@@ -193,9 +201,6 @@ void THNN_(SpatialConvolutionMap_accGradParameters)(
     "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
-  real *gradWeight_data = THTensor_(data)(gradWeight);
-  real *gradBias_data = THTensor_(data)(gradBias);
-
   /* and dims */
   int dimw = 2;
   int dimh = 1;
@@ -217,10 +222,15 @@ void THNN_(SpatialConvolutionMap_accGradParameters)(
   /* contiguous */
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
 
   /* get raw pointers */
   real *input_data = THTensor_(data)(input);
   real *gradOutput_data = THTensor_(data)(gradOutput);
+  real *gradWeight_data = THTensor_(data)(gradWeight);
+  real *gradBias_data = THTensor_(data)(gradBias);
+
 
   long k;
   /* gradients wrt bias */
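
The relocated pointer loads in this file follow from what THTensor_(data) really is: storage->data + storageOffset, with no stride information attached. The OpenMP loops here index that pointer with hand-computed offsets, which is only valid on contiguous tensors, so contiguity is now established (or, for the accumulated gradients, asserted) before any pointer is taken:

    /* Only walk the raw pointer linearly after pinning the layout. */
    THFloatTensor *tc = THFloatTensor_newContiguous(t);
    float *p = THFloatTensor_data(tc);
    /* ... p[i] really is row-major element i ... */
    THFloatTensor_free(tc);
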
diff --git a/lib/THNN/generic/SpatialDepthWiseConvolution.c b/lib/THNN/generic/SpatialDepthWiseConvolution.c
new file mode 100644
index 0000000..750bae0
--- /dev/null
+++ b/lib/THNN/generic/SpatialDepthWiseConvolution.c
@@ -0,0 +1,519 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDepthWiseConvolution.c"
+#else
+
+static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
+	THTensor *input, THTensor *gradOutput,
+	THTensor *weight, THTensor *bias,
+	int kH, int kW, int dH, int dW, int padH, int padW) {
+
+  THArgCheck(kW > 0 && kH > 0, 9,
+	       "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+  THArgCheck(dW > 0 && dH > 0, 11,
+	     "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+  THNN_ARGCHECK(weight->nDimension == 4, 5, weight,
+		"2D or 4D weight tensor expected, but got: %s");
+
+  if (bias != NULL) {
+    THNN_CHECK_DIM_SIZE(bias, 2, 0, weight->size[0]);
+    THNN_CHECK_DIM_SIZE(bias, 2, 1, weight->size[1]);
+  }
+
+  int ndim = input->nDimension;
+  int dimf = 0;
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+		"3D or 4D input tensor expected but got: %s");
+
+  long nInputPlane  = weight->size[1];
+  long inputHeight  = input->size[dimh];
+  long inputWidth   = input->size[dimw];
+  long nOutputPlane = weight->size[0];
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+
+  if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%d x %d x %d). "
+	    "Calculated output size: (%d x %d x %d). Output size is too small",
+	    nInputPlane,inputHeight,inputWidth,nOutputPlane*nInputPlane,outputHeight,outputWidth);
+
+  THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimf, nInputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimh, nOutputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw + 1, outputWidth);
+  }
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateOutput_frame)(
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          long nInputPlane,
+          long inputWidth,
+          long inputHeight,
+          long nOutputPlane,
+          long outputWidth,
+          long outputHeight)
+{
+  long i;
+  THTensor *output2d;
+
+  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+		       nInputPlane, inputWidth, inputHeight,
+		       outputWidth, outputHeight);
+
+  output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+                                         nOutputPlane, -1,
+                                         outputHeight*outputWidth, -1);
+  if (bias) {
+    for(i = 0; i < nOutputPlane; i++)
+        THVector_(fill)
+	  (output->storage->data + output->storageOffset + output->stride[0] * i,
+	   THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+  } else {
+    THTensor_(zero)(output);
+  }
+
+  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+  THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+  long nOutputPlane = weight->size[0];
+  if (weight->nDimension == 2) {
+    THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+  }
+
+  THNN_(SpatialDepthWiseConvolution_shapeCheck)
+    (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+  THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+  weight = THTensor_(newContiguous)(_weight);
+  THTensor *_bias = THTensor_(newTranspose)(bias, 0, 1);
+  bias = THTensor_(newContiguous)(_bias);
+
+
+  // resize weight
+  long s1 = weight->size[0];
+  long s2 = weight->size[1];
+  long s3 = weight->size[2] * weight->size[3];
+  weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+          s1, -1, s2, -1, s3, -1);
+
+  input = THTensor_(newContiguous)(input);
+
+  int ndim = input->nDimension;
+
+  int batch = 1;
+  if (ndim == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+  }
+
+  long inputHeight  = input->size[3];
+  long inputWidth   = input->size[2];
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+
+  long T = input->size[0];
+  long t;
+
+  THTensor_(resize5d)(output, T, nInputPlane, nOutputPlane, outputHeight, outputWidth);
+  THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+#pragma omp parallel for private(t)
+  for(t = 0; t < T; t++)
+  {
+    THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+    THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+    THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+    long i;
+#pragma omp parallel for private(i)
+    for(i = 0; i < nInputPlane; i++)
+    {
+      THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+      THTensor *bias_i = THTensor_(newSelect)(bias, 0, i);
+      THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1);
+      THTensor *output_i = THTensor_(newSelect)(output_t, 0, i);
+      THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+
+      THNN_(SpatialDepthWiseConvolution_updateOutput_frame)
+	(input_i, output_i, weight_i, bias_i, finput_i,
+	 kW, kH, dW, dH, padW, padH,
+	 1, inputWidth, inputHeight,
+	 nOutputPlane, outputWidth, outputHeight);
+
+      THTensor_(free)(input_i);
+      THTensor_(free)(weight_i);
+      THTensor_(free)(bias_i);
+      THTensor_(free)(output_i);
+      THTensor_(free)(finput_i);
+    }
+    THTensor_(free)(input_t);
+    THTensor_(free)(output_t);
+    THTensor_(free)(finput_t);
+  }
+
+  THTensor_(free)(weight);
+  THTensor_(free)(_weight);
+  THTensor_(free)(bias);
+  THTensor_(free)(_bias);
+
+  THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputWidth, outputHeight);
+
+  if (batch == 0) {
+    THTensor_(select)(output, NULL, 0, 0);
+    THTensor_(select)(input, NULL, 0, 0);
+    THTensor_(select)(finput, NULL, 0, 0);
+  }
+  THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(
+          THTensor *gradInput,
+          THTensor *gradOutput,
+          THTensor *weight,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+    (gradOutput->storage, gradOutput->storageOffset,
+     gradOutput->size[0], -1,
+     gradOutput->size[1]*gradOutput->size[2], -1);
+  THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+  THTensor_(free)(gradOutput2d);
+
+  THTensor_(zero)(gradInput);
+
+  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+		      padW, padH,
+		      gradInput->size[0], gradInput->size[2], gradInput->size[1],
+		      gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH)
+{
+  long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+  long nOutputPlane = weight->size[0];
+  if (weight->nDimension == 2) {
+    THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+  }
+  gradOutput = THTensor_(newWithTensor)(gradOutput);
+
+  if (input->nDimension == 3) {
+    if (gradOutput->nDimension == 3) {
+      THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+    }
+  }
+  else
+  {
+    if (gradOutput->nDimension == 4) {
+      THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+    }
+  }
+
+
+  THNN_(SpatialDepthWiseConvolution_shapeCheck)
+    (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+  THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+  weight = THTensor_(newContiguous)(_weight);
+
+
+  // resize weight
+  long s1 = weight->size[0];
+  long s2 = weight->size[1];
+  long s3 = weight->size[2] * weight->size[3];
+  weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+          s1, -1, s2, -1, s3, -1);
+
+  input = THTensor_(newContiguous)(input);
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputHeight  = input->size[3];
+  long inputWidth   = input->size[2];
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+
+  long T = input->size[0];
+  long t;
+
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(resize4d)(fgradInput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+  // depending on the BLAS library, fgradInput (result tensor) might
+  // be left uninitialized on zero alpha, which might lead to weird behavior
+  // hence, to be safe, zero it
+  THTensor_(zero)(fgradInput);
+
+
+
+#pragma omp parallel for private(t)
+  for(t = 0; t < T; t++)
+  {
+    THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+    THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+    THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+
+    long i;
+#pragma omp parallel for private(i)
+    for(i = 0; i < nInputPlane; i++)
+    {
+      THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+      THTensor *gradInput_i = THTensor_(newNarrow)(gradInput_t, 0, i, 1);
+      THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+      THTensor *fgradInput_i = THTensor_(newSelect)(fgradInput_t, 0, i);
+
+      THTensor_(transpose)(weight_i, weight_i, 0, 1);
+
+      THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(gradInput_i, gradOutput_i,
+              weight_i, fgradInput_i,
+              kW, kH, dW, dH, padW, padH);
+
+      THTensor_(free)(gradInput_i);
+      THTensor_(free)(weight_i);
+      THTensor_(free)(gradOutput_i);
+      THTensor_(free)(fgradInput_i);
+    }
+
+    THTensor_(free)(gradInput_t);
+    THTensor_(free)(gradOutput_t);
+    THTensor_(free)(fgradInput_t);
+  }
+
+  if (batch == 0) {
+    THTensor_(select)(gradOutput, NULL, 0, 0);
+    THTensor_(select)(input, NULL, 0, 0);
+    THTensor_(select)(gradInput, NULL, 0, 0);
+    THTensor_(select)(fgradInput, NULL, 0, 0);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
+  THTensor_(free)(_weight);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          accreal scale)
+{
+  long i;
+  THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+    (gradOutput->storage, gradOutput->storageOffset,
+     gradOutput->size[0], -1,
+     gradOutput->size[1]*gradOutput->size[2], -1);
+
+  THTensor_(transpose)(finput, finput, 0, 1);
+  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+  THTensor_(transpose)(finput, finput, 0, 1);
+
+  if (gradBias) {
+    for(i = 0; i < gradBias->size[0]; i++)
+    {
+      long k;
+      real sum = 0;
+      real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+      for(k = 0; k < gradOutput2d->size[1]; k++)
+        sum += data[k];
+      (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+    }
+  }
+
+  THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW,
+          int kH,
+          int dW,
+          int dH,
+          int padW,
+          int padH,
+          accreal scale)
+{
+  long nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kH*kW) : gradWeight->size[1];
+  long nOutputPlane = gradWeight->size[0];
+  if (gradWeight->nDimension == 2) {
+    THTensor_(resize4d)(gradWeight, nOutputPlane, nInputPlane, kH, kW);
+  }
+
+  gradOutput = THTensor_(newWithTensor)(gradOutput);
+  if (input->nDimension == 3) {
+    if (gradOutput->nDimension == 3) {
+      THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+    }
+  }
+  else
+  {
+    if (gradOutput->nDimension == 4) {
+      THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+    }
+  }
+
+
+  THNN_(SpatialDepthWiseConvolution_shapeCheck)
+    (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+  // Transpose gradWeight & gradBias
+  THTensor_(transpose)(gradWeight, NULL, 0, 1);
+  THTensor_(transpose)(gradBias, NULL, 0, 1);
+
+  THTensor *_gradWeight = gradWeight;
+  THTensor *_gradBias = gradBias;
+
+  gradWeight = THTensor_(newContiguous)(gradWeight);
+  gradBias = THTensor_(newContiguous)(gradBias);
+
+  // resize gradWeight
+  long s1 = gradWeight->size[0];
+  long s2 = gradWeight->size[1];
+  long s3 = gradWeight->size[2] * gradWeight->size[3];
+  gradWeight = THTensor_(newWithStorage3d)(gradWeight->storage, gradWeight->storageOffset,
+          s1, -1, s2, -1, s3, -1);
+
+  input = THTensor_(newContiguous)(input);
+
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputHeight  = input->size[3];
+  long inputWidth   = input->size[2];
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+
+  long T = input->size[0];
+  long t;
+  THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+  for(t = 0; t < T; t++)
+  {
+    THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+    THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+    long i;
+#pragma omp parallel for private(i)
+    for(i = 0; i < nInputPlane; i++)
+    {
+      THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+      THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+      THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i);
+      THTensor *gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
+
+      THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i,
+                gradBias_i, finput_i, scale);
+
+      THTensor_(free)(finput_i);
+      THTensor_(free)(gradOutput_i);
+      THTensor_(free)(gradWeight_i);
+      THTensor_(free)(gradBias_i);
+    }
+
+    THTensor_(free)(gradOutput_t);
+    THTensor_(free)(finput_t);
+  }
+
+  // Copy back and transpose back
+  THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+  THTensor_(transpose)(_gradBias, NULL, 0, 1);
+  THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW);
+  THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
+
+  THTensor_(copy)(_gradWeight, gradWeight);
+  THTensor_(copy)(_gradBias, gradBias);
+  THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+  THTensor_(transpose)(_gradBias, NULL, 0, 1);
+
+  if (batch == 0) {
+    THTensor_(select)(gradOutput, NULL, 0, 0);
+    THTensor_(select)(input, NULL, 0, 0);
+    THTensor_(select)(finput, NULL, 0, 0);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+  THTensor_(free)(gradWeight);
+  THTensor_(free)(gradBias);
+}
+
+#endif
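
For orientation on the new module: depthwise convolution gives each input plane its own bank of nOutputPlane filters and never mixes planes, so a (nInputPlane, H, W) input produces a (nInputPlane * nOutputPlane, outH, outW) output, with nOutputPlane acting as a per-channel depth multiplier. That is why the frame helper is called once per input plane with nInputPlane = 1, and why gradOutput is checked against ndim + 1 dimensions in the shape check. The shape arithmetic, matching the resizes above (names as in the surrounding functions):

    long outH = (inH + 2*padH - kH) / dH + 1;
    long outW = (inW + 2*padW - kW) / dW + 1;
    long outPlanes = nInputPlane * nOutputPlane;   /* depth multiplier */
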
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
index d345f7a..897cc0d 100644
--- a/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -81,6 +81,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
   int nOutputPlane = weight->size[0];
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
+  bias = bias ? THTensor_(newContiguous)(bias) : bias;
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -178,6 +180,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
   }
 
   THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
 }
 
 void THNN_(SpatialDilatedConvolution_updateGradInput)(
@@ -201,6 +205,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
   int nOutputPlane = weight->size[0];
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
   gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 3) {
@@ -274,6 +279,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
 }
 
 
@@ -302,6 +308,9 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  if (gradBias)
+    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
diff --git a/lib/THNN/generic/SpatialDilatedMaxPooling.c b/lib/THNN/generic/SpatialDilatedMaxPooling.c
index 5a2b764..8f4ad13 100644
--- a/lib/THNN/generic/SpatialDilatedMaxPooling.c
+++ b/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -300,8 +300,10 @@ static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
       {
         /* retrieve position of max */
         long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE;
-        /* update gradient */
-        gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
+	if (maxp != -1) {
+	  /* update gradient */
+	  gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
+	}
       }
     }
   }
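
The new guard covers pooling windows that never selected an input element: with enough padding relative to the dilation, a window can fall entirely inside the zero padding, leaving its recorded index at the sentinel, and the backward pass must then skip it rather than write through index -1. A concrete case, assuming the usual dilated-pooling shape rules: a 2x2 input with kW = kH = 2, dilation 3, pad 1, stride 1 yields a single 1x1 output whose taps land at rows/columns {-1, 2}, all outside the real input. The same guard is added to TemporalMaxPooling further down.
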
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
index e2a835d..2edc53b 100644
--- a/lib/THNN/generic/SpatialFullConvolution.c
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -131,6 +131,8 @@ void THNN_(SpatialFullConvolution_updateOutput)(
   int nOutputPlane = THTensor_(size)(weight,1);
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
+  bias = bias ? THTensor_(newContiguous)(bias) : bias;
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -230,6 +232,8 @@ void THNN_(SpatialFullConvolution_updateOutput)(
   }
 
   THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
 }
 
 void THNN_(SpatialFullConvolution_updateGradInput)(
@@ -252,6 +256,7 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  weight = THTensor_(newContiguous)(weight);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -327,6 +332,7 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
 }
 
 
@@ -353,6 +359,9 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  if (gradBias)
+    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
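
Same contiguity bracketing as the sibling modules. One layout detail to keep in mind while reading it: full (i.e. transposed) convolution stores its filters as (nInputPlane, nOutputPlane, kH, kW), hence the `THTensor_(size)(weight, 1)` above for the output plane count, and it inverts the usual shape arithmetic. Assuming the module's documented adjW/adjH output-padding parameters:

    /* adjH/adjW add extra rows/columns to select one of the several
     * input sizes that a forward convolution would have mapped to the
     * same output size. */
    long outH = (inH - 1) * dH - 2*padH + kH + adjH;
    long outW = (inW - 1) * dW - 2*padW + kW + adjW;
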
diff --git a/lib/THNN/generic/SpatialFullConvolutionMap.c b/lib/THNN/generic/SpatialFullConvolutionMap.c
index e98dea0..6952fbe 100644
--- a/lib/THNN/generic/SpatialFullConvolutionMap.c
+++ b/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -7,6 +7,8 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)(
   THTensor *connTable, int nInputPlane, int nOutputPlane,
   int dW, int dH)
 {
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
   THArgCheck(
     weight != NULL && weight->nDimension == 3
     && connTable != NULL && connTable->size[0] == weight->size[0], 4,
diff --git a/lib/THNN/generic/SpatialSubSampling.c b/lib/THNN/generic/SpatialSubSampling.c
index 3f01540..4c077bc 100644
--- a/lib/THNN/generic/SpatialSubSampling.c
+++ b/lib/THNN/generic/SpatialSubSampling.c
@@ -10,6 +10,7 @@ static inline void THNN_(SpatialSubSampling_shapeCheck)(
   int ndims = input->nDimension;
   THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
                   "3D or 4D input tensor expected but got: %s");
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
 
   int nInputPlane = THTensor_(size)(weight, 0);
 
@@ -40,6 +41,7 @@ void THNN_(SpatialSubSampling_updateOutput)(
     int kW, int kH,
     int dW, int dH)
 {
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
 
   real *weight_data = THTensor_(data)(weight);
   real *bias_data = THTensor_(data)(bias);
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 9515abb..b9fd709 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -47,7 +47,8 @@ TH_API void THNN_(ClassNLLCriterion_updateOutput)(
           THTensor *output,            // [OUT] a one-element tensor with loss
           bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
           THTensor *weights,           // [OPTIONAL] class weights
-          THTensor *total_weight);     // [BUFFER]
+          THTensor *total_weight,      // [BUFFER]
+          long ignore_index);          // target index to ignore (loss = 0, gradInput = 0)
 TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor (1D/2D)
@@ -55,7 +56,8 @@ TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
           THTensor *gradInput,         // [OUT] gradient w.r.t. input
           bool sizeAverage,            // if true, the loss will be normalized by batch size and class weights
           THTensor *weights,           // [OPTIONAL] class weights
-          THTensor *total_weight);     // [BUFFER]
+          THTensor *total_weight,      // [BUFFER]
+          long ignore_index);          // target index to ignore (loss = 0, gradInput = 0)
 
 TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
           THNNState *state,            // library's state
@@ -168,6 +170,40 @@ TH_API void THNN_(LeakyReLU_updateGradInput)(
           accreal negval,              // negative part slope
           bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
 
+TH_API void THNN_(GRUFused_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *bias1, // [OPTIONAL]
+          THTensor *bias2, // [OPTIONAL]
+          THTensor *hx,
+          THTensor *output);
+TH_API void THNN_(GRUFused_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+
+TH_API void THNN_(LSTMFused_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *bias1, // [OPTIONAL]
+          THTensor *bias2, // [OPTIONAL]
+          THTensor *cell,
+          THTensor *output,
+          THTensor *outputCell);
+TH_API void THNN_(LSTMFused_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *hidden,
+          THTensor *cx,
+          THTensor *cy,
+          THTensor *gradOutput,
+          THTensor *gradOutputCell,
+          THTensor *gradInput);
+
 TH_API void THNN_(LogSigmoid_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
@@ -364,7 +400,7 @@ TH_API void THNN_(Sigmoid_updateOutput)(
           THTensor *output);
 TH_API void THNN_(Sigmoid_updateGradInput)(
           THNNState *state,
-          THTensor *input,
+          THTensor *input,             // [OPTIONAL]
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output);
@@ -420,6 +456,58 @@ TH_API void THNN_(SoftShrink_updateGradInput)(
           THTensor *gradInput,
           accreal lambda);
 
+
+TH_API void THNN_(IndexLinear_updateOutput)(
+          THNNState *state,
+          THIndexTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THIndexTensor *sizes,
+          THIndexTensor *cumSumSizes,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *normalizedValues,
+          int   train);
+TH_API void THNN_(IndexLinear_accGradParameters)(
+          THNNState *state,
+          THIndexTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THIndexTensor *sizes,
+          THIndexTensor *cumSumSizes,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor* valuesBuffer,
+          accreal weightDecay,
+          accreal scale);
+TH_API void THNN_(IndexLinear_accUpdateGradParameters)(
+          THNNState *state,
+          THIndexTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THIndexTensor *sizes,
+          THIndexTensor *cumSumSizes,
+          THTensor *gradOutput,
+          THTensor *weight,
+          THTensor *bias,
+          accreal weightDecay,
+          accreal scale);
+TH_API void THNN_(IndexLinear_updateParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THIndexTensor *runningKeys,
+          THIndexTensor *cumSumSizes,
+          long keysOffset,
+          accreal weightDecay,
+          accreal learningRate);
+
 TH_API void THNN_(SparseLinear_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -507,7 +595,7 @@ TH_API void THNN_(Tanh_updateOutput)(
           THTensor *output);
 TH_API void THNN_(Tanh_updateGradInput)(
           THNNState *state,
-          THTensor *input,
+          THTensor *input,             // [OPTIONAL]
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output);
@@ -724,6 +812,41 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
           int padW, int padH,
           accreal scale);
 
+TH_API void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,         // [OPTIONAL]
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,     // [OPTIONAL]
+          THTensor *finput,
+          THTensor *fgradInput,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          accreal scale);
+
 TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
           THNNState *state,
           THTensor *input,
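
The header hunks collect the pieces above (depthwise convolution, IndexLinear, the [OPTIONAL] input of Sigmoid/Tanh) plus fused GRU/LSTM cell entry points and a new trailing ignore_index parameter on ClassNLLCriterion. Per the inline comment, a sample whose target equals ignore_index contributes neither loss nor gradient. A rough sketch of that contract, not the THNN kernel itself, with index handling simplified:

    double loss = 0, total_weight = 0;
    for (long i = 0; i < nframe; i++) {
        long t = target[i] - TH_INDEX_BASE;   /* 0-based class index */
        if (t == ignore_index)
            continue;                         /* loss = 0, gradInput = 0 */
        double w = weights ? weights[t] : 1.0;
        total_weight += w;
        loss -= w * input[i * n_classes + t]; /* input holds log-probs */
    }
    if (sizeAverage && total_weight != 0)
        loss /= total_weight;
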
diff --git a/lib/THNN/generic/Tanh.c b/lib/THNN/generic/Tanh.c
index 69a24b8..ecf0708 100644
--- a/lib/THNN/generic/Tanh.c
+++ b/lib/THNN/generic/Tanh.c
@@ -7,7 +7,6 @@ void THNN_(Tanh_updateOutput)(
           THTensor *input,
           THTensor *output)
 {
-  THTensor_(resizeAs)(output, input);
   THTensor_(tanh)(output, input);
 }
 
@@ -21,8 +20,8 @@ void THNN_(Tanh_updateGradInput)(
   THNN_CHECK_SHAPE(output, gradOutput);
   THTensor_(resizeAs)(gradInput, output);
 
-  if (output->nDimension == 1 || 
-      !THTensor_(isContiguous)(output) || 
+  if (output->nDimension == 1 ||
+      !THTensor_(isContiguous)(output) ||
       !THTensor_(isContiguous)(gradOutput) ||
       !THTensor_(isContiguous)(gradInput))
   {
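
Both Tanh changes mirror the Sigmoid ones: THTensor_(tanh) resizes its destination itself, making the explicit resizeAs redundant, and d/dx tanh(x) = 1 - tanh(x)^2 means the backward pass needs only the saved output (matching `input` going [OPTIONAL] in THNN.h). Reference form:

    /* gin[i] = gout[i] * (1 - y[i]^2), y being the saved forward output. */
    static void tanh_backward_ref(float *gin, const float *gout,
                                  const float *y, long n)
    {
        for (long i = 0; i < n; i++)
            gin[i] = gout[i] * (1.f - y[i] * y[i]);
    }
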
diff --git a/lib/THNN/generic/TemporalConvolution.c b/lib/THNN/generic/TemporalConvolution.c
index a107da2..8cfd97d 100644
--- a/lib/THNN/generic/TemporalConvolution.c
+++ b/lib/THNN/generic/TemporalConvolution.c
@@ -58,6 +58,8 @@ void THNN_(TemporalConvolution_updateOutput)(
     dimF = 2;
   }
 
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
   THNN_(TemporalConvolution_shapeCheck)
        (state, input, kW, dW, &inputFrameSize);
   input = THTensor_(newContiguous)(input);
@@ -98,9 +100,10 @@ void THNN_(TemporalConvolution_updateOutput)(
                               nFrame, outputFrameStride*output->size[1],
                               output->size[1], 1);
 
-      THTensor_(transpose)(weight, NULL, 0, 1);
-      THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
-      THTensor_(transpose)(weight, NULL, 0, 1);
+      THTensor *tweight = THTensor_(new)();
+      THTensor_(transpose)(tweight, weight, 0, 1);
+      THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+      THTensor_(free)(tweight);
     }
   }
   else
@@ -145,9 +148,10 @@ void THNN_(TemporalConvolution_updateOutput)(
                                 nFrame, outputFrameStride*outputSample->size[1],
                                 outputSample->size[1], 1);
 
-        THTensor_(transpose)(weight, NULL, 0, 1);
-        THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, weight);
-        THTensor_(transpose)(weight, NULL, 0, 1);
+        THTensor *tweight = THTensor_(new)();
+        THTensor_(transpose)(tweight, weight, 0, 1);
+        THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+        THTensor_(free)(tweight);
       }
     }
     THTensor_(free)(outputSample);
@@ -185,6 +189,7 @@ void THNN_(TemporalConvolution_updateGradInput)(
     dimF = 2;
   }
 
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
   THNN_(TemporalConvolution_shapeCheck)(
         state, input, kW, dW, NULL);
   nInputFrame = input->size[dimS];
@@ -330,9 +335,10 @@ void THNN_(TemporalConvolution_accGradParameters)(
                               nFrame, outputFrameStride*gradOutput->size[1],
                               gradOutput->size[1], 1);
 
-      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
-      THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
-      THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
+      THTensor *tgradOutputWindow = THTensor_(new)();
+      THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+      THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+      THTensor_(free)(tgradOutputWindow);
     }
   }
   else
@@ -372,9 +378,10 @@ void THNN_(TemporalConvolution_accGradParameters)(
                                 nFrame, outputFrameStride*gradOutputSample->size[1],
                                 gradOutputSample->size[1], 1);
 
-        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
-        THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
-        THTensor_(transpose)(gradOutputWindow, NULL, 0, 1);
+        THTensor *tgradOutputWindow = THTensor_(new)();
+        THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+        THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+        THTensor_(free)(tgradOutputWindow);
       }
     }
     THTensor_(free)(gradOutputSample);
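
The recurring change in this file (and in TemporalRowConvolution and
VolumetricConvolutionMM below) swaps the old "transpose the shared tensor in
place, use it, transpose it back" dance for a fresh transposed view that is
freed after use. The old form briefly mutates weight, which is unsafe if
another thread reads it concurrently; the new form never touches the
original. The Lua analogue of the new pattern (illustrative only):

    local tw = weight:transpose(1, 2)     -- a new view; weight itself is unchanged
    local out = torch.mm(inputWindow, tw)
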
diff --git a/lib/THNN/generic/TemporalMaxPooling.c b/lib/THNN/generic/TemporalMaxPooling.c
index e2976ab..344c1b3 100644
--- a/lib/THNN/generic/TemporalMaxPooling.c
+++ b/lib/THNN/generic/TemporalMaxPooling.c
@@ -242,7 +242,8 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
       {
         /* compute local max: */
         long maxindex = (long)xp[y];
-        gip[maxindex*framesize+y] += gop[y];
+        if (maxindex != -1)
+          gip[maxindex*framesize+y] += gop[y];
       }
     }
   }
@@ -268,7 +269,8 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
         {
           /* compute local max: */
           long maxindex = (long)xp[y];
-          gip[maxindex*framesize+y] += gop[y];
+          if (maxindex != -1)
+            gip[maxindex*framesize+y] += gop[y];
         }
       }
     }
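
The stored max index can legitimately be -1 when a pooling window covers
nothing but padding (VolumetricDilatedMaxPooling below now initializes its
indices to -1 for exactly that case), so the backward pass must skip such
entries rather than scatter into gradInput at a bogus offset. A Lua sketch
of the guarded scatter-add (illustrative only):

    for y = 1, framesize do
       local idx = maxIndices[y]   -- -1 marks "window saw only padding"
       if idx ~= -1 then
          gradInput[idx][y] = gradInput[idx][y] + gradOutput[y]
       end
    end
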
diff --git a/lib/THNN/generic/TemporalRowConvolution.c b/lib/THNN/generic/TemporalRowConvolution.c
index b1cd173..e3ae41e 100644
--- a/lib/THNN/generic/TemporalRowConvolution.c
+++ b/lib/THNN/generic/TemporalRowConvolution.c
@@ -18,6 +18,8 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)(
 	           "stride should be greater than zero, but got dW: %d", dW);
 	THNN_ARGCHECK(weight->nDimension == 3, 3, weight,
 	              "3D weight tensor expected, but got: %s");
+	THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+	THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
 
 	if (bias != NULL) {
 		THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
@@ -319,11 +321,12 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
 	THTensor_(zero)(fgradInput);
 	THTensor_(zero)(gradInput);
 
-	THTensor_(transpose)(weight, weight, 1, 2);
+	THTensor *tweight = THTensor_(new)();
+	THTensor_(transpose)(tweight, weight, 1, 2);
 
 	if (ndim == 2) {
 		THNN_(TemporalRowConvolution_updateGradInput_frame)
-		        (gradInput, gradOutput, weight, fgradInput,
+		        (gradInput, gradOutput, tweight, fgradInput,
 		        kW, dW, padW,
 		        inputFrameSize, nInputFrame, nOutputFrame);
 	} else {
@@ -338,7 +341,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
 			THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
 
 			THNN_(TemporalRowConvolution_updateGradInput_frame)
-			        (gradInput_t, gradOutput_t, weight, fgradInput_t,
+			        (gradInput_t, gradOutput_t, tweight, fgradInput_t,
 			        kW, dW, padW,
 			        inputFrameSize, nInputFrame, nOutputFrame);
 
@@ -348,7 +351,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)(
 		}
 	}
 
-	THTensor_(transpose)(weight, weight, 1, 2);
+	THTensor_(free)(tweight);
 
 	if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
 
@@ -374,12 +377,13 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
 		1, -1,
 		gradOutput->size[1], -1);
 
-	THTensor_(transpose)(finput, finput, 1, 2);
+	THTensor *tfinput = THTensor_(new)();
+	THTensor_(transpose)(tfinput, finput, 1, 2);
 	// gradOutput3d:	inputFrameSize x 1 x nOutputFrame
 	// finput:			inputFrameSize x nOutputFrame x kW
-	THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, finput);
+	THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
 	// gradWeight:		inputFrameSize x 1 x kW
-	THTensor_(transpose)(finput, finput, 1, 2);
+	THTensor_(free)(tfinput);
 
 	if (gradBias != NULL) {
 		for (i = 0; i < gradBias->size[0]; i++) {
diff --git a/lib/THNN/generic/TemporalSubSampling.c b/lib/THNN/generic/TemporalSubSampling.c
index 8728d14..68f35e2 100644
--- a/lib/THNN/generic/TemporalSubSampling.c
+++ b/lib/THNN/generic/TemporalSubSampling.c
@@ -52,6 +52,8 @@ void THNN_(TemporalSubSampling_updateOutput)(
   int nInputFrame, nOutputFrame;
   long k;
 
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
   THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
 
   outputFrame = THTensor_(new)();
@@ -68,7 +70,7 @@ void THNN_(TemporalSubSampling_updateOutput)(
   {
     THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
     THTensor_(select)(outputFrame, output, 0, k);
-    THTensor_(sum)(outputFrame, inputWindow, 0);
+    THTensor_(sum)(outputFrame, inputWindow, 0, 1);
     THTensor_(cmul)(outputFrame, outputFrame, weight);
     THTensor_(cadd)(outputFrame, outputFrame, 1, bias);
   }
@@ -91,6 +93,7 @@ void THNN_(TemporalSubSampling_updateGradInput)(
   THTensor *gradInputWindow, *buffer, *kwunit;
   long k;
 
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
   THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
 
   gradOutputFrame = THTensor_(new)();
@@ -140,7 +143,7 @@ void THNN_(TemporalSubSampling_accGradParameters)(
   {
     THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
     THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
-    THTensor_(sum)(buffer, inputWindow, 0);
+    THTensor_(sum)(buffer, inputWindow, 0, 1);
     THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame);
     THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame);
   }
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
index 4aaaa95..00a121d 100644
--- a/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -495,12 +495,13 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
   // be left uninitialized on zero alpha, which might lead to weird behavior
   // hence, to be safe, zero it
   THTensor_(zero)(fgradInput);
-  THTensor_(transpose)(weight, weight, 0, 1);
+  THTensor *tweight = THTensor_(new)();
+  THTensor_(transpose)(tweight, weight, 0, 1);
 
   if (input->nDimension == 4)
   {
     THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
-      gradInput, gradOutput, weight, fgradInput,
+      gradInput, gradOutput, tweight, fgradInput,
       kT, kW, kH,
       dT, dW, dH,
       pT, pW, pH
@@ -519,7 +520,7 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
       THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
 
       THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
-        gradInput_t, gradOutput_t, weight, fgradInput_t,
+        gradInput_t, gradOutput_t, tweight, fgradInput_t,
         kT, kW, kH,
         dT, dW, dH,
         pT, pW, pH
@@ -531,8 +532,7 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
     }
   }
 
-  THTensor_(transpose)(weight, weight, 0, 1);
-
+  THTensor_(free)(tweight);
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
   if (freeWeight)
@@ -553,9 +553,10 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
     gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
   );
 
-  THTensor_(transpose)(finput, finput, 0, 1);
-  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
-  THTensor_(transpose)(finput, finput, 0, 1);
+  THTensor *tfinput = THTensor_(new)();
+  THTensor_(transpose)(tfinput, finput, 0, 1);
+  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+  THTensor_(free)(tfinput);
 
   if (gradBias) {
     for (i = 0; i < gradBias->size[0]; i++)
diff --git a/lib/THNN/generic/VolumetricDilatedConvolution.c b/lib/THNN/generic/VolumetricDilatedConvolution.c
index e31ff2b..5627e6e 100644
--- a/lib/THNN/generic/VolumetricDilatedConvolution.c
+++ b/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -84,6 +84,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
   int nOutputPlane = weight->size[0];
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
+  bias = bias ? THTensor_(newContiguous)(bias) : bias;
   int batch = 1;
   if (input->nDimension == 4) {
     // Force batch
@@ -186,6 +188,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
   }
 
   THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
 }
 
 void THNN_(VolumetricDilatedConvolution_updateGradInput)(
@@ -211,6 +215,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  weight = THTensor_(newContiguous)(weight);
+
   int batch = 1;
   if (input->nDimension == 4) {
     // Force batch
@@ -285,6 +291,7 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
 }
 
 void THNN_(VolumetricDilatedConvolution_accGradParameters)(
@@ -313,6 +320,7 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+
   int batch = 1;
   if (input->nDimension == 4) {
     // Force batch
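
weight and bias now go through newContiguous, which merely bumps the
refcount when the tensor is already contiguous and makes a compact copy
otherwise, so callers holding transposed or otherwise strided parameters no
longer feed garbage strides into the GEMM path; the frees added at the end
of each function balance the refcount. The Lua analogue (illustrative only):

    -- :contiguous() returns the tensor itself when it is already
    -- contiguous, and a compact copy otherwise
    local w = weight:contiguous()
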
diff --git a/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
index 14e8177..66c0f95 100644
--- a/lib/THNN/generic/VolumetricDilatedMaxPooling.c
+++ b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -157,10 +157,11 @@ static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
           THIndex_t *indzp = indz_p + k * otime * owidth * oheight
             + ti * owidth * oheight + i * owidth + j;
 
-          /* compute local max: */
-          real maxval = -THInf;
-          int x,y,z;
-          int mx, my, mz;
+          /* compute local max: */
+          real maxval = -THInf;
+          int x,y,z;
+          int mx, my, mz;
+          mx = my = mz = -1;
 
           for (z = 0; z < kernel_t; z++)
           {
@@ -385,9 +386,11 @@ static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
           long maxi  = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
           long maxj  = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
 
-          /* update gradient */
-          gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
-            gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+          if (maxti != -1) {
+            /* update gradient */
+            gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+              gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+          }
         }
       }
     }
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
index 62d0d74..c974fab 100644
--- a/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -172,6 +172,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
   const int kW           = (int)weight->size[4];
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
+  bias = bias ? THTensor_(newContiguous)(bias) : bias;
   int batch = 1;
   if (input->nDimension == 4)
   {
@@ -280,6 +282,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
   }
 
   THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
 }
 
 void THNN_(VolumetricFullConvolution_updateGradInput)(
@@ -308,6 +312,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
   const int kW           = (int)weight->size[4];
 
   input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
   gradOutput = THTensor_(newContiguous)(gradOutput);
 
   int batch = 1;
@@ -391,6 +396,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
 
   THTensor_(free)(input);
   THTensor_(free)(gradOutput);
+  THTensor_(free)(weight);
 }
 
 void THNN_(VolumetricFullConvolution_accGradParameters)(
@@ -423,6 +429,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
 
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+  if (gradBias)
+    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
 
   int batch = 1;
   if (input->nDimension == 4)
diff --git a/lib/THNN/generic/unfold.c b/lib/THNN/generic/unfold.c
index e718320..14a73b5 100644
--- a/lib/THNN/generic/unfold.c
+++ b/lib/THNN/generic/unfold.c
@@ -2,10 +2,6 @@
 #define TH_GENERIC_FILE "generic/unfold.c"
 #else
 
-#ifdef _WIN32
-# include <windows.h>
-#endif
-
 /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
 void THNN_(unfolded_acc)(
           THTensor *finput,
@@ -22,11 +18,11 @@ void THNN_(unfolded_acc)(
           int outputWidth,
           int outputHeight)
 {
-#ifdef _WIN32
-  LONG_PTR nip;
-#else
-  size_t nip;
-#endif
+  // This function assumes that
+  // outputHeight*dH does not overflow a long
+  // outputWidth*dW does not overflow a long
+
+  int nip;
 
   real *input_data = THTensor_(data)(input);
   real *finput_data = THTensor_(data)(finput);
@@ -34,34 +30,34 @@ void THNN_(unfolded_acc)(
 #pragma omp parallel for private(nip)
   for(nip = 0; nip < nInputPlane; nip++)
   {
-    size_t kw, kh, y, x;
-    long long ix = 0, iy = 0;
+    int kw, kh, y, x;
+    long ix, iy;
     for(kh = 0; kh < kH; kh++)
     {
       for(kw = 0; kw < kW; kw++)
       {
-        real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
-        real *dst = input_data + nip*(inputHeight*inputWidth);
+        real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+        real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
         if (padW > 0 || padH > 0) {
-          size_t lpad,rpad;
+          int lpad,rpad;
           for(y = 0; y < outputHeight; y++) {
-            iy = (long long)(y*dH - padH + kh);
+            iy = (long)y*dH - padH + kh;
             if (iy < 0 || iy >= inputHeight) {
             } else {
               if (dW==1){
-                 ix = (long long)(0 - padW + kw);
-                 lpad = fmaxf(0,(int)(padW-kw));
-                 rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
-                 real *dst_slice = dst+(size_t)(iy*inputWidth+ix+lpad);
-                 THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+                 ix = 0 - padW + kw;
+                 lpad = fmaxf(0,padW-kw);
+                 rpad = fmaxf(0,padW-(kW-kw-1));
+                 real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
+                 THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
               }
               else{
                 for (x=0; x<outputWidth; x++){
-                   ix = (long long)(x*dW - padW + kw);
+                   ix = (long)x*dW - padW + kw;
                    if (ix < 0 || ix >= inputWidth){
                    }else{
-                     real *dst_slice = dst+(size_t)(iy*inputWidth+ix);
-                     THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+x), 1, 1);
+                     real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+                     THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
                    }
                 }
               }
@@ -69,15 +65,15 @@ void THNN_(unfolded_acc)(
           }
         } else {
           for(y = 0; y < outputHeight; y++) {
-            iy = (long long)(y*dH + kh);
-            ix = (long long)(0 + kw);
+            iy = (long)y*dH + kh;
+            ix = 0 + kw;
             if (dW == 1 ) {
-               real *dst_slice = dst+(size_t)(iy*inputWidth+ix);
-               THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
+               real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+               THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
             }else{
               for(x = 0; x < outputWidth; x++) {
-                real *dst_slice = dst+(size_t)(iy*inputWidth+ix+x*dW);
-                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+x), 1, 1);
+                real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
+                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
               }
             }
           }
@@ -102,59 +98,65 @@ void THNN_(unfolded_copy)(
           int outputWidth,
           int outputHeight)
 {
+  // This function assumes that
+  // kH*kW does not overflow an int
+  // nInputPlane*kH*kW does not overflow a long
+  // outputHeight*dH does not overflow a long
+  // outputWidth*dW does not overflow a long
+
   long k;
   real *input_data = THTensor_(data)(input);
   real *finput_data = THTensor_(data)(finput);
 
 #pragma omp parallel for private(k)
-  for(k = 0; k < nInputPlane*kH*kW; k++) {
-    size_t nip = k / (kH*kW);
-    size_t rest = k % (kH*kW);
-    size_t kh = rest / kW;
-    size_t kw = rest % kW;
-    size_t x,y;
-    long long ix,iy;
-    real *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
-    real *src = input_data + nip*(inputHeight*inputWidth);
+  for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
+    long nip = k / (kH*kW);
+    long rest = k % (kH*kW);
+    long kh = rest / kW;
+    long kw = rest % kW;
+    int x, y;
+    long ix, iy;
+    real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+    real *src = input_data + nip*((size_t)inputHeight*inputWidth);
     if (padW > 0 || padH > 0) {
-      size_t lpad,rpad;
+      long lpad,rpad;
       for(y = 0; y < outputHeight; y++) {
-        iy = (long long)(y*dH - padH + kh);
+        iy = (long)y*dH - padH + kh;
         if (iy < 0 || iy >= inputHeight) {
-          memset(dst+y*outputWidth, 0, sizeof(real)*outputWidth);
+          memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
         } else {
           if (dW==1){
-             ix = (long long)(0 - padW + kw);
-             lpad = fmaxf(0,(int)(padW-kw));
-             rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
+             ix = 0 - padW + kw;
+             lpad = fmaxf(0,padW-kw);
+             rpad = fmaxf(0,padW-(kW-kw-1));
              if (outputWidth-rpad-lpad <= 0) {
-                memset(dst+(size_t)(y*outputWidth), 0, sizeof(real)*outputWidth);
+                memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
              } else {
-                if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(real)*lpad);
-                memcpy(dst+(size_t)(y*outputWidth+lpad), src+(size_t)(iy*inputWidth+ix+lpad), sizeof(real)*(outputWidth-rpad-lpad));
-                if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+                if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
+                memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
+                if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
              }
           }
           else{
             for (x=0; x<outputWidth; x++){
-               ix = (long long)(x*dW - padW + kw);
+               ix = (long)x*dW - padW + kw;
                if (ix < 0 || ix >= inputWidth)
-                 memset(dst+(size_t)(y*outputWidth+x), 0, sizeof(real)*1);
+                 memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
                else
-                 memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix), sizeof(real)*(1));
+                 memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
             }
           }
         }
       }
     } else {
       for(y = 0; y < outputHeight; y++) {
-        iy = (long long)(y*dH + kh);
-        ix = (long long)(0 + kw);
+        iy = (long)y*dH + kh;
+        ix = 0 + kw;
         if (dW == 1)
-           memcpy(dst+(size_t)(y*outputWidth), src+(size_t)(iy*inputWidth+ix), sizeof(real)*outputWidth);
+           memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
         else{
           for (x=0; x<outputWidth; x++)
-             memcpy(dst+(size_t)(y*outputWidth+x), src+(size_t)(iy*inputWidth+ix+x*dW), sizeof(real)*(1));
+             memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
          }
       }
     }
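
The rewritten index arithmetic widens to size_t/long before multiplying;
previously the parenthesized products such as kH*kW*outputHeight*outputWidth
were evaluated in int and overflowed before the outer cast could help. The
sizes involved do not have to be exotic; a quick check in Lua (plain
arithmetic, doubles hold these values exactly):

    local nip, kH, kW, oH, oW = 512, 3, 3, 1000, 1000
    print(nip * kH * kW * oH * oW)   -- 4608000000, well past 2^31-1 = 2147483647
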
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index 8aae97b..6c64015 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -98,6 +98,9 @@
 #include "generic/LeakyReLU.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/FusedRNNKernel.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/LogSigmoid.c"
 #include "THGenerateFloatTypes.h"
 
@@ -149,6 +152,9 @@
 #include "generic/SparseLinear.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/IndexLinear.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/Sqrt.c"
 #include "THGenerateFloatTypes.h"
 
@@ -185,6 +191,9 @@
 #include "generic/SpatialConvolutionMM.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/SpatialDepthWiseConvolution.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/SpatialConvolutionLocal.c"
 #include "THGenerateFloatTypes.h"
 
diff --git a/rocks/nn-scm-1.rockspec b/rocks/nn-scm-1.rockspec
index 9b455d9..980f6ee 100644
--- a/rocks/nn-scm-1.rockspec
+++ b/rocks/nn-scm-1.rockspec
@@ -15,7 +15,8 @@ description = {
 
 dependencies = {
    "torch >= 7.0",
-   "luaffi"
+   "luaffi",
+   "moses >= 1.0"
 }
 
 build = {
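
The new moses dependency covers the functional helpers used by the new
tests and decorator modules below, e.g. _.isNaN in nntest.NaN and _.map in
nntest.DontCast. A two-line sanity check of the NaN predicate (illustrative
only):

    local _ = require 'moses'
    print(_.isNaN(0/0), _.isNaN(1))   -- true   false
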
diff --git a/test.lua b/test.lua
index ab1183d..4db53bc 100755
--- a/test.lua
+++ b/test.lua
@@ -167,6 +167,62 @@ function nntest.WeightNorm()
    err = nn.Jacobian.testJacobianParameters(model, input,
                                                 model.v, model.gradV)
    mytester:assert(err < precision, 'Spatial Convolution v')
+
+   -- linear save/load
+   model = nn.WeightNorm(nn.Linear(5, 20))
+   input = torch.rand(10, 5)
+   local out = model:forward(input)
+   local modelr = torch.deserialize(torch.serialize(model))
+   local outr = modelr:forward(input)
+   mytester:assertTensorEq(out, outr)
+end
+
+function nntest.LinearWeightNorm()
+   local input = torch.rand(10, 5)
+   local model = nn.LinearWeightNorm(5, 20)
+
+   -- check gradient
+   local err = nn.Jacobian.testJacobianParameters(model, input, model.bias, model.gradBias)
+   mytester:assert(err < precision, 'bias')
+   err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+   mytester:assert(err < precision, 'g')
+   err = nn.Jacobian.testJacobianParameters(model, input, model.v, model.gradV)
+   mytester:assert(err < precision, 'v')
+
+   -- check conversion functions
+   local linear = nn.Linear(5,20)
+   local wnFromLin = nn.LinearWeightNorm.fromLinear(linear)
+   local linFromWn = wnFromLin:toLinear()
+
+   local linOut = linear:forward(input)
+   local wnOut = wnFromLin:forward(input)
+   local linFromWnOut = linFromWn:forward(input)
+
+   mytester:assertTensorEq(linOut, wnOut, precision, "outputs are not equivalent")
+   mytester:assertTensorEq(wnOut, linFromWnOut, precision, "outputs are not equivalent")
+
+   -- check conversion with nobias
+   linear = nn.Linear(5,20,false)
+   wnFromLin = nn.LinearWeightNorm.fromLinear(linear)
+   linFromWn = wnFromLin:toLinear()
+
+   linOut = linear:forward(input)
+   wnOut = wnFromLin:forward(input)
+   linFromWnOut = linFromWn:forward(input)
+
+   mytester:assertTensorEq(linear.weight, wnFromLin.weight, precision, "weights are not equivalent")
+   mytester:assert(not wnFromLin.bias)
+   mytester:assert(not linear.bias)
+   mytester:assertTensorEq(linOut, wnOut, precision, "outputs are not equivalent")
+   mytester:assertTensorEq(wnOut, linFromWnOut, precision, "outputs are not equivalent")
+
+   -- check gradient with nobias
+   model = wnFromLin
+
+   err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+   mytester:assert(err < precision, 'g')
+   err = nn.Jacobian.testJacobianParameters(model, input, model.v, model.gradV)
+   mytester:assert(err < precision, 'v')
 end
 
 function nntest.CAdd()
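
For reference, both WeightNorm and the new LinearWeightNorm reparameterize
each weight row as w = g * v / ||v||, learning the direction v and the scale
g separately; the fromLinear/toLinear conversions exercised above just move
between the two parameterizations. A sketch of the reconstruction for a
single row (names are illustrative):

    -- rebuild a weight row from its weight-norm parameters
    local function rowWeight(g, v)     -- g: scalar, v: 1D direction
       return v * (g / v:norm())
    end
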
@@ -1412,6 +1468,189 @@ function nntest.SparseLinear()
    test_sparse_linear(1000, 1000, 10, 100)
 end
 
+local function testIndexLinear(bsize, iSize, oSize, nnz)
+   local inb = bsize
+   local ini = iSize
+   local inj = oSize
+
+   local ilinear  = nn.IndexLinear(ini,inj, true, nil, nil, nil, false)
+   local ilinear2 = nn.IndexLinear(ini,inj, true, nil, nil, nil, false)
+   local linear = nn.Linear(ini, inj)
+   ilinear.weight:zero()
+   ilinear.weight:copy(linear.weight:t():clone())
+   ilinear.bias = linear.bias:clone()
+   ilinear:zeroGradParameters()
+
+   ilinear2.weight:zero()
+   ilinear2.weight:copy(linear.weight:t():clone())
+   ilinear2.bias = linear.bias:clone()
+   ilinear2:zeroGradParameters()
+
+   linear:zeroGradParameters()
+
+   -- Create a random sparse vector
+   local input = {{},{}}
+   local flatInput = {torch.LongTensor(), torch.Tensor(), torch.LongTensor()}
+   local nonsparse = torch.zeros(inb, ini)
+   local sizes = flatInput[3]
+   sizes:resize(inb)
+   for i=1,inb do
+      sizes[i] = nnz
+      input[1][i] = torch.randperm(ini)[{{1,nnz}}]:long()
+      input[2][i] = torch.ones(nnz):uniform()
+      nonsparse[i]:scatter(1, input[1][i], input[2][i])
+   end
+   flatInput[1]:cat(input[1])
+   flatInput[2]:cat(input[2])
+
+   local gradOutput = torch.rand(inb, inj)
+   local cmps = {'weight', 'bias', 'gradBias'}
+   -- Check output wrt linear, non-batch
+   local actual = ilinear:forward({input[1][1], input[2][1]})
+   local actual2 = ilinear2:forward({input[1][1], input[2][1], flatInput[3][1]})
+   local expected = linear:forward(nonsparse[1])
+
+   local actualgi = ilinear:backward({input[1][1], input[2][1]}, gradOutput[1])
+   local actualgi2 = ilinear2:backward({input[1][1], input[2][1], flatInput[3][1]}, gradOutput[1])
+   local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+
+   ilinear:updateParameters(1)
+   ilinear2:updateParameters(1)
+   linear:updateParameters(1)
+
+   local err = (expected - actual):abs():max()
+   local err2 = (expected - actual2):abs():max()
+
+   local gierr = (expectedgi - actualgi[2]):abs():max()
+   local gierr2 = (expectedgi - actualgi2[2]):abs():max()
+
+   mytester:assertle(err, precision, 'error on result for tensor array')
+   mytester:assertle(gierr, precision, 'error on gradInput for tensor array')
+
+   mytester:assertle(err2, precision, 'error on result for batched tensor')
+   mytester:assertle(gierr2, precision, 'error on gradInput for batched tensor')
+
+   for _,var in ipairs(cmps) do
+      local err, err2
+      if var == 'weight' then
+         err = (ilinear[var]:t() - linear[var]):abs():max()
+         err2 = (ilinear2[var]:t() - linear[var]):abs():max()
+      else
+         err = (ilinear[var] - linear[var]):abs():max()
+         err2 = (ilinear2[var] - linear[var]):abs():max()
+      end
+      mytester:assertle(err, precision, 'error on '..var..' for tensor array')
+      mytester:assertle(err2, precision, 'error on '..var..' for batched tensor')
+   end
+   ilinear:zeroGradParameters()
+   ilinear2:zeroGradParameters()
+   linear:zeroGradParameters()
+
+   -- Check output wrt linear, batch
+   -- repeating this n times exercises the fast parameter-update path that caches the last input
+   local test_n_times = function(ntimes)
+      local actual, expected, actualgi, expectedgi
+      for i=1, ntimes do
+         actual = ilinear:forward(input)
+         actual2 = ilinear2:forward(flatInput)
+         expected = linear:forward(nonsparse)
+
+         actualgi = ilinear:backward(input, gradOutput)
+         actualgi2 = ilinear2:backward(flatInput, gradOutput)
+         expectedgi = linear:backward(nonsparse, gradOutput)
+      end
+      ilinear:updateParameters(1)
+      ilinear2:updateParameters(1)
+      linear:updateParameters(1)
+
+      local err = (expected - actual):abs():max()
+      local err2 = (expected - actual2):abs():max()
+
+      local gicheck = torch.Tensor():resizeAs(expectedgi)
+      local gicheck2 = actualgi2[2]
+
+      for i=1,#actualgi[2] do
+         gicheck[i]:copy(actualgi[2][i])
+      end
+      local gierr = (expectedgi - gicheck):abs():max()
+      local gierr2 = (expectedgi - gicheck2):abs():max()
+
+      mytester:assertle(err, precision, 'error on result for tensor array with ntimes = '..ntimes)
+      mytester:assertle(err2, precision, 'error on result for batched tensor with ntimes = '..ntimes)
+
+      mytester:assertle(gierr, precision, 'error on gradInput for tensor array with ntimes = '..ntimes)
+      mytester:assertle(gierr2, precision, 'error on gradInput for batched tensor with ntimes = '..ntimes)
+
+      for _,var in ipairs(cmps) do
+         local err, err2
+         if var == 'weight' then
+            err = (ilinear[var]:t() - linear[var]):abs():max()
+            err2 = (ilinear2[var]:t() - linear[var]):abs():max()
+         else
+            err = (ilinear[var] - linear[var]):abs():max()
+            err2 = (ilinear2[var] - linear[var]):abs():max()
+         end
+         mytester:assertle(err, precision, 'error on '..var..' for tensor array')
+         mytester:assertle(err2, precision, 'error on '..var..' for batched tensor')
+      end
+
+      ilinear:zeroGradParameters()
+      ilinear2:zeroGradParameters()
+      linear:zeroGradParameters()
+      mytester:assertle(ilinear.gradBias:sum(), precision, 'error zeroing gradbias for tensor array')
+      mytester:assertle(ilinear2.gradBias:sum(), precision, 'error zeroing gradbias for batched tensor')
+   end
+   test_n_times(1)
+   test_n_times(2)
+   test_n_times(3)
+end
+
+function nntest.IndexLinear()
+   testIndexLinear(4, 40, 10, 30)
+   testIndexLinear(4, 40, 500, 30)
+   testIndexLinear(4, 200000, 5, 150000)
+
+   local sizes = {
+      {osize = 1, isize = 10000, nnz = 10000, bsize = 16},
+      {osize = 10, isize = 10000, nnz = 10000, bsize = 16},
+      {osize = 100, isize = 10000, nnz = 10000, bsize = 16},
+
+      {osize = 1, isize = 10000, nnz = 200000, bsize = 1},
+      {osize = 10, isize = 10000, nnz = 200000, bsize = 1},
+      {osize = 100, isize = 10000, nnz = 200000, bsize = 1},
+
+      {osize = 1, isize = 10000, nnz = 200000, bsize = 2},
+      {osize = 10, isize = 10000, nnz = 200000, bsize = 2},
+      {osize = 100, isize = 10000, nnz = 200000, bsize = 2},
+   }
+
+   for i, lsizes in ipairs(sizes) do
+      -- Test multithreaded updates
+      local isize = lsizes.isize
+      local osize = lsizes.osize
+      local il = nn.IndexLinear(isize, osize)
+      local batch = {{},{}}
+      local idx = 100
+      local nnz = lsizes.nnz
+      local bsize = lsizes.bsize
+      for i=1,bsize do
+         batch[1][i] = torch.LongTensor(nnz):fill(idx)
+         batch[2][i] = torch.DoubleTensor(nnz):fill(1)
+      end
+      local totalSize = bsize*nnz
+      local lr = 0.01
+      -- Update the same index all over
+      local out = il:updateOutput(batch)
+      out:fill(1)
+      il:backwardUpdate(batch, out, lr)
+      il:backward(batch, out, 1)
+      il:updateParameters(lr)
+      for i=1,osize do
+         mytester:assertlt(math.abs(il.weight[idx][i] + totalSize * lr * 2), precision, 'parameters update was wrong.')
+      end
+   end
+end
+
 function nntest.Bilinear()
 
    -- set up data:
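
As the IndexLinear test above shows, the module accepts sparse batches in
two layouts: a pair of tables {indices, values} with one tensor per sample,
or a flattened triple {indices, values, sizes} where sizes holds each
sample's number of non-zeros. Building the same two-sample batch both ways
(sizes are illustrative):

    local idx1, val1 = torch.LongTensor{1, 4}, torch.Tensor{0.5, 2.0}
    local idx2, val2 = torch.LongTensor{3},    torch.Tensor{1.0}
    local batchTables = {{idx1, idx2}, {val1, val2}}
    local batchFlat   = {torch.cat(idx1, idx2), torch.cat(val1, val2),
                         torch.LongTensor{2, 1}}
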
@@ -2094,19 +2333,45 @@ function nntest.DistKLDivCriterion()
 end
 
 function nntest.ClassNLLCriterion()
+   local batchsize = math.random(2,4)
    local numLabels = math.random(5,10)
-   local input = torch.rand(numLabels)
-   local target = math.random(1,numLabels)
 
-   -- default ClassNLLCriterion
-   local cri = nn.ClassNLLCriterion()
-   criterionJacobianTest(cri, input, target)
+   local function testclassnll(input, target)
+      -- default ClassNLLCriterion
+      local cri = nn.ClassNLLCriterion()
+      criterionJacobianTest(cri, input, target)
 
-   -- ClassNLLCriterion with weights
-   local weights = torch.rand(numLabels)
-   weights = weights / weights:sum()
-   cri = nn.ClassNLLCriterion(weights)
-   criterionJacobianTest(cri, input, target)
+      -- ClassNLLCriterion with weights
+      local weights = torch.rand(numLabels)
+      weights = weights / weights:sum()
+      cri = nn.ClassNLLCriterion(weights)
+      criterionJacobianTest(cri, input, target)
+   end
+
+   -- input/target: 1D/number
+   testclassnll(torch.rand(numLabels), math.random(1,numLabels))
+   -- input/target: 1D/1D
+   testclassnll(torch.rand(numLabels), torch.LongTensor(1):random(1, numLabels))
+   -- input/target: 2D/1D
+   testclassnll(torch.rand(batchsize, numLabels), torch.LongTensor(batchsize):random(1,numLabels))
+   -- test ignoreIndex
+   local ignoreIndex = -1
+   local cri = nn.ClassNLLCriterion(nil, nil, ignoreIndex)
+   local input = torch.randn(numLabels)
+   local target = ignoreIndex
+   mytester:assert(cri:forward(input, target) == 0)
+   mytester:assert(cri:backward(input, target):abs():sum() == 0)
+   local input = torch.randn(batchsize, numLabels)
+   local target = torch.LongTensor(batchsize):random(1,numLabels)
+   target[1] = ignoreIndex
+   local output = cri:forward(input, target)
+   local gradInput = cri:backward(input, target):clone()
+   mytester:assert(gradInput[1]:abs():sum() == 0)
+   local input, target = input:sub(2,batchsize), target:sub(2,batchsize)
+   local output2 = cri:forward(input, target)
+   mytester:assert(math.abs(output2 - output) < 0.0000001)
+   local gradInput2 = cri:backward(input, target)
+   mytester:assertTensorEq(gradInput2, gradInput:sub(2,batchsize), 0.0000001)
 end
 
 function nntest.SpatialClassNLLCriterion()
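
The third constructor argument exercised above is the new ignoreIndex:
targets equal to it contribute neither loss nor gradient, which is what the
sub-batch comparison verifies. Minimal usage (illustrative only):

    local crit = nn.ClassNLLCriterion(nil, nil, -1)   -- weights, sizeAverage, ignoreIndex
    local loss = crit:forward(torch.randn(5), -1)     -- 0: this target is ignored
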
@@ -4220,6 +4485,32 @@ function nntest.Sum()
    local err       = jac.testJacobian(module, input)
    mytester:assertlt(err,precision, 'error on state ')
 
+   -- squeeze
+   local dimension = 1
+   local module    = nn.Sum(dimension, nil, nil, false)
+   local input     = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected  = torch.Tensor({5, 7, 9}):view(1, 3)
+   local output    = module:forward(input)
+
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+   mytester:assert(output:isSameSizeAs(expected), 'sizes mismatch')
+
+   local err       = jac.testJacobian(module, input)
+   mytester:assertlt(err,precision, 'error on state ')
+
+   -- squeeze + batch
+   local dimension = 1
+   local module    = nn.Sum(dimension, 1, nil, false)
+   local input     = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+   local expected  = torch.Tensor({6, 15}):view(2, 1)
+   local output    = module:forward(input)
+
+   mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+   mytester:assert(output:isSameSizeAs(expected), 'sizes mismatch')
+
+   local err       = jac.testJacobian(module, input)
+   mytester:assertlt(err,precision, 'error on state ')
+
    -- 3D
    local ini = math.random(3,5)
    local inj = math.random(3,5)
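
These cases pin down the new fourth constructor argument of nn.Sum: with
squeeze = false the summed dimension is kept with size 1 instead of being
dropped, and the second argument still marks the number of non-batch input
dimensions. Restating the first case above:

    local m = nn.Sum(1, nil, nil, false)
    m:forward(torch.Tensor{{1, 2, 3}, {4, 5, 6}})   -- 1x3 tensor {{5, 7, 9}}
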
@@ -6646,6 +6937,24 @@ function nntest.MapTable()
       == torch.pointer(map:get(1).weight:storage()))
    map:clearState()
    mytester:assert(map:size() == 1)
+
+   -- check if gradients are correctly reset
+   -- (shared weights and gradients)
+   map = nn.MapTable(nn.Linear(10, 5))
+   map:forward(input)
+   _, gradParams = map:getParameters()
+   gradParams:uniform()
+   map:zeroGradParameters()
+   mytester:assertlt(gradParams:sum(), precision)
+
+   -- check if gradients are correctly reset
+   -- (unshared weights and gradients)
+   map = nn.MapTable(nn.Linear(10, 5), false)
+   map:forward(input)
+   _, gradParams = map:getParameters()
+   gradParams:uniform()
+   map:zeroGradParameters()
+   mytester:assertlt(gradParams:sum(), precision)
 end
 
 function nntest.FlattenTable()
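
The added checks confirm that zeroGradParameters reaches every replica
whether or not MapTable shares parameters across them. For context, MapTable
applies one module independently to each element of a table input (sizes are
illustrative):

    local map  = nn.MapTable(nn.Linear(10, 5))   -- replicas share weights by default
    local outs = map:forward({torch.randn(10), torch.randn(10)})   -- table of two 5-vectors
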
@@ -7874,6 +8183,210 @@ function nntest.GPU()
    -- is located in cunn package.
 end
 
+function nntest.Profile()
+   local mx_overhead = 0.05
+   local print_every = 3
+   local net = nn.Profile(nn.Linear(3,4), print_every)
+   local input, gradOutput = torch.randn(1, 3), torch.randn(1, 4)
+   local output, gradInput = net:forward(input), net:backward(input, gradOutput)
+   mytester:assertTensorEq(net.modules[1].output, output, 0.000001)
+   mytester:assertTensorEq(net.modules[1].gradInput, gradInput, 0.000001)
+end
+
+function nntest.NaN()
+   local _ = require 'moses'
+   local input = torch.randn(2,3)
+   local gradOutput = torch.randn(2,4)
+   local lin = nn.Linear(3,4)
+   lin:zeroGradParameters()
+   local nan = nn.NaN(lin)
+   mytester:assert(nan.id == 1)
+   -- test that it works when no NaNs are present
+   local output = nan:forward(input):clone()
+   local gradInput = nan:backward(input, gradOutput):clone()
+   local gradWeight = lin.gradWeight:clone()
+   local gradBias = lin.gradBias:clone()
+   lin:zeroGradParameters()
+   local output2 = lin:forward(input)
+   local gradInput2 = lin:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output2, 0.000001)
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+   mytester:assertTensorEq(gradWeight, lin.gradWeight, 0.000001)
+   mytester:assertTensorEq(gradBias, lin.gradBias, 0.000001)
+   -- test with some NaNs
+   input:zero():log():log()
+   local sum = input:sum()
+   mytester:assert(_.isNaN(sum))
+   mytester:assert(not pcall(function() nan:forward(input) end))
+   lin.bias:fill(sum)
+   input = torch.randn(2,3)
+   mytester:assert(not pcall(function() nan:forward(input) end))
+   lin.bias:uniform(0,1)
+   gradOutput:fill(sum)
+   mytester:assert(not pcall(function() nan:backward(input, gradOutput) end))
+   gradOutput:uniform(0,1)
+   lin.gradBias:fill(sum)
+   mytester:assert(not pcall(function() nan:backward(input, gradOutput) end))
+end
+
+function nntest.DontCast()
+   local input = torch.randn(3,4)
+   local gradOutput = torch.randn(3,2)
+   local linear = nn.Linear(4,2):float()
+   local mlp = nn.DontCast(linear, true)
+   linear:zeroGradParameters()
+   local linear = linear:clone()
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assert(torch.type(output) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor')
+   local output2 = linear:forward(input:float())
+   local gradInput2 = linear:backward(input:float(), gradOutput:float())
+   mytester:assertTensorEq(output:float(), output2, 0.000001)
+   mytester:assertTensorEq(gradInput:float(), gradInput2, 0.000001)
+   local mlp3 = nn.DontCast(linear:clone())
+   mlp3:zeroGradParameters()
+   local output3 = mlp3:forward(input:float())
+   local gradInput3 = mlp3:backward(input:float(), gradOutput:float())
+   mytester:assert(torch.type(output3) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3, output2, 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput2, 0.000001)
+
+   mlp:float()
+   local output4 = mlp:forward(input:float())
+   local gradInput4 = mlp:backward(input:float(), gradOutput:float())
+   mytester:assert(torch.type(output4) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3, output4, 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput4, 0.000001)
+   mlp:double()
+   mytester:assert(torch.type(linear.output) == 'torch.FloatTensor')
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assert(torch.type(output4) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3, output:float(), 0.000001)
+   mytester:assertTensorEq(gradInput3, gradInput:float(), 0.000001)
+
+   -- test table inputs/outputs
+   local input = {torch.randn(3,4), torch.randn(3,4)}
+   local gradOutput = {torch.randn(3,2), torch.randn(3,2)}
+   local linear = nn.ParallelTable():add(nn.Linear(4,2)):add(nn.Linear(4,2)):float()
+   local mlp = nn.DontCast(linear, true)
+   linear:zeroGradParameters()
+   local linear = linear:clone()
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assert(torch.type(output[1]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput[1]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(output[2]) == 'torch.DoubleTensor')
+   mytester:assert(torch.type(gradInput[2]) == 'torch.DoubleTensor')
+   local _ = require 'moses'
+   local finput = _.map(input, function(k,v) return v:float() end)
+   local foutput = _.map(output, function(k,v) return v:float() end)
+   local fgradInput = _.map(gradInput, function(k,v) return v:float() end)
+   local fgradOutput = _.map(gradOutput, function(k,v) return v:float() end)
+   local output2 = linear:forward(finput)
+   local gradInput2 = linear:backward(finput, fgradOutput)
+   mytester:assertTensorEq(foutput[1], output2[1], 0.000001)
+   mytester:assertTensorEq(foutput[2], output2[2], 0.000001)
+   mytester:assertTensorEq(fgradInput[1], gradInput2[1], 0.000001)
+   mytester:assertTensorEq(fgradInput[2], gradInput2[2], 0.000001)
+   local mlp3 = nn.DontCast(linear:clone())
+   mlp3:zeroGradParameters()
+   local output3 = mlp3:forward(finput)
+   local gradInput3 = mlp3:backward(finput, fgradOutput)
+   mytester:assert(torch.type(output3[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(output3[2]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput3[2]) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3[1], output2[1], 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput2[1], 0.000001)
+   mytester:assertTensorEq(output3[2], output2[2], 0.000001)
+   mytester:assertTensorEq(gradInput3[2], gradInput2[2], 0.000001)
+   mlp:float()
+   local output4 = mlp:forward(finput)
+   local gradInput4 = mlp:backward(finput, fgradOutput)
+   mytester:assert(torch.type(output4[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(output4[2]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(gradInput4[2]) == 'torch.FloatTensor')
+   mytester:assertTensorEq(output3[1], output4[1], 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput4[1], 0.000001)
+   mytester:assertTensorEq(output3[2], output4[2], 0.000001)
+   mytester:assertTensorEq(gradInput3[2], gradInput4[2], 0.000001)
+   mlp:double()
+   mytester:assert(torch.type(linear.output) == 'table')
+   mytester:assert(torch.type(linear.output[1]) == 'torch.FloatTensor')
+   mytester:assert(torch.type(linear.output[2]) == 'torch.FloatTensor')
+   local output = mlp:forward(input)
+   local gradInput = mlp:backward(input, gradOutput)
+   mytester:assertTensorEq(output3[1], output[1]:float(), 0.000001)
+   mytester:assertTensorEq(gradInput3[1], gradInput[1]:float(), 0.000001)
+end
+
+function nntest.SpatialDepthWiseConvolution()
+   local epsilon = 0.00001
+
+   local SC = nn.SpatialConvolution
+   local SDWC = nn.SpatialDepthWiseConvolution
+
+   local function spatialDepthWiseConv(
+         nInputPlane, multiplier, kernel, stride, padding, inputSize, weight, bias
+      )
+      local conv = SDWC(nInputPlane, multiplier, kernel, kernel, stride, stride, padding, padding)
+      conv.weight = weight
+      conv.bias = bias
+      return conv
+   end
+
+   -- Utility spatialDepthWiseConv_util() function --------------------------------
+   -- By Alfredo Canziani, alfredo.canziani at gmail.com -----------------------------
+   local function spatialDepthWiseConv_util(
+         nInputPlane, multiplier, kernel, stride, padding, inputSize, weight, bias
+      )
+
+      local conv = nn.Sequential()
+      conv:add(nn.Contiguous())
+      conv:add(nn.View(-1, 1, inputSize, inputSize))
+      conv:add(SC(1, multiplier, kernel, kernel, stride, stride, padding, padding))
+
+      local depthWiseConv = nn.Parallel(2, 2)
+      for channel = 1, nInputPlane do
+         local tempConv = conv:clone()
+         tempConv:get(3).weight = weight:narrow(2, channel, 1):clone()
+         tempConv:get(3).bias = bias:select(2, channel):clone()
+         depthWiseConv:add(tempConv)
+      end
+      depthWiseConv:add(nn.Contiguous())
+      return depthWiseConv
+   end
+
+   local n = 3 -- nInputPlane
+   local s = 28 -- input height and width
+   local b = 3 -- batch size
+   local m = 4 -- multiplier
+   local k = 3 -- kernel size
+   local p = 1 -- padding
+   local st = 1 -- stride
+
+   local testBatch = 1e3 -- number of repetitions
+
+   local X = torch.rand(b, n, s, s) -- b x n x s x s images (3x3x28x28 here)
+   local weight = torch.rand(m, n, k, k) -- weight
+   local bias = torch.rand(m, n) -- bias
+
+   local model = spatialDepthWiseConv(n, m, k, st, p, s, weight, bias)
+   local model_util = spatialDepthWiseConv_util(n, m, k, st, p, s, weight, bias)
+
+   local Y_util = model_util:forward(X)
+   local Y = model:forward(X)
+
+   local abs_diff = Y_util:clone():csub(Y):abs()
+   mytester:assert(torch.all(abs_diff:lt(epsilon)))
+end
+
 mytester:add(nntest)
 
 jac = nn.Jacobian
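
The DontCast test above is dense, but the module's contract is simple: the
wrapped module keeps its own tensor type even when the container is cast
with :float()/:double(), and the second constructor flag (true above)
additionally converts tensors at the wrapper boundary. Judging by that test,
a minimal sketch looks like:

    local inner = nn.Linear(4, 2):float()   -- stays float regardless of outer casts
    local net   = nn.DontCast(inner, true)  -- cast activations at the boundary
    local y     = net:forward(torch.randn(3, 4))   -- double in, double out, float inside
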
diff --git a/test/benchmarks/IndexLinear.lua b/test/benchmarks/IndexLinear.lua
new file mode 100644
index 0000000..f6b291a
--- /dev/null
+++ b/test/benchmarks/IndexLinear.lua
@@ -0,0 +1,323 @@
+local cudaAvailable, _ = pcall(require, 'cunn')
+
+local function benchmark(opt)
+   local isize = opt.inputSize or 100000
+   local osize = opt.outputSize or 1
+   local weightDecay = opt.weightDecay or 0
+   local nnzMin = opt.featuresMinNumber or 1
+   local nnzMax = opt.featuresMaxNumber or 10000
+   local idxMin = 1
+   local idxMax = isize
+   local ntests = opt.ntests or 10
+   local batchSize = opt.batchSize or 1
+   local lr = opt.learningRate or 0.01
+   torch.setdefaulttensortype('torch.FloatTensor')
+
+   local ilcpu = nn.IndexLinear(isize, osize, nil, nil, nil, nil, nil):float()
+   nn.IndexLinear(isize, osize):float()
+   ilcpu.weightDecay = weightDecay
+   ilcpu.weight:uniform()
+   ilcpu.bias:fill(1)
+
+   local slcpu = nn.SparseLinear(isize, osize):float()
+   slcpu.weightDecay = weightDecay
+   slcpu.weight:uniform()
+   slcpu.bias:copy(ilcpu.bias)
+
+   local ilgpu, slgpu
+   if cudaAvailable then
+      ilgpu = nn.IndexLinear(isize, osize, nil, nil, nil, nil, nil):cuda()
+      nn.IndexLinear(isize, osize):float():cuda()
+      ilgpu.weightDecay = weightDecay
+      ilgpu.weight:copy(ilcpu.weight)
+      ilgpu.bias:copy(ilcpu.bias)
+
+      slgpu = nn.SparseLinear(isize, osize):cuda()
+      slgpu.weightDecay = weightDecay
+      slgpu.weight:copy(slcpu.weight)
+      slgpu.bias:copy(ilcpu.bias)
+   end
+
+   -- Batch preparation for SparseLinearCPU and IndexLinearCPU formats
+   local batchesILCPU = {}
+   local batchesSLCPU = {}
+   local batchesILGPU = {}
+   local batchesSLGPU = {}
+   local gradOutsILCPU = {}
+   local gradOutsSLCPU = {}
+   local gradOutsILGPU = {}
+   local gradOutsSLGPU = {}
+   local tot = 0
+   local samples = 0
+   for j=1,ntests do
+      local batchILCPU = {{}, {}}
+      local batchILGPU = {{}, {}}
+      for i=1,batchSize do
+         local n = torch.random(nnzMin, nnzMax)
+         local indices = idxMin + torch.LongTensor():randperm(idxMax - idxMin)
+         batchILCPU[1][i] = indices[{{1,n}}]
+         batchILCPU[2][i] = torch.FloatTensor(n):uniform()
+         if cudaAvailable then
+            batchILGPU[1][i] = torch.CudaLongTensor(n):copy(batchILCPU[1][i])
+            batchILGPU[2][i] = torch.CudaTensor(n):copy(batchILCPU[2][i])
+         end
+         tot = tot + n
+      end
+      samples = samples + batchSize
+      batchesILCPU[j] = batchILCPU
+      if cudaAvailable then
+         batchesILGPU[j] = batchILGPU
+      end
+   end
+
+   for j=1,ntests do
+      local batchSLCPU = {}
+      local batchSLGPU = {}
+      for i=1,#batchesILCPU[j][1] do
+         batchSLCPU[i] = torch.FloatTensor(batchesILCPU[j][1][i]:size(1), 2)
+         batchSLCPU[i][{{}, {1,1}}]:copy(batchesILCPU[j][1][i])
+         batchSLCPU[i][{{}, {2,2}}]:copy(batchesILCPU[j][2][i])
+
+         if cudaAvailable then
+            batchSLGPU[i] = torch.CudaTensor(batchesILCPU[j][1][i]:size(1), 2)
+            batchSLGPU[i][{{}, {1,1}}]:copy(batchesILCPU[j][1][i])
+            batchSLGPU[i][{{}, {2,2}}]:copy(batchesILCPU[j][2][i])
+         end
+      end
+      batchesSLCPU[j] = batchSLCPU
+      if cudaAvailable then
+         batchesSLGPU[j] = batchSLGPU
+      end
+   end
+   for i=1,ntests do
+      gradOutsILCPU[i] = torch.FloatTensor(#batchesILCPU[i][1], osize):uniform()
+      gradOutsSLCPU[i] = torch.FloatTensor(#batchesILCPU[i][1], osize):copy(gradOutsILCPU[i])
+
+      if cudaAvailable then
+         gradOutsILGPU[i] = torch.CudaTensor(#batchesILCPU[i][1], osize):copy(gradOutsILCPU[i])
+         gradOutsSLGPU[i] = torch.CudaTensor(#batchesILCPU[i][1], osize):copy(gradOutsILCPU[i])
+      end
+   end
+   if cudaAvailable then cutorch.synchronize() end
+
+   local timings = {}
+   local timer = torch.Timer()
+
+   ilcpu:evaluate()
+   slcpu:evaluate()
+
+   if cudaAvailable then
+      ilgpu:evaluate()
+      slgpu:evaluate()
+   end
+
+   -- Dry-run the forward pass
+   -- to allocate stuff
+   for i=1,ntests do
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+   end
+   for i=1,ntests do
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+   end
+
+   if cudaAvailable then
+      for i=1,ntests do
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+      end
+      for i=1,ntests do
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+      end
+      cutorch.synchronize()
+   end
+
+   timings[1] = {ILCPU = timer:time().real}
+   for i=1,ntests do
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+   end
+   timings[1].ILCPU = (timer:time().real - timings[1].ILCPU) / (ntests)
+   timings[1].SLCPU = timer:time().real
+   for i=1,ntests do
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+   end
+   timings[1].SLCPU = (timer:time().real - timings[1].SLCPU) / (ntests)
+
+   if cudaAvailable then
+      timings[1].SLGPU = timer:time().real
+      for i=1,ntests do
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+      end
+      cutorch:synchronize()
+      timings[1].SLGPU = (timer:time().real - timings[1].SLGPU) / (ntests)
+      timings[1].ILGPU = timer:time().real
+      for i=1,ntests do
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+      end
+      cutorch:synchronize()
+      timings[1].ILGPU = (timer:time().real - timings[1].ILGPU) / (ntests)
+   end
+
+   -- Dry-run the zero bwd pass
+   -- to allocate stuff
+   for i=1,ntests do
+      ilcpu:zeroGradParameters()
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+      ilcpu:backward(batchesILCPU[i], gradOutsILCPU[i])
+      ilcpu:updateParameters(lr)
+   end
+   for i=1,ntests do
+      slcpu:zeroGradParameters()
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+      slcpu:backward(batchesSLCPU[i], gradOutsSLCPU[i])
+      slcpu:updateParameters(lr)
+   end
+
+   if cudaAvailable then
+      for i=1,ntests do
+         slgpu:zeroGradParameters()
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+         slgpu:backward(batchesSLGPU[i], gradOutsSLGPU[i])
+         slgpu:updateParameters(lr)
+      end
+      cutorch:synchronize()
+      for i=1,ntests do
+         ilgpu:zeroGradParameters()
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+         ilgpu:backward(batchesILGPU[i], gradOutsILGPU[i])
+         ilgpu:updateParameters(lr)
+      end
+      cutorch:synchronize()
+   end
+
+   timings[2] = {ILCPU = timer:time().real}
+   for i=1,ntests do
+      ilcpu:zeroGradParameters()
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+      ilcpu:backward(batchesILCPU[i], gradOutsILCPU[i])
+      ilcpu:updateParameters(lr)
+   end
+   timings[2].ILCPU = (timer:time().real - timings[2].ILCPU) / (ntests)
+   timings[2].SLCPU = timer:time().real
+   for i=1,ntests do
+      slcpu:zeroGradParameters()
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+      slcpu:backward(batchesSLCPU[i], gradOutsSLCPU[i])
+      slcpu:updateParameters(lr)
+   end
+   timings[2].SLCPU = (timer:time().real - timings[2].SLCPU) / (ntests)
+
+   if cudaAvailable then
+      timings[2].SLGPU = timer:time().real
+      for i=1,ntests do
+         slgpu:zeroGradParameters()
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+         slgpu:backward(batchesSLGPU[i], gradOutsSLGPU[i])
+         slgpu:updateParameters(lr)
+      end
+      cutorch:synchronize()
+      timings[2].SLGPU = (timer:time().real - timings[2].SLGPU) / (ntests)
+      timings[2].ILGPU = timer:time().real
+      for i=1,ntests do
+         ilgpu:zeroGradParameters()
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+         ilgpu:backward(batchesILGPU[i], gradOutsILGPU[i])
+         ilgpu:updateParameters(lr)
+      end
+      cutorch:synchronize()
+      timings[2].ILGPU = (timer:time().real - timings[2].ILGPU) / (ntests)
+   end
+
+   -- Dry-run the bwd update pass
+   -- to allocate stuff
+   for i=1,ntests do
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+      ilcpu:backwardUpdate(batchesILCPU[i], gradOutsILCPU[i], lr)
+   end
+   for i=1,ntests do
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+      slcpu:backwardUpdate(batchesSLCPU[i], gradOutsSLCPU[i], lr)
+   end
+
+   if cudaAvailable then
+      for i=1,ntests do
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+         slgpu:backwardUpdate(batchesSLGPU[i], gradOutsSLGPU[i], lr)
+      end
+      cutorch:synchronize()
+      for i=1,ntests do
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+         ilgpu:backwardUpdate(batchesILGPU[i], gradOutsILGPU[i], lr)
+      end
+      cutorch:synchronize()
+   end
+
+   timings[3] = {ILCPU = timer:time().real}
+   for i=1,ntests do
+      outILCPU = ilcpu:forward(batchesILCPU[i])
+      ilcpu:backwardUpdate(batchesILCPU[i], gradOutsILCPU[i], lr)
+   end
+   timings[3].ILCPU = (timer:time().real - timings[3].ILCPU) / (ntests)
+   timings[3].SLCPU = timer:time().real
+   for i=1,ntests do
+      outSLCPU = slcpu:forward(batchesSLCPU[i])
+      slcpu:backwardUpdate(batchesSLCPU[i], gradOutsSLCPU[i], lr)
+   end
+   timings[3].SLCPU = (timer:time().real - timings[3].SLCPU) / (ntests)
+
+   if cudaAvailable then
+      timings[3].SLGPU = timer:time().real
+      for i=1,ntests do
+         outSLGPU = slgpu:forward(batchesSLGPU[i])
+         slgpu:backwardUpdate(batchesSLGPU[i], gradOutsSLGPU[i], lr)
+      end
+      cutorch:synchronize()
+      timings[3].SLGPU = (timer:time().real - timings[3].SLGPU) / (ntests)
+      timings[3].ILGPU = timer:time().real
+      for i=1,ntests do
+         outILGPU = ilgpu:forward(batchesILGPU[i])
+         ilgpu:backwardUpdate(batchesILGPU[i], gradOutsILGPU[i], lr)
+      end
+      cutorch:synchronize()
+      timings[3].ILGPU = (timer:time().real - timings[3].ILGPU) / (ntests)
+   end
+
+   return timings
+end
+
+local formatStr = "forward: %4.4f, forward + backward + updateParams: %4.4f, forward + backwardUpdate: %4.4f"
+local param = {}
+for _, inputSize in ipairs{100000, 1000000} do
+   param.inputSize = inputSize
+   print(string.format("InputSize: %7d", inputSize))
+   for _, outputSize in ipairs{1, 100, 250} do
+      param.outputSize = outputSize
+      print(string.format("  OutputSize: %3d", outputSize))
+      for _, featSize in ipairs{100, 1000, 10000} do
+         param.featuresMinNumber = featSize / 2
+         param.featuresMaxNumber = featSize
+         print(string.format("    NumFeatures: [%4d to %4d]", featSize / 2, featSize))
+         for _, batchSize in ipairs{1, 32, 256} do
+            param.batchSize = batchSize
+            print(string.format("      BatchSize: %3d", batchSize))
+
+            local timings = benchmark(param)
+            print(string.format("        SL on CPU / IL on CPU - " .. formatStr,
+                                timings[1].SLCPU / timings[1].ILCPU,
+                                timings[2].SLCPU / timings[2].ILCPU,
+                                timings[3].SLCPU / timings[3].ILCPU))
+
+            if cudaAvailable then
+               print(string.format("        SL on GPU / IL on GPU - " .. formatStr,
+                                   timings[1].SLGPU / timings[1].ILGPU,
+                                   timings[2].SLGPU / timings[2].ILGPU,
+                                   timings[3].SLGPU / timings[3].ILGPU))
+
+               print(string.format("        IL on CPU / IL on GPU - " .. formatStr,
+                                   timings[1].ILCPU / timings[1].ILGPU,
+                                   timings[2].ILCPU / timings[2].ILGPU,
+                                   timings[3].ILCPU / timings[3].ILGPU))
+
+            end
+         end
+      end
+   end
+end

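Every phase of the benchmark above is timed the same way: warm up once to
amortize allocations, snapshot timer:time().real, run ntests iterations, and
divide the elapsed delta by ntests. The idiom in isolation (run_once and
ntests are placeholders):

    local timer = torch.Timer()
    local t0 = timer:time().real
    for i = 1, ntests do run_once() end
    local perIter = (timer:time().real - t0) / ntests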