[lua-torch-nn] 01/05: Imported Upstream version 0~20160812-g461701f+dfsg

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sat Aug 13 00:44:11 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 2b6ee42f780f5795cc1a0bf679dd3e29d0adb88f
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Sat Aug 13 00:28:15 2016 +0000

    Imported Upstream version 0~20160812-g461701f+dfsg
---
 BatchNormalization.lua                             |  23 +-
 Bottle.lua                                         |  65 +++
 GPU.lua                                            | 273 +++++++++++++
 HardTanh.lua                                       |  12 +-
 Index.lua                                          |   9 +-
 LookupTable.lua                                    |  10 +-
 MarginRankingCriterion.lua                         |  28 +-
 Narrow.lua                                         |  10 +-
 ReLU6.lua                                          |  32 ++
 Select.lua                                         |  10 +-
 SpatialDilatedConvolution.lua                      |  10 +-
 ...lMaxPooling.lua => SpatialDilatedMaxPooling.lua |  53 +--
 SpatialFullConvolution.lua                         |  23 +-
 SpatialMaxPooling.lua                              |   2 +
 SpatialUpSamplingBilinear.lua                      | 111 ++++++
 VolumetricDilatedConvolution.lua                   | 103 +++++
 VolumetricReplicationPadding.lua                   |  58 +++
 doc/containers.md                                  |  32 ++
 doc/convolution.md                                 | 121 +++++-
 doc/criterion.md                                   |   2 +-
 doc/image/relu6.png                                | Bin 0 -> 20076 bytes
 doc/module.md                                      |   4 +-
 doc/simple.md                                      |  48 +++
 doc/transfer.md                                    |  27 +-
 hessian.lua                                        |   2 +-
 init.lua                                           |   8 +
 lib/THNN/generic/ClassNLLCriterion.c               |  15 +-
 lib/THNN/generic/HardTanh.c                        | 113 ++++--
 lib/THNN/generic/LookupTable.c                     |  16 +-
 lib/THNN/generic/MultiLabelMarginCriterion.c       |   6 +-
 lib/THNN/generic/MultiMarginCriterion.c            |   6 +-
 lib/THNN/generic/SpatialAdaptiveMaxPooling.c       |  18 +-
 lib/THNN/generic/SpatialClassNLLCriterion.c        |  12 +-
 lib/THNN/generic/SpatialConvolutionMM.c            |   4 +
 lib/THNN/generic/SpatialConvolutionMap.c           |  18 +-
 lib/THNN/generic/SpatialDilatedConvolution.c       |   2 +
 lib/THNN/generic/SpatialFractionalMaxPooling.c     |   6 +-
 lib/THNN/generic/SpatialFullConvolution.c          |  43 +-
 lib/THNN/generic/SpatialFullConvolutionMap.c       |  18 +-
 lib/THNN/generic/SpatialMaxPooling.c               |  52 ++-
 lib/THNN/generic/SpatialMaxUnpooling.c             |  16 +-
 lib/THNN/generic/SpatialUpSamplingBilinear.c       | 127 ++++++
 lib/THNN/generic/THNN.h                            | 223 +++++++----
 lib/THNN/generic/VolumetricConvolutionMM.c         |   4 +
 lib/THNN/generic/VolumetricDilatedConvolution.c    | 356 +++++++++++++++++
 lib/THNN/generic/VolumetricFullConvolution.c       |  39 +-
 lib/THNN/generic/VolumetricReplicationPadding.c    | 301 ++++++++++++++
 lib/THNN/init.c                                    |   9 +
 test.lua                                           | 442 ++++++++++++++++++---
 49 files changed, 2535 insertions(+), 387 deletions(-)

diff --git a/BatchNormalization.lua b/BatchNormalization.lua
index ac42749..578f441 100644
--- a/BatchNormalization.lua
+++ b/BatchNormalization.lua
@@ -74,12 +74,15 @@ function BN:reset()
 end
 
 function BN:checkInputDim(input)
-   assert(input:dim() == self.nDim, string.format(
+   local iDim = input:dim()
+   assert(iDim == self.nDim or
+              (iDim == self.nDim - 1 and self.train == false), string.format(
       'only mini-batch supported (%dD tensor), got %dD tensor instead',
-      self.nDim, input:dim()))
-   assert(input:size(2) == self.running_mean:nElement(), string.format(
+      self.nDim, iDim))
+   local featDim = (iDim == self.nDim - 1) and 1 or 2
+   assert(input:size(featDim) == self.running_mean:nElement(), string.format(
       'got %d-feature tensor, expected %d',
-      input:size(2), self.running_mean:nElement()))
+      input:size(featDim), self.running_mean:nElement()))
 end
 
 local function makeContiguous(self, input, gradOutput)
@@ -98,10 +101,20 @@ local function makeContiguous(self, input, gradOutput)
    return input, gradOutput
 end
 
+local function makeBatch(self, input)
+    local iDim = input:dim()
+    if self.train == false and iDim == self.nDim - 1 then
+        return nn.utils.addSingletonDimension(input, input, 1)
+    else
+        return input
+    end
+end
+
 function BN:updateOutput(input)
    self:checkInputDim(input)
 
    input = makeContiguous(self, input)
+   input = makeBatch(self, input)
 
    self.output:resizeAs(input)
    self.save_mean = self.save_mean or input.new()
@@ -131,6 +144,8 @@ local function backward(self, input, gradOutput, scale, gradInput, gradWeight, g
    assert(self.save_mean and self.save_std, 'must call :updateOutput() first')
 
    input, gradOutput = makeContiguous(self, input, gradOutput)
+   input = makeBatch(self, input)
+   gradOutput = makeBatch(self, gradOutput)
 
    scale = scale or 1
    if gradInput then
diff --git a/Bottle.lua b/Bottle.lua
new file mode 100644
index 0000000..6934bff
--- /dev/null
+++ b/Bottle.lua
@@ -0,0 +1,65 @@
+local Bottle, parent = torch.class("nn.Bottle", "nn.Container")
+local unpack = unpack or table.unpack
+
+function Bottle:__init(module, nInputDim, nOutputDim)
+   parent.__init(self)
+   self.nInputDim = nInputDim or 2
+   self.nOutputDim = nOutputDim or self.nInputDim
+   self.dimDelta = self.nInputDim - self.nOutputDim
+   -- Used to reshape the gradients
+   self.inShape = torch.Tensor(self.nInputDim)
+   self.outShape = torch.Tensor(self.nOutputDim)
+   -- add module to modules
+   self.modules[1] = module
+end
+
+function Bottle:updateOutput(input)
+   -- first batchDims dimensions will be fused
+   local batchDims = input:dim() - self.nInputDim + 1 
+   -- see if bottle is required
+   if batchDims > 1 then
+      -- bottle the first dims
+      local inSize = torch.LongTensor(input:size())
+      local squeezeSize = inSize[{{1, batchDims - 1}}]:prod()
+      self.inShape:copy(inSize[{{batchDims, input:dim()}}])
+      self.inShape[{{1}}]:mul(squeezeSize)
+      -- Forward with the module's dimension
+      local newInput = input:view(unpack(self.inShape:totable()))
+      local output = self.modules[1]:updateOutput(newInput)
+      assert(output:dim() == self.nOutputDim,
+	     "Wrong number of output dims on module. Expected: " ..
+		self.nOutputDim .. ' but got ' .. 
+		tostring(output and output:dim()))
+      self.outShape:copy(torch.LongTensor(output:size()))
+      if math.abs(self.dimDelta) > 0 then
+         inSize:resize(inSize:size(1) - self.dimDelta)
+      end
+      inSize[{{batchDims, inSize:size(1)}}]:copy(self.outShape)
+      inSize[{{batchDims}}]:div(squeezeSize)
+      -- unbottle
+      self.output:set(output:view(unpack(torch.totable(inSize))))
+   else
+      self.output:set(self.modules[1]:updateOutput(input))
+   end
+   return self.output
+end
+
+function Bottle:updateGradInput(input, gradOutput)
+   if input:dim() > self.nInputDim then
+      local input_ = input:view(unpack(self.inShape:totable()))
+      local gradOutput_ = gradOutput:view(unpack(self.outShape:totable()))
+      self.modules[1]:updateGradInput(input_, gradOutput_)
+      self.gradInput:set(self.modules[1].gradInput:viewAs(input))
+   else
+      self.gradInput:set(self.modules[1]:updateGradInput(input))
+   end
+   return self.gradInput
+end
+
+function Bottle:accGradParameters(input, gradOutput, scale)
+   if input:dim() > self.nInputDim then
+      input = input:view(unpack(self.inShape:totable()))
+      gradOutput = gradOutput:view(unpack(self.outShape:totable()))      
+   end
+   self.modules[1]:accGradParameters(input, gradOutput, scale)   
+end
diff --git a/GPU.lua b/GPU.lua
new file mode 100644
index 0000000..3150236
--- /dev/null
+++ b/GPU.lua
@@ -0,0 +1,273 @@
+------------------------------------------------------------------------
+--[[ GPU ]]--
+-- Decorates a module such that its parameters are
+-- hosted on a specified GPU device.
+-- The operations are also executed on that device.
+-- Arguments input and gradOutput are converted to the specified device 
+-- before being fed to the decorated module. 
+-- Returned output is on the specified outdevice (defaults to device). 
+-- Returned gradInput is allocated on the same device as the input.
+-- The unit test is located in cunn.
+------------------------------------------------------------------------
+local GPU, parent = torch.class("nn.GPU", "nn.Container")
+
+function GPU:__init(module, device, outdevice)
+   parent.__init(self)
+   assert(torch.type(device) == 'number')
+   self.device = device
+   self.outdevice = outdevice or device
+   
+   assert(torch.isTypeOf(module, 'nn.Module'))
+   self.modules[1] = module
+   
+   if module:type() == 'torch.CudaTensor' then
+      self:cuda()
+   end
+end
+
+function GPU.recursiveModuleDevice(obj, device)
+   if type(obj) == 'table' and not torch.isTypeOf(obj, 'nn.GPU') and not obj.__noGPU__ then
+      for k,v in pairs(obj) do
+         obj[k] = GPU.recursiveModuleDevice(v, device)
+      end
+   elseif torch.type(obj):match('torch.Cuda.*Tensor') then
+      if obj:getDevice() ~= device then
+         obj = obj:clone() -- this will reallocate it to device
+         local newdevice = obj:getDevice()
+         -- when nElement() == 0 newdevice is 0
+         assert(newdevice == device or newdevice == 0)
+      end
+   end
+   assert(obj ~= nil)
+   return obj
+end
+
+-- set the device of the decorated module
+function GPU:setDevice(device)
+   self.device = device or self.device
+   
+   assert(self.modules[1])
+   self.modules[1] = cutorch.withDevice(self.device, function() 
+      return self.recursiveModuleDevice(self.modules[1], self.device) 
+   end)
+   return self
+end
+
+-- when proto is a device number, returns a dst that has device device for each element in src
+-- otherwise, if proto is a table/tensor, makes sure dst is a identical to src, yet on the same device as proto
+function GPU.recursiveSetDevice(dst, src, proto)
+   local device, prototable
+   if torch.isTensor(proto) then
+      device = proto:getDevice()
+   elseif torch.type(proto) == 'number' then
+      device = proto
+   elseif torch.type(proto) == 'table' then
+      prototable = true
+   else
+      error"Expecting number, table or tensor for arg 3 (proto)"
+   end
+   if torch.type(src) == 'table' then
+      dst = torch.type(dst) == 'table' and dst or {}
+      for k,v in ipairs(src) do
+         dst[k] = GPU.recursiveSetDevice(dst[k], v, prototable and proto[k] or device)
+      end
+      for k=#src+1,#dst do
+         dst[k] = nil
+      end
+   elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
+      if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
+         dst = src.new()
+      end
+      cutorch.withDevice(device, function() dst:resizeAs(src):copy(src) end)
+   else
+      dst = src
+   end
+   return dst 
+end
+
+function GPU:updateOutput(input)
+   if self._type == 'torch.CudaTensor' then
+      self._input = self.recursiveSetDevice(self._input, input, self.device)
+      
+      local output = cutorch.withDevice(self.device, function()
+         return self.modules[1]:updateOutput(self._input)
+      end)
+      
+      if self.device ~= self.outdevice then
+         self.output = self.recursiveSetDevice(self.output, output, self.outdevice)
+      else
+         self.output = output
+      end
+   else
+      self.output = self.modules[1]:updateOutput(input)
+   end
+   
+   return self.output
+end
+
+function GPU:updateGradInput(input, gradOutput)
+   if self._type == 'torch.CudaTensor' then
+      self._gradOutput = self.recursiveSetDevice(self._gradOutput, gradOutput, self.device)
+      
+      local gradInput = cutorch.withDevice(self.device, function()
+         return self.modules[1]:updateGradInput(self._input, self._gradOutput)
+      end)
+      
+      self.gradInput = self.recursiveSetDevice(self.gradInput, gradInput, input)
+   else
+      self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+   end
+   
+   return self.gradInput
+end
+
+function GPU:accGradParameters(input, gradOutput, scale) 
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function()
+         self.modules[1]:accGradParameters(self._input, self._gradOutput, scale)
+      end)
+   else
+      self.modules[1]:accGradParameters(input, gradOutput, scale)
+   end
+end
+
+function GPU:apply(callback)
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.apply(self, callback) end)
+   else
+      parent.apply(self, callback)
+   end
+end
+
+function GPU:type(type, typecache)
+   if type and type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end)
+      self:setDevice()
+   else
+      self.output = nil
+      self.gradInput = nil
+      self._input = nil
+      self._gradOutput = nil
+      parent.type(self, type, typecache)
+   end
+   return self
+end
+
+function GPU:clearState()
+   nn.utils.clear(self, 'output', 'gradInput')
+   self._input = nil
+   self._gradOutput = nil
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.clearState(self) end)
+   else
+      parent.clearState(self)
+   end
+end
+
+function GPU:zeroGradParameters()
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.zeroGradParameters(self) end)
+   else
+      parent.zeroGradParameters(self)
+   end
+end
+
+function GPU:updateParameters(lr)
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.updateParameters(self, lr) end)
+   else
+      parent.updateParameters(self, lr)
+   end
+end
+
+function GPU:training()
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.training(self) end)
+   else
+      parent.training(self)
+   end
+end
+
+function GPU:evaluate()
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.evaluate(self) end)
+   else
+      parent.evaluate(self)
+   end
+end
+
+function GPU:share(mlp, ...)
+   local args = {...}
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.share(self, mlp, unpack(args)) end)
+   else
+      parent.share(self, mlp, unpack(args))
+   end
+   return self
+end
+
+function GPU:reset(...)
+   local args = {...}
+   if self._type == 'torch.CudaTensor' then
+      cutorch.withDevice(self.device, function() parent.reset(self, unpack(args)) end)
+   else
+      parent.reset(self, unpack(args))
+   end
+   return self
+end
+
+function GPU:clone(...)
+   local args = {...}
+   if self._type == 'torch.CudaTensor' then
+      return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end)
+   else
+      return parent.clone(self, unpack(args))
+   end
+end
+
+function GPU:write(file)
+   -- Write all values in the object as a table.
+   local object = {}
+   for k, v in pairs(self) do
+      object[k] = v
+   end
+   local header = {self._type, self.device}
+   file:writeObject(header)
+   file:writeObject(object)
+end
+
+function GPU:read(file)
+   local header = file:readObject()
+   local object
+   if header[1] == 'torch.CudaTensor' then
+      local device = header[2] 
+      if device > cutorch.getDeviceCount() then
+         print"Warning : model was saved with more devices than available on current host."
+         print"Attempting to load module onto device 1"
+         device = 1
+      end
+      object = cutorch.withDevice(device, function() return file:readObject() end)
+   else
+      object = file:readObject()
+   end
+   
+   for k, v in pairs(object) do
+      self[k] = v
+   end
+end
+
+function GPU:__tostring__()
+   if self.modules[1].__tostring__ then
+      return torch.type(self) .. '(' .. self.device ..') @ ' .. self.modules[1]:__tostring__()
+   else
+      return torch.type(self) .. '(' .. self.device ..') @ ' .. torch.type(self.modules[1])
+   end
+end
+
+function GPU:accUpdateGradParameters(input, gradOutput, lr)
+   error("Not Implemented for "..torch.type(self))
+end
+
+function GPU:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   error("Not Implemented for "..torch.type(self))
+end
diff --git a/HardTanh.lua b/HardTanh.lua
index d3449a1..07cfc62 100644
--- a/HardTanh.lua
+++ b/HardTanh.lua
@@ -1,9 +1,13 @@
 local HardTanh, parent = torch.class('nn.HardTanh', 'nn.Module')
 
-function HardTanh:__init(min_value, max_value)
+function HardTanh:__init(min_value, max_value, inplace)
    parent.__init(self)
    self.min_val = min_value or -1
    self.max_val = max_value or 1
+   self.inplace = inplace or false
+   if (inplace and type(inplace) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
    assert(self.max_val>self.min_val, 'max_value must be larger than min_value')
 end
 
@@ -14,7 +18,8 @@ function HardTanh:updateOutput(input)
       input:cdata(),
       self.output:cdata(),
       self.min_val,
-      self.max_val
+      self.max_val,
+      self.inplace or false
    )
    return self.output
 end
@@ -25,7 +30,8 @@ function HardTanh:updateGradInput(input, gradOutput)
       gradOutput:cdata(),
       self.gradInput:cdata(),
       self.min_val,
-      self.max_val
+      self.max_val,
+      self.inplace or false
    )
    return self.gradInput
 end
diff --git a/Index.lua b/Index.lua
index 8ae6063..6aa4297 100644
--- a/Index.lua
+++ b/Index.lua
@@ -3,7 +3,7 @@ local Index, parent = torch.class('nn.Index', 'nn.Module')
 function Index:__init(dimension)
     parent.__init(self)
     self.dimension = dimension
-    self.gradInput = {self.gradInput}
+    self.gradInput = {self.gradInput, self.gradInput.new()}
 end
 
 function Index:updateOutput(input)
@@ -17,9 +17,16 @@ function Index:updateGradInput(input, gradOutput)
     local t = input[1]
     local index = input[2]
 
+    self.gradInput[2]:resize(index:size()):zero()
     local gradInput = self.gradInput[1] -- no gradient for the index variable
     gradInput:resizeAs(t):zero()
     gradInput:indexAdd(self.dimension, index, gradOutput)
     return self.gradInput
 end
 
+function Index:clearState()
+    self.gradInput[1]:set()
+    self.gradInput[2]:set()
+    self.output:set()
+    return self
+end
diff --git a/LookupTable.lua b/LookupTable.lua
index 8ec2b34..8a60354 100644
--- a/LookupTable.lua
+++ b/LookupTable.lua
@@ -81,7 +81,7 @@ function LookupTable:updateOutput(input)
 end
 
 function LookupTable:updateGradInput(input, gradOutput)
-   -- the input can be of any type (as in the forward it's 
+   -- the input can be of any type (as in the forward it's
    -- converted anyway to LongTensor) thus, need to allocate
    -- new memory each time the user changes the input type
    if torch.type(self.gradInput) ~= torch.type(input) then
@@ -148,10 +148,10 @@ function LookupTable:type(type, tensorCache)
 
    if type == 'torch.CudaTensor' then
       -- CUDA uses _sorted and _indices temporary tensors
-      self._sorted = self.weight.new()
-      self._indices = self.weight.new()
-      self._count = self.weight.new()
-      self._input = self.weight.new()
+      self._sorted = torch.CudaLongTensor.new()
+      self._indices = torch.CudaLongTensor.new()
+      self._count = torch.CudaLongTensor.new()
+      self._input = torch.CudaLongTensor.new()
    else
       -- self._count and self._input should only be converted if using Cuda
       self._count = torch.IntTensor()
diff --git a/MarginRankingCriterion.lua b/MarginRankingCriterion.lua
index 2c1f4c2..844d905 100644
--- a/MarginRankingCriterion.lua
+++ b/MarginRankingCriterion.lua
@@ -3,14 +3,14 @@ local MarginRankingCriterion, parent = torch.class('nn.MarginRankingCriterion',
 function MarginRankingCriterion:__init(margin)
    parent.__init(self)
    margin=margin or 1
-   self.margin = margin 
+   self.margin = margin
    self.gradInput = {torch.Tensor(1), torch.Tensor(1)}
    self.sizeAverage = true
-end 
- 
-function MarginRankingCriterion:updateOutput(input,y)
-   if input[1]:size(1) == 1 then
-      self.output=math.max(0, -y*(input[1][1]-input[2][1]) + self.margin  ) 
+end
+
+function MarginRankingCriterion:updateOutput(input, y)
+    if torch.type(y) == 'number' then -- non-batch mode
+      self.output = math.max(0, -y * (input[1][1] - input[2][1]) + self.margin)
    else
       self._output = self._output or input[1]:clone()
       self._output:resizeAs(input[1])
@@ -33,14 +33,14 @@ function MarginRankingCriterion:updateOutput(input,y)
 end
 
 function MarginRankingCriterion:updateGradInput(input, y)
-   if input[1]:size(1) == 1 then
-      local dist = -y*(input[1][1]-input[2][1]) + self.margin
+    if torch.type(y) == 'number' then -- non-batch mode
+      local dist = -y * (input[1][1] - input[2][1]) + self.margin
       if dist < 0 then
-         self.gradInput[1][1]=0;
-         self.gradInput[2][1]=0;
-      else	
-         self.gradInput[1][1]=-y
-         self.gradInput[2][1]=y
+         self.gradInput[1][1] = 0;
+         self.gradInput[2][1] = 0;
+      else
+         self.gradInput[1][1] = -y
+         self.gradInput[2][1] = y
       end
    else
       self.dist = self.dist or input[1].new()
@@ -71,5 +71,5 @@ function MarginRankingCriterion:updateGradInput(input, y)
       end
 
    end
-   return self.gradInput 
+   return self.gradInput
 end
diff --git a/Narrow.lua b/Narrow.lua
index 07322d8..0754d45 100644
--- a/Narrow.lua
+++ b/Narrow.lua
@@ -11,23 +11,25 @@ function Narrow:__init(dimension,offset,length)
 end
 
 function Narrow:updateOutput(input)
+   local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
    local length = self.length
    if length < 0 then
-      length = input:size(self.dimension) - self.index + self.length + 2
+      length = input:size(dim) - self.index + self.length + 2
    end
-   local output=input:narrow(self.dimension,self.index,length)
+   local output=input:narrow(dim,self.index,length)
    self.output = self.output:typeAs(output)
    self.output:resizeAs(output):copy(output)
    return self.output
 end
 
 function Narrow:updateGradInput(input, gradOutput)
+   local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
    local length = self.length
    if length < 0 then
-      length = input:size(self.dimension) - self.index + self.length + 2
+      length = input:size(dim) - self.index + self.length + 2
    end
    self.gradInput = self.gradInput:typeAs(input)
    self.gradInput:resizeAs(input):zero()
-   self.gradInput:narrow(self.dimension,self.index,length):copy(gradOutput)
+   self.gradInput:narrow(dim,self.index,length):copy(gradOutput)
    return self.gradInput
 end
diff --git a/ReLU6.lua b/ReLU6.lua
new file mode 100644
index 0000000..be8985b
--- /dev/null
+++ b/ReLU6.lua
@@ -0,0 +1,32 @@
+local ReLU6, parent = torch.class('nn.ReLU6', 'nn.Module')
+
+function ReLU6:__init(inplace)
+   parent.__init(self)
+   
+   if inplace == nil then
+      self.inplace = false
+   else
+      self.inplace = inplace
+   end
+
+   if (inplace and type(inplace) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
+end
+
+function ReLU6:updateOutput(input)
+   input.THNN.HardTanh_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      0, 6, self.inplace)
+   return self.output
+end
+
+function ReLU6:updateGradInput(input, gradOutput)
+   input.THNN.HardTanh_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      0, 6, self.inplace)
+   return self.gradInput
+end
diff --git a/Select.lua b/Select.lua
index fccdf32..6dc5a04 100644
--- a/Select.lua
+++ b/Select.lua
@@ -7,16 +7,18 @@ function Select:__init(dimension,index)
 end
 
 function Select:updateOutput(input)
-   local index = self.index < 0 and input:size(self.dimension) + self.index + 1 or self.index
-   local output = input:select(self.dimension, index);
+   local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+   local index = self.index < 0 and input:size(dim) + self.index + 1 or self.index
+   local output = input:select(dim, index);
    self.output:resizeAs(output)
    return self.output:copy(output)
 end
 
 function Select:updateGradInput(input, gradOutput)
-   local index = self.index < 0 and input:size(self.dimension) + self.index + 1 or self.index
+   local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+   local index = self.index < 0 and input:size(dim) + self.index + 1 or self.index
    self.gradInput:resizeAs(input)  
    self.gradInput:zero()
-   self.gradInput:select(self.dimension,index):copy(gradOutput) 
+   self.gradInput:select(dim,index):copy(gradOutput) 
    return self.gradInput
 end 
diff --git a/SpatialDilatedConvolution.lua b/SpatialDilatedConvolution.lua
index 8611ee9..0ae914e 100644
--- a/SpatialDilatedConvolution.lua
+++ b/SpatialDilatedConvolution.lua
@@ -1,11 +1,11 @@
 local THNN = require 'nn.THNN'
 local SpatialDilatedConvolution, parent = torch.class('nn.SpatialDilatedConvolution', 'nn.SpatialConvolution')
 
-function SpatialDilatedConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, dilationH, dilationW)
+function SpatialDilatedConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, dilationW, dilationH)
    parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
 
-   self.dilationH = dilationH or 1
    self.dilationW = dilationW or 1
+   self.dilationH = dilationH or 1
 end
 
 local function makeContiguous(self, input, gradOutput)
@@ -38,7 +38,7 @@ function SpatialDilatedConvolution:updateOutput(input)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
-      self.dilationH, self.dilationW
+      self.dilationW, self.dilationH
    )
    return self.output
 end
@@ -56,7 +56,7 @@ function SpatialDilatedConvolution:updateGradInput(input, gradOutput)
          self.kW, self.kH,
          self.dW, self.dH,
          self.padW, self.padH,
-         self.dilationH, self.dilationW
+         self.dilationW, self.dilationH
       )
       return self.gradInput
    end
@@ -76,7 +76,7 @@ function SpatialDilatedConvolution:accGradParameters(input, gradOutput, scale)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
-      self.dilationH, self.dilationW,
+      self.dilationW, self.dilationH,
       scale
    )
 end
diff --git a/SpatialMaxPooling.lua b/SpatialDilatedMaxPooling.lua
similarity index 53%
copy from SpatialMaxPooling.lua
copy to SpatialDilatedMaxPooling.lua
index 8475b13..929459c 100644
--- a/SpatialMaxPooling.lua
+++ b/SpatialDilatedMaxPooling.lua
@@ -1,44 +1,20 @@
-local SpatialMaxPooling, parent = torch.class('nn.SpatialMaxPooling', 'nn.Module')
+local THNN = require 'nn.THNN'
+local SpatialDilatedMaxPooling, parent = torch.class('nn.SpatialDilatedMaxPooling', 'nn.SpatialMaxPooling')
 
-function SpatialMaxPooling:__init(kW, kH, dW, dH, padW, padH)
-   parent.__init(self)
+function SpatialDilatedMaxPooling:__init(kW, kH, dW, dH, padW, padH, dilationW, dilationH)
+   parent.__init(self, kW, kH, dW, dH, padW, padH)
 
-   dW = dW or kW
-   dH = dH or kH
-
-   self.kW = kW
-   self.kH = kH
-   self.dW = dW
-   self.dH = dH
-
-   self.padW = padW or 0
-   self.padH = padH or 0
-
-   self.ceil_mode = false
-   self.indices = torch.Tensor()
-end
-
-function SpatialMaxPooling:ceil()
-  self.ceil_mode = true
-  return self
+   self.dilationW = dilationW or 1
+   self.dilationH = dilationH or 1
 end
 
-function SpatialMaxPooling:floor()
-  self.ceil_mode = false
-  return self
-end
-
-function SpatialMaxPooling:updateOutput(input)
+function SpatialDilatedMaxPooling:updateOutput(input)
    self.indices = self.indices or input.new()
 
    local dims = input:dim()
    self.iheight = input:size(dims-1)
    self.iwidth = input:size(dims)
 
-   -- backward compatibility
-   self.ceil_mode = self.ceil_mode or false
-   self.padW = self.padW or 0
-   self.padH = self.padH or 0
    input.THNN.SpatialMaxPooling_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -46,12 +22,13 @@ function SpatialMaxPooling:updateOutput(input)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
+      self.dilationW, self.dilationH,
       self.ceil_mode
    )
    return self.output
 end
 
-function SpatialMaxPooling:updateGradInput(input, gradOutput)
+function SpatialDilatedMaxPooling:updateGradInput(input, gradOutput)
    input.THNN.SpatialMaxPooling_updateGradInput(
       input:cdata(),
       gradOutput:cdata(),
@@ -60,28 +37,24 @@ function SpatialMaxPooling:updateGradInput(input, gradOutput)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
+      self.dilationW, self.dilationH,
       self.ceil_mode
    )
    return self.gradInput
 end
 
--- for backward compat
-function SpatialMaxPooling:empty()
-   self:clearState()
-end
-
-function SpatialMaxPooling:__tostring__()
+function SpatialDilatedMaxPooling:__tostring__()
    local s =  string.format('%s(%dx%d, %d,%d', torch.type(self),
                             self.kW, self.kH, self.dW, self.dH)
    if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
       s = s .. ', ' .. self.padW .. ','.. self.padH
    end
+   s = s .. ', ' .. self.dilationW .. ',' .. self.dilationH
    s = s .. ')'
-
    return s
 end
 
-function SpatialMaxPooling:clearState()
+function SpatialDilatedMaxPooling:clearState()
    if self.indices then
       self.indices:set()
    end
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index 40fcd3d..a234769 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -1,3 +1,4 @@
+local THNN = require 'nn.THNN'
 local SpatialFullConvolution, parent = torch.class('nn.SpatialFullConvolution','nn.Module')
 
 function SpatialFullConvolution:__init(nInputPlane, nOutputPlane,
@@ -33,6 +34,12 @@ function SpatialFullConvolution:__init(nInputPlane, nOutputPlane,
    self:reset()
 end
 
+function SpatialFullConvolution:noBias()
+	self.bias = nil
+	self.gradBias = nil
+	return self
+end
+
 function SpatialFullConvolution:reset(stdv)
    if stdv then
       stdv = stdv * math.sqrt(3)
@@ -43,7 +50,9 @@ function SpatialFullConvolution:reset(stdv)
       stdv = 1/math.sqrt(kW*kH*nInputPlane)
    end
    self.weight:uniform(-stdv, stdv)
-   self.bias:uniform(-stdv, stdv)
+   if self.bias then
+      self.bias:uniform(-stdv, stdv)
+   end
 end
 
 local function makeContiguous(self, input, gradOutput)
@@ -99,7 +108,7 @@ function SpatialFullConvolution:updateOutput(input)
     inputTensor:cdata(),
     self.output:cdata(),
     self.weight:cdata(),
-    self.bias:cdata(),
+    THNN.optionalTensor(self.bias),
     self.finput:cdata(),
     self.fgradInput:cdata(),
     self.kW, self.kH,
@@ -131,7 +140,7 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
       adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
       -- Momentarily extract the gradInput tensor
       if type(self.gradInput) == 'table' then
-        self.gradInput = self.gradInput[1]
+        self.gradInput = self.gradInput[1] or inputTensor.new()
       end
     end
 
@@ -186,7 +195,7 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
     inputTensor:cdata(),
     gradOutput:cdata(),
     self.gradWeight:cdata(),
-    self.gradBias:cdata(),
+    THNN.optionalTensor(self.gradBias),
     self.finput:cdata(),
     self.fgradInput:cdata(),
     self.kW, self.kH,
@@ -215,7 +224,11 @@ function SpatialFullConvolution:__tostring__()
   if (self.adjW or self.adjH) and (self.adjW ~= 0 or self.adjH ~= 0) then
     s = s .. ', ' .. self.adjW .. ',' .. self.adjH
   end
-  return s .. ')'
+  if self.bias then
+     return s .. ')'
+  else
+     return s .. ') without bias'
+ end
 end
 
 function SpatialFullConvolution:clearState()
diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua
index 8475b13..c05a876 100644
--- a/SpatialMaxPooling.lua
+++ b/SpatialMaxPooling.lua
@@ -46,6 +46,7 @@ function SpatialMaxPooling:updateOutput(input)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
+      1, 1,
       self.ceil_mode
    )
    return self.output
@@ -60,6 +61,7 @@ function SpatialMaxPooling:updateGradInput(input, gradOutput)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
+      1, 1,
       self.ceil_mode
    )
    return self.gradInput
diff --git a/SpatialUpSamplingBilinear.lua b/SpatialUpSamplingBilinear.lua
new file mode 100644
index 0000000..d911eae
--- /dev/null
+++ b/SpatialUpSamplingBilinear.lua
@@ -0,0 +1,111 @@
+require 'nn.THNN'
+local SpatialUpSamplingBilinear, parent =
+   torch.class('nn.SpatialUpSamplingBilinear', 'nn.Module')
+
+--[[
+Applies a 2D bilinear up-sampling over an input image composed of several
+input planes.
+
+The Y and X dimensions are assumed to be the last 2 tensor dimensions.  For
+instance, if the tensor is 4D, then dim 3 is the y dimension and dim 4 is the x.
+scale_factor is assumed to be a positive integer.
+
+owidth  = (width-1)*(scale_factor-1) + width
+oheight  = (height-1)*(scale_factor-1) + height
+--]]
+
+function SpatialUpSamplingBilinear:__init(scale_factor)
+   parent.__init(self)
+
+   self.scale_factor = scale_factor
+   if self.scale_factor < 1 then
+     error('scale_factor must be greater than 1')
+   end
+   if math.floor(self.scale_factor) ~= self.scale_factor then
+     error('scale_factor must be integer')
+   end
+   self.inputSize = torch.LongStorage(4)
+   self.outputSize = torch.LongStorage(4)
+end
+
+local function makeContiguous(self, input, gradOutput)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input):copy(input)
+      input = self._input
+   end
+   if gradOutput then
+      if not gradOutput:isContiguous() then
+         self._gradOutput = self._gradOutput or gradOutput.new()
+         self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+         gradOutput = self._gradOutput
+      end
+   end
+   return input, gradOutput
+end
+
+function SpatialUpSamplingBilinear:updateOutput(input)
+   assert(input:dim() == 4 or input:dim()==3,
+            'SpatialUpSamplingBilinear only support 3D or 4D tensors' )
+   local inputwas3D = false
+   if input:dim() == 3 then
+      input=input:view(-1, input:size(1), input:size(2), input:size(3))
+      inputwas3D = true
+   end
+   input = makeContiguous(self, input)
+   assert(input:dim() == 4)
+   -- Copy the input size
+   local xdim = input:dim()
+   local ydim = input:dim() - 1
+   for i = 1, input:dim() do
+     self.inputSize[i] = input:size(i)
+     self.outputSize[i] = input:size(i)
+   end
+   self.outputSize[ydim] = (self.outputSize[ydim]-1) * (self.scale_factor-1)
+                           + self.outputSize[ydim]
+   self.outputSize[xdim] = (self.outputSize[xdim]-1) * (self.scale_factor -1)
+                           + self.outputSize[xdim]
+   -- Resize the output if needed
+   self.output:resize(self.outputSize)
+   input.THNN.SpatialUpSamplingBilinear_updateOutput(
+      input:cdata(),
+      self.output:cdata()
+   )
+   if inputwas3D then
+      input = input:squeeze(1)
+      self.output = self.output:squeeze(1)
+   end
+   return self.output
+end
+
+function SpatialUpSamplingBilinear:updateGradInput(input, gradOutput)
+   assert(input:dim() == 4 or input:dim()==3,
+            'SpatialUpSamplingBilinear only support 3D or 4D tensors' )
+   assert(input:dim() == gradOutput:dim(),
+            'Input and gradOutput should be of same dimension' )
+   local inputwas3D = false
+   if input:dim() == 3 then
+      input=input:view(-1, input:size(1), input:size(2), input:size(3))
+      gradOutput=gradOutput:view(-1, gradOutput:size(1), gradOutput:size(2),
+                                 gradOutput:size(3))
+      inputwas3D = true
+   end
+   assert(input:dim() == 4 and gradOutput:dim() == 4)
+   self.gradInput:resizeAs(input)
+   input.THNN.SpatialUpSamplingBilinear_updateGradInput(
+      gradOutput:cdata(),
+      self.gradInput:cdata()
+   )
+   if inputwas3D then
+      input = input:squeeze(1)
+      gradOutput = gradOutput:squeeze(1)
+      self.gradInput = self.gradInput:squeeze(1)
+   end
+   return self.gradInput
+end
+
+
+function SpatialUpSamplingBilinear:__tostring__()
+   local s = string.format('%s(%d)', torch.type(self), self.scale_factor)
+   return s
+end
diff --git a/VolumetricDilatedConvolution.lua b/VolumetricDilatedConvolution.lua
new file mode 100644
index 0000000..fc7f037
--- /dev/null
+++ b/VolumetricDilatedConvolution.lua
@@ -0,0 +1,103 @@
+local THNN = require 'nn.THNN'
+local VolumetricDilatedConvolution, parent = torch.class('nn.VolumetricDilatedConvolution', 'nn.VolumetricConvolution')
+
+function VolumetricDilatedConvolution:__init(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH)
+   parent.__init(self, nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+
+   self.dilationT = dilationT or 1
+   self.dilationW = dilationW or 1
+   self.dilationH = dilationH or 1
+end
+
+local function makeContiguous(self, input, gradOutput)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resizeAs(input):copy(input)
+      input = self._input
+   end
+   if gradOutput then
+      if not gradOutput:isContiguous() then
+	 self._gradOutput = self._gradOutput or gradOutput.new()
+	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+	 gradOutput = self._gradOutput
+      end
+   end
+   return input, gradOutput
+end
+
+function VolumetricDilatedConvolution:updateOutput(input)
+   self.finput = self.finput or self.weight.new()
+   self.fgradInput = self.fgradInput or self.weight.new()
+   input = makeContiguous(self, input)
+   input.THNN.VolumetricDilatedConvolution_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.weight:cdata(),
+      THNN.optionalTensor(self.bias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kT, self.kW, self.kH,
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH
+   )
+   return self.output
+end
+
+function VolumetricDilatedConvolution:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      input, gradOutput = makeContiguous(self, input, gradOutput)
+      self.fgradInput = self.fgradInput or self.weight.new()
+      input.THNN.VolumetricDilatedConvolution_updateGradInput(
+         input:cdata(),
+         gradOutput:cdata(),
+         self.gradInput:cdata(),
+         self.weight:cdata(),
+         self.finput:cdata(),
+         self.kT, self.kW, self.kH,
+         self.dT, self.dW, self.dH,
+         self.padT, self.padW, self.padH,
+         self.dilationT, self.dilationW, self.dilationH
+      )
+      return self.gradInput
+   end
+end
+
+function VolumetricDilatedConvolution:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   input, gradOutput = makeContiguous(self, input, gradOutput)
+   self.fgradInput = self.fgradInput or self.weight.new()
+   input.THNN.VolumetricDilatedConvolution_accGradParameters(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradWeight:cdata(),
+      THNN.optionalTensor(self.gradBias),
+      self.finput:cdata(),
+      self.fgradInput:cdata(),
+      self.kT, self.kW, self.kH,
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH,
+      scale
+   )
+end
+
+function VolumetricDilatedConvolution:__tostring__()
+   local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
+         self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
+   if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1
+   or self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
+     s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
+   end
+   if (self.padT or self.padW or self.padH)
+   and (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+     s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
+   end
+   s = s .. ', ' .. self.dilationT .. ','
+       .. self.dilationW .. ',' .. self.dilationH
+   if self.bias then
+      return s .. ')'
+   else
+      return s .. ') without bias'
+   end
+end
diff --git a/VolumetricReplicationPadding.lua b/VolumetricReplicationPadding.lua
new file mode 100644
index 0000000..31a9503
--- /dev/null
+++ b/VolumetricReplicationPadding.lua
@@ -0,0 +1,58 @@
+local VolumetricReplicationPadding, parent =
+   torch.class('nn.VolumetricReplicationPadding', 'nn.Module')
+
+function VolumetricReplicationPadding:__init(pleft, pright, ptop, pbottom,
+                                             pfront, pback)
+   parent.__init(self)
+   self.pleft = pleft
+   self.pright = pright or self.pleft
+   self.ptop = ptop or self.pleft
+   self.pbottom = pbottom or self.pleft
+   self.pfront = pfront or self.pleft
+   self.pback = pback or self.pleft
+end
+
+function VolumetricReplicationPadding:updateOutput(input)
+   if input:dim() == 4 or input:dim() == 5 then
+      input.THNN.VolumetricReplicationPadding_updateOutput(
+         input:cdata(), self.output:cdata(),
+         self.pleft, self.pright, self.ptop, self.pbottom, self.pfront,
+         self.pback)
+   else
+      error('input must be 4 or 5-dimensional')
+   end
+   return self.output
+end
+
+function VolumetricReplicationPadding:updateGradInput(input, gradOutput)
+   if input:dim() == 4 and gradOutput:dim() == 4 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) + self.pfront + self.pback == gradOutput:size(2)
+             and input:size(3) + self.ptop + self.pbottom == gradOutput:size(3)
+             and input:size(4) + self.pleft + self.pright == gradOutput:size(4),
+             'input and gradOutput must be compatible in size')
+   elseif input:dim() == 5 and gradOutput:dim() == 5 then
+      assert(input:size(1) == gradOutput:size(1)
+             and input:size(2) == gradOutput:size(2)
+             and input:size(3) + self.pfront + self.pback == gradOutput:size(3)
+             and input:size(4) + self.ptop + self.pbottom == gradOutput:size(4)
+             and input:size(5) + self.pleft + self.pright == gradOutput:size(5),
+             'input and gradOutput must be compatible in size')
+   else
+      error(
+         [[input and gradOutput must be 4 or 5-dimensional
+         and have equal number of dimensions]]
+         )
+   end
+   input.THNN.VolumetricReplicationPadding_updateGradInput(
+      input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+      self.pleft, self.pright, self.ptop, self.pbottom, self.pfront, self.pback)
+   return self.gradInput
+end
+
+function VolumetricReplicationPadding:__tostring__()
+   return torch.type(self) ..
+   string.format('(left=%d, right=%d, top=%d, bottom=%d, front=%d, back=%d)',
+                 self.pleft, self.pright, self.ptop, self.pbottom,
+                 self.pfront, self.pback)
+end
diff --git a/doc/containers.md b/doc/containers.md
index 9a83607..44060e8 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -7,6 +7,7 @@ Complex neural networks are easily built using container classes:
     * [Parallel](#nn.Parallel) : applies its `ith` child module to the  `ith` slice of the input Tensor ;
     * [Concat](#nn.Concat) : concatenates in one layer several modules along dimension `dim` ;
       * [DepthConcat](#nn.DepthConcat) : like Concat, but adds zero-padding when non-`dim` sizes don't match;
+    * [Bottle](#nn.Bottle) : allows any dimensionality input to be forwarded through a module ;
  
 See also the [Table Containers](#nn.TableContainers) for manipulating tables of [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md).
 
@@ -274,6 +275,37 @@ module output tensors non-`dim` sizes aren't all odd or even.
 Such that in order to keep the mappings aligned, one need 
 only ensure that these be all odd (or even).
 
+<a name="nn.Bottle"></a>
+## Bottle
+
+
+```lua
+module = nn.Bottle(module, [nInputDim], [nOutputDim])
+```
+Bottle allows varying dimensionality input to be forwarded through any module that accepts input of `nInputDim` dimensions, and generates output of `nOutputDim` dimensions.
+
+Bottle can be used to forward a 4D input of varying sizes through a 2D module `b x n`. The module `Bottle(module, 2)` will accept input of shape `p x q x r x n` and outputs with the shape `p x q x r x m`. Internally Bottle will view the input of `module` as `p*q*r x n`, and view the output as `p x q x r x m`. The numbers `p x q x r` are inferred from the input and can change for every forward/backward pass.
+
+```lua
+input=torch.Tensor(4, 5, 3, 10)
+mlp=nn.Bottle(nn.Linear(10, 2))
+print(input:size())
+print(mlp:forward(input):size())
+```
+which gives the output:
+```lua
+  4
+  5
+  3
+ 10
+[torch.LongStorage of size 4]
+ 4
+ 5
+ 3
+ 2
+[torch.LongStorage of size 4]
+```
+
 <a name="nn.TableContainers"></a>
 ## Table Containers ##
 While the above containers are used for manipulating input [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md), table containers are used for manipulating tables :
diff --git a/doc/convolution.md b/doc/convolution.md
index 4e2bb6f..96d92d9 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -16,26 +16,30 @@ A convolution is an integral that expresses the amount of overlap of one functio
     * [SpatialConvolutionLocal](#nn.SpatialConvolutionLocal) : a 2D locally-connected layer over an input image ;
     * [SpatialSubSampling](#nn.SpatialSubSampling) : a 2D sub-sampling over an input image ;
     * [SpatialMaxPooling](#nn.SpatialMaxPooling) : a 2D max-pooling operation over an input image ;
+    * [SpatialDilatedMaxPooling](#nn.SpatialDilatedMaxPooling) : a 2D dilated max-pooling operation over an input image ;
     * [SpatialFractionalMaxPooling](#nn.SpatialFractionalMaxPooling) : a 2D fractional max-pooling operation over an input image ;
     * [SpatialAveragePooling](#nn.SpatialAveragePooling) : a 2D average-pooling operation over an input image ;
     * [SpatialAdaptiveMaxPooling](#nn.SpatialAdaptiveMaxPooling) : a 2D max-pooling operation which adapts its parameters dynamically such that the output is of fixed size ;
     * [SpatialMaxUnpooling](#nn.SpatialMaxUnpooling) : a 2D max-unpooling operation ;
     * [SpatialLPPooling](#nn.SpatialLPPooling) : computes the `p` norm in a convolutional manner on a set of input images ;
     * [SpatialConvolutionMap](#nn.SpatialConvolutionMap) : a 2D convolution that uses a generic connection table ;
-    * [SpatialZeroPadding](#nn.SpatialZeroPadding) : padds a feature map with specified number of zeros ;
-    * [SpatialReflectionPadding](#nn.SpatialReflectionPadding) : padds a feature map with the reflection of the input ;
-    * [SpatialReplicationPadding](#nn.SpatialReplicationPadding) : padds a feature map with the value at the edge of the input borders ;
+    * [SpatialZeroPadding](#nn.SpatialZeroPadding) : pads a feature map with specified number of zeros ;
+    * [SpatialReflectionPadding](#nn.SpatialReflectionPadding) : pads a feature map with the reflection of the input ;
+    * [SpatialReplicationPadding](#nn.SpatialReplicationPadding) : pads a feature map with the value at the edge of the input borders ;
     * [SpatialSubtractiveNormalization](#nn.SpatialSubtractiveNormalization) : a spatial subtraction operation on a series of 2D inputs using
     * [SpatialCrossMapLRN](#nn.SpatialCrossMapLRN) : a spatial local response normalization between feature maps ;
     * [SpatialBatchNormalization](#nn.SpatialBatchNormalization): mean/std normalization over the mini-batch inputs and pixels, with an optional affine transform that follows
 a kernel for computing the weighted average in a neighborhood ;
-    * [SpatialUpsamplingNearest](#nn.SpatialUpSamplingNearest): A simple upsampler applied to every channel of the feature map.
+    * [SpatialUpsamplingNearest](#nn.SpatialUpSamplingNearest): A simple nearest neighbor upsampler applied to every channel of the feature map.
+    * [SpatialUpsamplingBilinear](#nn.SpatialUpSamplingBilinear): A simple bilinear upsampler applied to every channel of the feature map.
   * [Volumetric Modules](#nn.VolumetricModules) apply to inputs with three-dimensional relationships (e.g. videos) :
     * [VolumetricConvolution](#nn.VolumetricConvolution) : a 3D convolution over an input video (a sequence of images) ;
     * [VolumetricFullConvolution](#nn.VolumetricFullConvolution) : a 3D full convolution over an input video (a sequence of images) ;
+    * [VolumetricDilatedConvolution](#nn.VolumetricDilatedConvolution) : a 3D dilated convolution over an input video (a sequence of images) ;
     * [VolumetricMaxPooling](#nn.VolumetricMaxPooling) : a 3D max-pooling operation over an input video.
     * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
-    * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation ;
+    * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation.
+    * [VolumetricReplicationPadding](#nn.VolumetricReplicationPadding) : pads a volumetric feature map with the value at the edge of the input borders ;
 
 
 <a name="nn.TemporalModules"></a>
@@ -210,10 +214,10 @@ is the size of a 1D `input` tensor.
 Again with a 1D input, when only `size1` is provided, the `forward(input)` is equivalent to
 performing the following matrix-matrix multiplication in an efficient manner:
 ```lua
-M P
+P M
 ```
-where `M` is a 2D matrix `size x nIndex` containing the parameters of the lookup-table and
-`P` is a 2D matrix, where each column vector `i` is a zero vector except at index `input[i]` where it is `1`.
+where `M` is a 2D matrix of size `nIndex x size1` containing the parameters of the lookup-table and
+`P` is a 2D matrix of size `n x nIndex`, where for each `i`th row vector, every element is zero except the one at index `input[i]` where it is `1`.
 
 1D example:
 ```lua
@@ -422,7 +426,7 @@ module = nn.SpatialFullConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH]
 Applies a 2D full convolution over an input image composed of several input planes. The `input` tensor in
 `forward(input)` is expected to be a 3D or 4D tensor. Note that instead of setting `adjW` and `adjH`, SpatialFullConvolution also accepts a table input with two tensors: `{convInput, sizeTensor}` where `convInput` is the standard input on which the full convolution
 is applied, and the size of `sizeTensor` is used to set the size of the output. Using the two-input version of forward
-will ignore the `adjW` and `adjH` values used to construct the module.
+will ignore the `adjW` and `adjH` values used to construct the module. The layer can be used without a bias by module:noBias().
 
 Other frameworks call this operation "In-network Upsampling", "Fractionally-strided convolution", "Backwards Convolution," "Deconvolution", or "Upconvolution."
 
@@ -454,6 +458,7 @@ Further information about the full convolution can be found in the following pap
 module = nn.SpatialDilatedConvolution(nInputPlane, nOutputPlane, kW, kH, [dW], [dH], [padW], [padH], [dilationW], [dilationH])
 ```
 
+Also sometimes referred to as **atrous convolution**.
 Applies a 2D dilated convolution over an input image composed of several input planes. The `input` tensor in
 `forward(input)` is expected to be a 3D or 4D tensor.
 
@@ -472,8 +477,8 @@ The parameters are the following:
 If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
 will be `nOutputPlane x oheight x owidth` where
 ```lua
-owidth  = width + 2 * padW - dilationW * (kW-1) + 1 / dW + 1
-oheight = height + 2 * padH - dilationH * (kH-1) + 1 / dH + 1
+owidth  = floor((width + 2 * padW - dilationW * (kW-1) - 1) / dW) + 1
+oheight = floor((height + 2 * padH - dilationH * (kH-1) - 1) / dH) + 1
 ```
 
 Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
@@ -540,6 +545,29 @@ oheight = op((height + 2*padH - kH) / dH + 1)
 `op` is a rounding operator. By default, it is `floor`. It can be changed
 by calling `:ceil()` or `:floor()` methods.
 
+<a name="nn.SpatialDilatedMaxPooling"></a>
+### SpatialDilatedMaxPooling ###
+
+```lua
+module = nn.SpatialDilatedMaxPooling(kW, kH [, dW, dH, padW, padH, dilationW, dilationH])
+```
+
+Also sometimes referred to as **atrous pooling**.
+Applies 2D dilated max-pooling operation in `kWxkH` regions by step size
+`dWxdH` steps. The number of output features is equal to the number of
+input planes. If `dilationW` and `dilationH` are not provided, this is equivalent to performing normal `nn.SpatialMaxPooling`.
+
+If the input image is a 3D tensor `nInputPlane x height x width`, the output
+image size will be `nOutputPlane x oheight x owidth` where
+
+```lua
+owidth  = op((width - (dilationW * (kW - 1) + 1) + 2*padW) / dW + 1)
+oheight = op((height - (dilationH * (kH - 1) + 1) + 2*padH) / dH + 1)
+```
+
+`op` is a rounding operator. By default, it is `floor`. It can be changed
+by calling `:ceil()` or `:floor()` methods.
+
 <a name="nn.SpatialFractionalMaxPooling"></a>
 ### SpatialFractionalMaxPooling ###
 
@@ -716,6 +744,27 @@ output(u,v) = input(floor((u-1)/scale)+1, floor((v-1)/scale)+1)
 
 Where `u` and `v` are index from 1 (as per lua convention).  There are no learnable parameters.
 
+<a name="nn.SpatialUpSamplingBilinear"></a>
+### SpatialUpSamplingBilinear ###
+
+```lua
+module = nn.SpatialUpSamplingBilinear(scale)
+```
+
+Applies a 2D up-sampling over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 3D or 4D tensor (i.e. for 4D: `nBatchPlane x nInputPlane x height x width`). The number of output planes will be the same. The v dimension is assumed to be the second last dimension (i.e. for 4D it will be the 3rd dim), and the u dimension is assumed to be the last dimension.
+
+The parameters are the following:
+  * `scale`: The upscale ratio.  Must be a positive integer
+
+The up-scaling method is bilinear, and given an input of height iH and width iW, output height and width will be:
+```lua
+oH = (iH - 1)(scale - 1) + iH
+oW = (iW - 1)(scale - 1) + iW
+```
+
+There are no learnable parameters.
+
 <a name="nn.SpatialZeroPadding"></a>
 ### SpatialZeroPadding ###
 
@@ -880,7 +929,7 @@ columns or rows of the input image might be lost. It is up to the user to
 add proper padding in images.
 
 If the input image is a 4D tensor `nInputPlane x time x height x width`, the output image size
-will be `nOutputPlane x otime x owidth x oheight` where
+will be `nOutputPlane x otime x oheight x owidth` where
 ```lua
 otime  = floor((time  + 2*padT - kT) / dT + 1)
 owidth  = floor((width  + 2*padW - kW) / dW + 1)
@@ -903,6 +952,8 @@ Applies a 3D full convolution over an input image composed of several input plan
 `forward(input)` is expected to be a 4D or 5D tensor. Note that instead of setting `adjT`, `adjW` and `adjH`, VolumetricFullConvolution also accepts a table input with two tensors: `{convInput, sizeTensor}` where `convInput` is the standard input on which the full convolution is applied, and the size of `sizeTensor` is used to set the size of the output. Using the two-input version of forward
 will ignore the `adjT`, `adjW` and `adjH` values used to construct the module.
 
+This can be used as 3D deconvolution, or 3D upsampling, so that a 3D FCN can be easily implemented.
+
 The parameters are the following:
 * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
 * `nOutputPlane`: The number of output planes the convolution layer will produce.
@@ -924,6 +975,42 @@ owidth  = (width  - 1) * dW - 2*padW + kW
 oheight = (height - 1) * dH - 2*padH + kH
 ```
 
+<a name="nn.VolumetricDilatedConvolution"></a>
+### VolumetricDilatedConvolution ###
+
+```lua
+module = nn.VolumetricDilatedConvolution(nInputPlane, nOutputPlane, kT, kW, kH, [dT], [dW], [dH], [padT], [padW], [padH], [dilationT], [dilationW], [dilationH])
+```
+
+Applies a 3D dilated convolution over an input image composed of several input planes. The `input` tensor in
+`forward(input)` is expected to be a 4D or 5D tensor.
+
+The parameters are the following:
+  * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
+  * `nOutputPlane`: The number of output planes the convolution layer will produce.
+  * `kT`: The kernel depth of the convolution
+  * `kW`: The kernel width of the convolution
+  * `kH`: The kernel height of the convolution
+  * `dT`: The step of the convolution in the depth dimension. Default is `1`.
+  * `dW`: The step of the convolution in the width dimension. Default is `1`.
+  * `dH`: The step of the convolution in the height dimension. Default is `1`.
+  * `padT`: The additional zeros added per depth to the input planes. Default is `0`, a good number is `(kT-1)/2`.
+  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
+  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `dilationT`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
+  * `dilationW`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
+  * `dilationH`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
+
+If the input image is a 4D tensor `nInputPlane x depth x height x width`, the output image size
+will be `nOutputPlane x odepth x oheight x owidth` where
+```lua
+odepth  = floor((depth + 2 * padT - dilationT * (kT-1) - 1) / dT) + 1
+owidth  = floor((width + 2 * padW - dilationW * (kW-1) - 1) / dW) + 1
+oheight = floor((height + 2 * padH - dilationH * (kH-1) - 1) / dH) + 1
+```
+
+Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
+
 <a name="nn.VolumetricMaxPooling"></a>
 ### VolumetricMaxPooling ###
 
@@ -962,3 +1049,13 @@ values (corresponding to their position within each map) are stored:
 If `C` is a tensor of same size as `B`, `module:updateOutput(C)` outputs a
 tensor `D` of same size as `A` such that:
 `D[{n,k,indices[{n,k,t}],indices[{n,k,i}],indices[{n,k,j}]}] = C[{n,k,t,i,j}]`.
+
+<a name="nn.VolumetricReplicationPadding"></a>
+### VolumetricReplicationPadding ###
+
+```lua
+module = nn.VolumetricReplicationPadding(padLeft, padRight, padTop, padBottom,
+                                         padFront, padBack)
+```
+
+Each feature map of a given input is padded with the replication of the input boundary.
diff --git a/doc/criterion.md b/doc/criterion.md
index 6e25f72..270edb9 100644
--- a/doc/criterion.md
+++ b/doc/criterion.md
@@ -332,7 +332,7 @@ By default, the losses are averaged over observations for each minibatch. Howeve
 criterion = nn.SoftMarginCriterion()
 ```
 
-Creates a criterion that optimizes a two-class classification logisitic loss between input `x` (a `Tensor` of dimension `1`) and output `y` (which is a tensor containing either `1`s or `-1`s).
+Creates a criterion that optimizes a two-class classification logistic loss between input `x` (a `Tensor` of dimension `1`) and output `y` (which is a tensor containing either `1`s or `-1`s).
 
 ```lua
 loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x:nElement()
diff --git a/doc/image/relu6.png b/doc/image/relu6.png
new file mode 100644
index 0000000..0a88563
Binary files /dev/null and b/doc/image/relu6.png differ
diff --git a/doc/module.md b/doc/module.md
index ce8c7b4..02b0603 100644
--- a/doc/module.md
+++ b/doc/module.md
@@ -300,11 +300,11 @@ This function will go over all the weights and gradWeights and make them view in
 
 <a name="nn.Module.training"></a>
 ### training() ###
-This sets the mode of the Module (or sub-modules) to `train=true`. This is useful for modules like [Dropout](simple.md#nn.Dropout) that have a different behaviour during training vs evaluation.
+This sets the mode of the Module (or sub-modules) to `train=true`. This is useful for modules like [Dropout](simple.md#nn.Dropout) or [BatchNormalization](simple.md#nn.BatchNormalization) that have a different behaviour during training vs evaluation.
 
 <a name="nn.Module.evaluate"></a>
 ### evaluate() ###
-This sets the mode of the Module (or sub-modules) to `train=false`. This is useful for modules like [Dropout](simple.md#nn.Dropout) that have a different behaviour during training vs evaluation.
+This sets the mode of the Module (or sub-modules) to `train=false`. This is useful for modules like [Dropout](simple.md#nn.Dropout) or [BatchNormalization](simple.md#nn.BatchNormalization) that have a different behaviour during training vs evaluation.
 
 <a name="nn.Module.findModules"></a>
 ### findModules(typename) ###
diff --git a/doc/simple.md b/doc/simple.md
index 50e5c9f..6f01a56 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -51,6 +51,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [Padding](#nn.Padding) : adds padding to a dimension ;
     * [L1Penalty](#nn.L1Penalty) : adds an L1 penalty to an input (for sparsity) ;
     * [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ;
+    * [GPU](#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
 
 <a name="nn.Linear"></a>
 ## Linear ##
@@ -1404,3 +1405,50 @@ One can also call:
 module:setLambda(lambda)
 ```
 to set the hyper-parameter `lambda` dynamically during training.
+
+<a name="nn.GPU"></a>
+## GPU ##
+
+```lua
+gpu = nn.GPU(module, device, [outdevice])
+require 'cunn'
+gpu:cuda()
+``` 
+
+Decorates an encapsulated `module` so that it can be executed on a specific GPU `device`.
+The decorated module's `parameters` are thus hosted on the specified GPU `device`.
+All operations on the `gpu` module are executed on that device.
+Calls to `forward`/`backward` will transfer arguments `input` and `gradOutput` to the specified `device`, 
+which are then fed as arguments to the decorated `module`. 
+Returned `output` is located on the specified `outdevice` (defaults to `device`). 
+Returned `gradInput` is allocated on the same device as the `input`.
+
+When serialized/deserialized, the `gpu` module will be run on the same `device` that it was serialized with.
+To prevent this from happening, the module can be converted to float/double before serialization:
+
+```lua
+gpu:float()
+gpustr = torch.serialize(gpu)
+``` 
+
+The module is located in the __nn__ package instead of __cunn__ as this allows
+it to be used in CPU-only environments, which are common for production models.
+
+The module supports nested table `input` and `gradOutput` tensors originating from multiple devices.
+Each nested tensor in the returned `gradInput` will be transferred to the device of its commensurate tensor in the `input`.
+
+The intended use-case is not for model-parallelism where the models are executed in parallel on multiple devices, but 
+for sequential models where a single GPU doesn't have enough memory. 
+
+Example using 4 GPUs:
+
+```lua
+mlp = nn.Sequential()
+   :add(nn.GPU(nn.Linear(10000,10000), 1))
+   :add(nn.GPU(nn.Linear(10000,10000), 2))
+   :add(nn.GPU(nn.Linear(10000,10000), 3))
+   :add(nn.GPU(nn.Linear(10000,10000), 4, cutorch.getDevice()))
+``` 
+
+Note how the last `GPU` instance will return an `output` tensor on the same device as the current device (`cutorch.getDevice`).
+ 
diff --git a/doc/transfer.md b/doc/transfer.md
index c1dfc80..358ea7e 100644
--- a/doc/transfer.md
+++ b/doc/transfer.md
@@ -15,7 +15,7 @@ thus outputting a Tensor of the same dimension.
   * `f(x)` = `x,` `otherwise.`
 
 The range of the linear region `[-1 1]` can be adjusted by specifying arguments in declaration, for example `nn.HardTanh(min_value, max_value)`.
-Otherwise, `[min_value max_value]` is set to `[-1 1]` by default.
+Otherwise, `[min_value max_value]` is set to `[-1 1]` by default. In-place operation can be requested by passing `true` as a third boolean argument, e.g. `nn.HardTanh(min_value, max_value, true)`.
 
 
 ```lua
@@ -183,7 +183,7 @@ gnuplot.grid(true)
 <a name="nn.LogSoftMax"></a>
 ## LogSoftMax ##
 
-Applies the `LogSoftmax` function to an n-dimensional input Tensor.
+Applies the `LogSoftMax` function to an n-dimensional input Tensor.
 
 `LogSoftmax` is defined as `f_i(x)` = `log(1/a exp(x_i))`,
 where  `a` = `sum_j exp(x_j)`.
@@ -261,6 +261,29 @@ gnuplot.grid(true)
 ```
 ![](image/relu.png)
 
+<a name="nn.ReLU6"></a>
+## ReLU6 ##
+
+Same as `ReLU` except that the rectifying function `f(x)` saturates at `x = 6`. This layer is useful for training networks that do not lose precision (due to FP saturation) when implemented as FP16.
+
+`ReLU6` is defined as `f(x)` = `min(max(0, x), 6)`
+
+Can optionally do its operation in-place without using extra state memory:
+```lua
+m=nn.ReLU6(true) -- true = in-place, false = keeping separate state.
+```
+
+```lua
+ii=torch.linspace(-3, 9)
+m=nn.ReLU6() 
+oo=m:forward(ii)
+go=torch.ones(100)
+gi=m:backward(ii,go)
+gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+gnuplot.grid(true)
+```
+![](image/relu6.png)
+
 <a name="nn.PReLU"></a>
 ## PReLU ##
 
diff --git a/hessian.lua b/hessian.lua
index 4d3afa3..33ef2b0 100644
--- a/hessian.lua
+++ b/hessian.lua
@@ -53,7 +53,7 @@ function nn.hessian.enable()
          error('Number of weights is not equal to number of weights squares')
       end
       module.diagHessianInput = module.diagHessianInput or input.new()
-      module.diagHessianInput:resizeAs(input)
+      module.diagHessianInput:resizeAs(input):zero()
 
       local gi = module.gradInput
       module.gradInput = module.diagHessianInput
diff --git a/init.lua b/init.lua
index 516f29b..a9c68da 100644
--- a/init.lua
+++ b/init.lua
@@ -15,6 +15,7 @@ require('nn.Concat')
 require('nn.Parallel')
 require('nn.Sequential')
 require('nn.DepthConcat')
+require('nn.Bottle')
 
 require('nn.Linear')
 require('nn.Bilinear')
@@ -83,6 +84,7 @@ require('nn.HardShrink')
 require('nn.SoftShrink')
 require('nn.Threshold')
 require('nn.ReLU')
+require('nn.ReLU6')
 require('nn.PReLU')
 require('nn.LeakyReLU')
 require('nn.SpatialSoftMax')
@@ -99,6 +101,7 @@ require('nn.SpatialConvolutionMap')
 require('nn.SpatialDilatedConvolution')
 require('nn.SpatialSubSampling')
 require('nn.SpatialMaxPooling')
+require('nn.SpatialDilatedMaxPooling')
 require('nn.SpatialMaxUnpooling')
 require('nn.SpatialFractionalMaxPooling')
 require('nn.SpatialLPPooling')
@@ -115,14 +118,19 @@ require('nn.SpatialZeroPadding')
 require('nn.SpatialReflectionPadding')
 require('nn.SpatialReplicationPadding')
 require('nn.SpatialUpSamplingNearest')
+require('nn.SpatialUpSamplingBilinear')
 require('nn.SpatialBatchNormalization')
 
 require('nn.VolumetricConvolution')
 require('nn.VolumetricFullConvolution')
+require('nn.VolumetricDilatedConvolution')
 require('nn.VolumetricMaxPooling')
 require('nn.VolumetricMaxUnpooling')
 require('nn.VolumetricAveragePooling')
 require('nn.VolumetricBatchNormalization')
+require('nn.VolumetricReplicationPadding')
+
+require('nn.GPU')
 
 require('nn.ParallelTable')
 require('nn.Identity')
diff --git a/lib/THNN/generic/ClassNLLCriterion.c b/lib/THNN/generic/ClassNLLCriterion.c
index eb02f7c..aea726c 100644
--- a/lib/THNN/generic/ClassNLLCriterion.c
+++ b/lib/THNN/generic/ClassNLLCriterion.c
@@ -20,6 +20,9 @@ void THNN_(ClassNLLCriterion_updateOutput)(
   if (THTensor_(nDimension)(input) > 2) {
     THError("input tensor should be 1D or 2D");
   }
+  if (weights && THTensor_(nElement)(weights) != n_classes) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
 
   input = THTensor_(newContiguous)(input);
   target = THIndexTensor_(newContiguous)(target);
@@ -34,7 +37,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
   output_data[0] = total_weight_data[0] = 0.0;
 
   if (THTensor_(nDimension)(input) == 1) {
-    int cur_target = target_data[0] - 1;
+    int cur_target = target_data[0] - TH_INDEX_BASE;
     THAssert(cur_target >= 0 && cur_target < n_classes);
     total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
     output_data[0] = -input_data[cur_target] * total_weight_data[0];
@@ -46,7 +49,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
 
     int i;
     for (i = 0; i < batch_size; i++) {
-      int cur_target = target_data[i] - 1;
+      int cur_target = target_data[i] - TH_INDEX_BASE;
       THAssert(cur_target >= 0 && cur_target < n_classes);
 
       real cur_weight = weights ? weights_data[cur_target] : 1.0f;
@@ -95,6 +98,10 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
   if (THTensor_(nDimension)(input) > 2) {
     THError("input tensor should be 1D or 2D");
   }
+  
+  if (weights && THTensor_(nElement)(weights) != n_classes) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
 
   target = THIndexTensor_(newContiguous)(target);
   weights = weights ? THTensor_(newContiguous)(weights) : NULL;
@@ -104,7 +111,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
   real *gradInput_data = THTensor_(data)(gradInput);
 
   if (THTensor_(nDimension)(input) == 1) {
-    int cur_target = target_data[0] - 1;
+    int cur_target = target_data[0] - TH_INDEX_BASE;
     THAssert(cur_target >= 0 && cur_target < n_classes);
 
     gradInput_data[cur_target] =
@@ -118,7 +125,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)(
 
     int i;
     for (i = 0; i < batch_size; i++){
-      int cur_target = target_data[i] - 1;
+      int cur_target = target_data[i] - TH_INDEX_BASE;
 
       THAssert(cur_target >= 0 && cur_target < n_classes);
 
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
index 9764ec0..3b7ba3d 100644
--- a/lib/THNN/generic/HardTanh.c
+++ b/lib/THNN/generic/HardTanh.c
@@ -7,37 +7,59 @@ void THNN_(HardTanh_updateOutput)(
           THTensor *input,
           THTensor *output,
           real min_val,
-          real max_val)
+          real max_val,
+          bool inplace)
 {
-  THTensor_(resizeAs)(output, input);
+  if (inplace)
+    THTensor_(set)(output, input);
+  else
+    THTensor_(resizeAs)(output, input);
   
   if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
   {
-    TH_TENSOR_APPLY2(real, output, real, input,
-      if (*input_data < min_val)
-        *output_data = min_val;
-      else if (*input_data <= max_val)
-        *output_data = *input_data;
-      else
-        *output_data = max_val;
-    );
+    if (inplace)
+      TH_TENSOR_APPLY(real, input,
+        if (*input_data < min_val)
+          *input_data = min_val;
+        else if (*input_data > max_val)
+          *input_data = max_val;
+      );
+      TH_TENSOR_APPLY2(real, output, real, input,
+        if (*input_data < min_val)
+          *output_data = min_val;
+        else if (*input_data <= max_val)
+          *output_data = *input_data;
+        else
+          *output_data = max_val;
+      );
   }
   else
   {
-    real* ptr_output = THTensor_(data)(output);
     real* ptr_input  = THTensor_(data)(input);
+    real* ptr_output = THTensor_(data)(output);
     long i;
+    long n = THTensor_(nElement)(input);
 
+    if (inplace)
 #pragma omp parallel for private(i)
-    for (i = 0; i < THTensor_(nElement)(input); i++)
-    {
-      if (ptr_input[i] < min_val)
-        ptr_output[i] = min_val;
-      else if (ptr_input[i] <= max_val)
-        ptr_output[i] = ptr_input[i];
-      else
-        ptr_output[i] = max_val;
-    }
+      for (i = 0; i < n; i++)
+      {
+        if (ptr_input[i] < min_val)
+          ptr_input[i] = min_val;
+        else if (ptr_input[i] > max_val)
+          ptr_input[i] = max_val;
+      }
+    else
+#pragma omp parallel for private(i)
+      for (i = 0; i < n; i++)
+      {
+        if (ptr_input[i] < min_val)
+          ptr_output[i] = min_val;
+        else if (ptr_input[i] <= max_val)
+          ptr_output[i] = ptr_input[i];
+        else
+          ptr_output[i] = max_val;
+      }
   }
 }
 
@@ -47,21 +69,33 @@ void THNN_(HardTanh_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           real min_val,
-          real max_val)
+          real max_val,
+          bool inplace)
 {
-  THTensor_(resizeAs)(gradInput, input);
+  if (inplace)
+    THTensor_(set)(gradInput, gradOutput);
+  else
+    THTensor_(resizeAs)(gradInput, input);
 
   if (input->nDimension == 1 ||
     !THTensor_(isContiguous)(input) ||
     !THTensor_(isContiguous)(gradOutput) ||
     !THTensor_(isContiguous)(gradInput))
   {
-    TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
-      if (*input_data < min_val || *input_data > max_val)
-        *gradInput_data = 0;
-      else
-        *gradInput_data = *gradOutput_data;
-    );
+    if (inplace)
+    {
+      TH_TENSOR_APPLY2(real, gradOutput, real, input,
+        if (*input_data < min_val || *input_data > max_val)
+          *gradOutput_data = 0;
+      );
+    }
+    else
+      TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+        if (*input_data < min_val || *input_data > max_val)
+          *gradInput_data = 0;
+        else
+          *gradInput_data = *gradOutput_data;
+      );
   }
   else
   {
@@ -69,15 +103,24 @@ void THNN_(HardTanh_updateGradInput)(
     real* ptr_gradInput  = THTensor_(data)(gradInput);
     real* ptr_input      = THTensor_(data)(input);
     long i;
+    long n = THTensor_(nElement)(input);
 
+    if (inplace)
 #pragma omp parallel for private(i)
-    for (i = 0; i < THTensor_(nElement)(input); i++)
-    {
-      if (ptr_input[i] < min_val || ptr_input[i] > max_val)
-        ptr_gradInput[i] = 0;
-      else
-        ptr_gradInput[i] = ptr_gradOutput[i];
-    }
+      for (i = 0; i < n; i++)
+      {
+        if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+          ptr_gradInput[i] = 0;
+      }
+    else
+#pragma omp parallel for private(i)
+      for (i = 0; i < n; i++)
+      {
+        if (ptr_input[i] < min_val || ptr_input[i] > max_val)
+          ptr_gradInput[i] = 0;
+        else
+          ptr_gradInput[i] = ptr_gradOutput[i];
+      }
   }
 }
 
diff --git a/lib/THNN/generic/LookupTable.c b/lib/THNN/generic/LookupTable.c
index a35ff84..378d1c3 100644
--- a/lib/THNN/generic/LookupTable.c
+++ b/lib/THNN/generic/LookupTable.c
@@ -12,12 +12,12 @@ static void THNN_(LookupTable_resetCount)(
 
   for (i = 0; i<numel; i++)
   {
-    long k = input_data[i] - 1;
+    long k = input_data[i] - TH_INDEX_BASE;
     count_data[k] = 0;
   }
   for (i = 0; i<numel; i++)
   {
-    long k = input_data[i] - 1;
+    long k = input_data[i] - TH_INDEX_BASE;
     count_data[k]++;
   }
 }
@@ -56,7 +56,7 @@ void THNN_(LookupTable_accGradParameters)(
 
   // check that inputs are all within range
   for (i=0; i<numel; i++)
-    if (input_data[i] < 1 || input_data[i] > numw)
+    if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE)
       THError("input out of range");
 
   gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -86,7 +86,7 @@ void THNN_(LookupTable_accGradParameters)(
       {
         if (input_data[i] != paddingValue)
         {
-            long k = input_data[i] - 1;
+            long k = input_data[i] - TH_INDEX_BASE;
             if (k >= start && k < end)
             {
                 real scale_ = scale;
@@ -106,7 +106,7 @@ void THNN_(LookupTable_accGradParameters)(
   {
     if (input_data[i] != paddingValue)
     {
-        long k = input_data[i] - 1;
+        long k = input_data[i] - TH_INDEX_BASE;
         real scale_ = scale;
         if (count_data) scale_ /= count_data[k];
         THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
@@ -178,7 +178,7 @@ void THNN_(LookupTable_renorm)(
   long stride = THTensor_(stride)(weight, 0);
   real *gw = THTensor_(data)(weight);
   for (i=0; i<numel; i++)
-    if (row_idx[i] < 1 || row_idx[i] > numw)
+    if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE)
       THError("input out of range");
   // get unique indices
   qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
@@ -197,7 +197,7 @@ void THNN_(LookupTable_renorm)(
     #pragma omp parallel for private(i)
     for (i=0; i<numel; i++)
     {
-      long k = row_idx[i] - 1;
+      long k = row_idx[i] - TH_INDEX_BASE;
       THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
     }
     return;
@@ -205,7 +205,7 @@ void THNN_(LookupTable_renorm)(
 #endif
   for (i=0; i<numel; i++)
   {
-    long k = row_idx[i] - 1;
+    long k = row_idx[i] - TH_INDEX_BASE;
     THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
   }
 }
diff --git a/lib/THNN/generic/MultiLabelMarginCriterion.c b/lib/THNN/generic/MultiLabelMarginCriterion.c
index 4cbb000..9cfc5fe 100644
--- a/lib/THNN/generic/MultiLabelMarginCriterion.c
+++ b/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -47,14 +47,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(
   {
     for (ddt = 0; ddt < dim; ddt++)
     {
-      long target_idx = (long)target_data[ddt]-1;
+      long target_idx = (long)target_data[ddt] - TH_INDEX_BASE;
       if (target_idx < 0)
         break;
       isTarget_data[target_idx] = 1;
     }
     for (dt = 0; dt < dim; dt++)
     {
-      long target_idx = (long)target_data[dt]-1;
+      long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
       real input_target;
       if (target_idx < 0)
         break;
@@ -141,7 +141,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(
   {
     for (dt = 0; dt < dim; dt++)
     {
-      long target_idx = (long)target_data[dt]-1;
+      long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
       real input_target;
       if (target_idx < 0)
         break;
diff --git a/lib/THNN/generic/MultiMarginCriterion.c b/lib/THNN/generic/MultiMarginCriterion.c
index 2463da1..455cf5e 100644
--- a/lib/THNN/generic/MultiMarginCriterion.c
+++ b/lib/THNN/generic/MultiMarginCriterion.c
@@ -34,7 +34,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
   for (t = 0; t < nframe; t++)
   {
     real idx = THTensor_(get1d)(target, t);
-    THArgCheck((idx >= 1) && (idx <= dim), 3, "target out of range");
+    THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range");
   }
 
   input = THTensor_(newContiguous)(input);
@@ -47,7 +47,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
   sum = 0;
   for (t = 0; t < nframe; t++)
   {
-    long target_idx = (long)(target_data[t]-1);
+    long target_idx = (long)(target_data[t] - TH_INDEX_BASE);
     real input_target = input_data[target_idx];
     for (d = 0; d < dim; d++)
     {
@@ -124,7 +124,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)(
 
   for (t = 0; t < nframe; t++)
   {
-    long target_idx = (long)(target_data[t])-1;
+    long target_idx = (long)(target_data[t]) - TH_INDEX_BASE;
     real input_target = input_data[target_idx];
     real gradInput_target = 0;
     for (d = 0; d < dim; d++)
diff --git a/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
index 61afc40..5d6d995 100644
--- a/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
+++ b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -30,7 +30,7 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
 
       for(j = 0; j < owidth; j++)
       {
-        
+
         int x_start = (int)floor((float)j / owidth * iwidth);
         int x_end   = (int)ceil((float)(j + 1) / owidth * iwidth);
         int kW = x_end-x_start;
@@ -64,8 +64,8 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
         *op = maxval;
 
         /* store location of max (x,y) */
-        *indyp = (int)(maxindex / kW)+1;
-        *indxp = (maxindex % kW) +1;
+        *indyp = (int)(maxindex / kW) + TH_INDEX_BASE;
+        *indxp = (maxindex % kW) + TH_INDEX_BASE;
       }
     }
   }
@@ -85,7 +85,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
   long nslices;
   long iheight;
   long iwidth;
-  
+
   long istride_d;
   long istride_h;
   long istride_w;
@@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
 
   THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
 
-  if (input->nDimension == 4) 
+  if (input->nDimension == 4)
   {
     istride_b = input->stride[0];
     nbatch = input->size[0];
@@ -179,7 +179,7 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
     real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
     real *indx_p_k = indx_p + k*owidth*oheight;
     real *indy_p_k = indy_p + k*owidth*oheight;
-    
+
     /* calculate max points */
     long i, j;
     for(i = 0; i < oheight; i++)
@@ -189,9 +189,9 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
       {
         int x_start = (int)floor((float) j / owidth * iwidth);
         /* retrieve position of max */
-        long maxi = indy_p_k[i*owidth + j] - 1 + y_start;
-        long maxj = indx_p_k[i*owidth + j] - 1 + x_start;
-        
+        long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
+        long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
+
         /* update gradient */
         gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
       }
diff --git a/lib/THNN/generic/SpatialClassNLLCriterion.c b/lib/THNN/generic/SpatialClassNLLCriterion.c
index 3121c30..cbb4cea 100644
--- a/lib/THNN/generic/SpatialClassNLLCriterion.c
+++ b/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -7,6 +7,9 @@
               "only batches of spatial targets supported (3D tensors)");         \
   THArgCheck(THTensor_(nDimension)(input) == 4, 2,                               \
               "only batches of spatial inputs supported (4D tensors)");          \
+  if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) {    \
+    THError("weight tensor should be defined either for all or no classes");     \
+  }                                                                              \
                                                                                  \
   {                                                                              \
     long input0 = THTensor_(size)(input, 0);                                     \
@@ -51,7 +54,7 @@ void THNN_(SpatialClassNLLCriterion_updateOutput)(
   real output_acc = 0;
   for (int b = 0; b < batch_size; b++) {
     for (int elem = 0; elem < map_size; elem++) {
-      int cur_target = target_data[b * map_size + elem] - 1;
+      int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
       THAssert(cur_target >= 0 && cur_target < n_classes);
 
       real cur_weight = weights ? weights_data[cur_target] : 1.0f;
@@ -102,11 +105,12 @@ void THNN_(SpatialClassNLLCriterion_updateGradInput)(
 
   real normalize = sizeAverage ? *total_weight_data : 1.0f;
 
-  int b,elem;
-#pragma omp parallel for
+  int b;
+  #pragma omp parallel for
   for (b = 0; b < batch_size; b++) {
+    int elem;
     for (elem = 0; elem < map_size; elem++) {
-      int cur_target = target_data[b * map_size + elem] - 1;
+      int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
       THAssert(cur_target >= 0 && cur_target < n_classes);
 
       gradInput_data[b * sample_size + cur_target * map_size + elem] =
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index a549a37..e7460c8 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -174,6 +174,10 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
 
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
+  // depending on the BLAS library, fgradInput (result tensor) might
+  // be left uninitialized on zero alpha, which might lead to weird behavior
+  // hence, to be safe, zero it
+  THTensor_(zero)(fgradInput); 
   THTensor_(transpose)(weight, weight, 0, 1);
 
   if(input->nDimension == 3)
diff --git a/lib/THNN/generic/SpatialConvolutionMap.c b/lib/THNN/generic/SpatialConvolutionMap.c
index aef0b1e..82886c2 100644
--- a/lib/THNN/generic/SpatialConvolutionMap.c
+++ b/lib/THNN/generic/SpatialConvolutionMap.c
@@ -10,7 +10,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)(
   THArgCheck(
     weight != NULL && weight->nDimension == 3
     && connTable != NULL && connTable->size[0] == weight->size[0], 4,
-    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   real *weight_data = THTensor_(data)(weight);
@@ -75,8 +75,8 @@ void THNN_(SpatialConvolutionMap_updateOutput)(
       for (k = 0; k < nweight; k++)
       {
         /* get offsets for input/output */
-        int o = (int)connTable_data[k*2+1]-1;
-        int i = (int)connTable_data[k*2+0]-1;
+        int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+        int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
 
         if (o == p)
         {
@@ -106,7 +106,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
   THArgCheck(
     weight != NULL && weight->nDimension == 3
     && connTable != NULL && connTable->size[0] == weight->size[0], 5,
-    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   real *weight_data = THTensor_(data)(weight);
@@ -154,8 +154,8 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
       int nkernel = connTable->size[0];
       for (k = 0; k < nkernel; k++)
       {
-        int o = (int)connTable_data[k*2+1]-1;
-        int i = (int)connTable_data[k*2+0]-1;
+        int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+        int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
         if (i == p)
         {
           /* gradient to input */
@@ -182,7 +182,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)(
   THArgCheck(
     gradWeight != NULL && gradWeight->nDimension == 3
     && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
-    "3D gradWeight tensor expected (connTable:size(1) x kH x kW)"
+    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   real *gradWeight_data = THTensor_(data)(gradWeight);
@@ -237,8 +237,8 @@ void THNN_(SpatialConvolutionMap_accGradParameters)(
     long m;
     for (m = 0; m < nbatch; m++)
     {
-      int o = (int)THTensor_(get2d)(connTable,k,1)-1;
-      int i = (int)THTensor_(get2d)(connTable,k,0)-1;
+      int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+      int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
 
       /* gradient to kernel */
       THTensor_(validXCorr2DRevptr)(
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
index 3f75016..3928af0 100644
--- a/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -49,6 +49,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
 
   // Resize output
   THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+  THTensor_(zero)(output);
 
   // Resize temporary columns
   THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
@@ -171,6 +172,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
 
   // Resize temporary columns
   THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+  THTensor_(zero)(gradColumns);
 
   // Helpers
   THTensor *gradInput_n = THTensor_(new)();
diff --git a/lib/THNN/generic/SpatialFractionalMaxPooling.c b/lib/THNN/generic/SpatialFractionalMaxPooling.c
index 1c2b6ab..c0a9384 100644
--- a/lib/THNN/generic/SpatialFractionalMaxPooling.c
+++ b/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -79,7 +79,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
 
         outputForPlane[h * outputW + w] = maxVal;
         /* +1 to lua index */
-        indicesForPlane[h * outputW + w] = (real) maxIndex + 1;
+        indicesForPlane[h * outputW + w] = (real) maxIndex + TH_INDEX_BASE;
       }
     }
 
@@ -96,7 +96,7 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
     int poolSizeW, int poolSizeH,
     THTensor *indices,
     THTensor *randomSamples) {
-  
+
   long numBatch = 1;
   int planeDim = 0;
   int heightDim = 1;
@@ -177,7 +177,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
     for (h = 0; h < outputH; ++h) {
       for (w = 0; w < outputW; ++w) {
         long outputIndex = h * outputW + w;
-        long index = indicesForPlane[outputIndex] - 1;
+        long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
         THAssert(index >= 0 && index < inputW * inputH);
 
         gradInputForPlane[index] += gradOutputForPlane[outputIndex];
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
index 20dd126..a82477d 100644
--- a/lib/THNN/generic/SpatialFullConvolution.c
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -98,6 +98,7 @@ void THNN_(SpatialFullConvolution_updateOutput)(
 
   // Resize temporary columns
   THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+  THTensor_(zero)(columns);
 
   // Define a buffer of ones, for bias accumulation
   // Note: this buffer can be shared with other modules, it only ever gets increased,
@@ -152,16 +153,17 @@ void THNN_(SpatialFullConvolution_updateOutput)(
     long k_ = 1;
 
     // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THBlas_(gemm)(
-        't', 'n',
-        n_, m_, k_,
-        1,
-        THTensor_(data)(ones), k_,
-        THTensor_(data)(bias), k_,
-        1,
-        THTensor_(data)(output_n), n_
-    );
-
+    if (bias) {
+      THBlas_(gemm)(
+          't', 'n',
+          n_, m_, k_,
+          1,
+          THTensor_(data)(ones), k_,
+          THTensor_(data)(bias), k_,
+          1,
+          THTensor_(data)(output_n), n_
+      );
+    }
   }
 
   // Free
@@ -210,6 +212,7 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
 
   // Resize output
   THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+  THTensor_(zero)(gradInput);
 
   // Resize temporary columns
   THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
@@ -355,15 +358,17 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
     long k_ = outputHeight * outputWidth;
 
     // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    THBlas_(gemv)(
-        't',
-        k_, m_,
-        scale,
-        THTensor_(data)(gradOutput_n), k_,
-        THTensor_(data)(ones), 1,
-        1,
-        THTensor_(data)(gradBias), 1
-    );
+    if (gradBias) {
+      THBlas_(gemv)(
+          't',
+          k_, m_,
+          scale,
+          THTensor_(data)(gradOutput_n), k_,
+          THTensor_(data)(ones), 1,
+          1,
+          THTensor_(data)(gradBias), 1
+      );
+    }
   }
 
   // Free
diff --git a/lib/THNN/generic/SpatialFullConvolutionMap.c b/lib/THNN/generic/SpatialFullConvolutionMap.c
index bbb0282..1bd3455 100644
--- a/lib/THNN/generic/SpatialFullConvolutionMap.c
+++ b/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -10,7 +10,7 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)(
   THArgCheck(
     weight != NULL && weight->nDimension == 3
     && connTable != NULL && connTable->size[0] == weight->size[0], 4,
-    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   const int kH = (int)weight->size[1];
@@ -62,8 +62,8 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)(
     for (k = 0; k < nweight; k++)
     {
       /* get offsets for input/output */
-      int o = (int)connTable_data[k*2+1]-1;
-      int i = (int)connTable_data[k*2+0]-1;
+      int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+      int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
 
       if (o == p)
       {
@@ -91,7 +91,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)(
   THArgCheck(
     weight != NULL && weight->nDimension == 3
     && connTable != NULL && connTable->size[0] == weight->size[0], 5,
-    "3D weight tensor expected (connTable:size(1) x kH x kW)"
+    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   /* contiguous */
@@ -125,8 +125,8 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)(
     int nkernel = connTable->size[0];
     for (k = 0; k < nkernel; k++)
     {
-      int o = (int)connTable_data[k*2+1]-1;
-      int i = (int)connTable_data[k*2+0]-1;
+      int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+      int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
       if (i == p)
       {
         /* gradient to input */
@@ -154,7 +154,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)(
   THArgCheck(
     gradWeight != NULL && gradWeight->nDimension == 3
     && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
-    "3D gradWeight tensor expected (connTable:size(1) x kH x kW)"
+    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
   );
 
   /* contiguous */
@@ -191,8 +191,8 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)(
 #pragma omp parallel for private(k)
   for (k = 0; k < nkernel; k++)
   {
-    int o = (int)THTensor_(get2d)(connTable,k,1)-1;
-    int i = (int)THTensor_(get2d)(connTable,k,0)-1;
+    int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+    int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
 
     /* gradient to kernel */
     THTensor_(validXCorr2DRevptr)(
diff --git a/lib/THNN/generic/SpatialMaxPooling.c b/lib/THNN/generic/SpatialMaxPooling.c
index d28fe85..3daef1d 100644
--- a/lib/THNN/generic/SpatialMaxPooling.c
+++ b/lib/THNN/generic/SpatialMaxPooling.c
@@ -16,7 +16,10 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(
           int dW,
           int dH,
           int padW,
-          int padH)
+          int padH,
+          int dilationW,
+          int dilationH
+          )
 {
   long k;
 #pragma omp parallel for private(k)
@@ -31,10 +34,12 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(
       {
         long hstart = i * dH - padH;
         long wstart = j * dW - padW;
-        long hend = fminf(hstart + kH, iheight);
-        long wend = fminf(wstart + kW, iwidth);
-        hstart = fmaxf(hstart, 0);
-        wstart = fmaxf(wstart, 0);
+        long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight);
+        long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth);
+        while(hstart < 0)
+          hstart += dilationH;
+        while(wstart < 0)
+          wstart += dilationW;
 
         /* local pointers */
         real *op = output_p  + k*owidth*oheight + i*owidth + j;
@@ -45,9 +50,9 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(
         real maxval = -THInf;
         long tcntr = 0;
         long x,y;
-        for(y = hstart; y < hend; y++)
+        for(y = hstart; y < hend; y += dilationH)
         {
-          for(x = wstart; x < wend; x++)
+          for(x = wstart; x < wend; x += dilationW)
           {
             tcntr = y*iwidth + x;
             real val = *(ip + tcntr);
@@ -63,7 +68,7 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(
         *op = maxval;
 
         /* store location of max */
-        *indp = maxindex + 1;
+        *indp = maxindex + TH_INDEX_BASE;
       }
     }
   }
@@ -80,6 +85,8 @@ void THNN_(SpatialMaxPooling_updateOutput)(
           int dH,
           int padW,
           int padH,
+          int dilationW,
+          int dilationH,
           bool ceil_mode)
 {
   int dimw = 2;
@@ -97,31 +104,34 @@ void THNN_(SpatialMaxPooling_updateOutput)(
 
   THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
 
-  if (input->nDimension == 4) 
+  if (input->nDimension == 4)
   {
     nbatch = input->size[0];
     dimw++;
     dimh++;
   }
   THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
-
   THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
+  
   /* sizes */
   nslices = input->size[dimh-1];
   iheight = input->size[dimh];
   iwidth = input->size[dimw];
   if (ceil_mode)
   {
-    oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
-    owidth  = (long)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
+    oheight = (long)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+    owidth  = (long)(ceil((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
   }
   else
   {
-    oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
-    owidth  = (long)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
+    oheight = (long)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+    owidth  = (long)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
   }
 
+  if (owidth < 1 || oheight < 1)
+    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+            nslices,iheight,iwidth,nslices,oheight,owidth);
+
   if (padW || padH)
   {
     // ensure that the last pooling starts inside the image
@@ -151,7 +161,9 @@ void THNN_(SpatialMaxPooling_updateOutput)(
                                               iwidth, iheight,
                                               owidth, oheight,
                                               kW, kH, dW, dH,
-                                              padW, padH);
+                                              padW, padH,
+                                              dilationW, dilationH
+                                              );
   }
   else
   {
@@ -174,7 +186,9 @@ void THNN_(SpatialMaxPooling_updateOutput)(
                                                 iwidth, iheight,
                                                 owidth, oheight,
                                                 kW, kH, dW, dH,
-                                                padW, padH);
+                                                padW, padH,
+                                                dilationW, dilationH
+                                                );
     }
   }
 
@@ -209,7 +223,7 @@ static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
       for(j = 0; j < owidth; j++)
       {
         /* retrieve position of max */
-        long maxp = ind_p_k[i*owidth + j] - 1;
+        long maxp = ind_p_k[i*owidth + j] - TH_INDEX_BASE;
         /* update gradient */
         gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j];
       }
@@ -229,6 +243,8 @@ void THNN_(SpatialMaxPooling_updateGradInput)(
           int dH,
           int padW,
           int padH,
+          int dilationW,
+          int dilationH,
           bool ceil_mode)
 {
   int dimw = 2;
diff --git a/lib/THNN/generic/SpatialMaxUnpooling.c b/lib/THNN/generic/SpatialMaxUnpooling.c
index 6e7a76e..cd1739b 100644
--- a/lib/THNN/generic/SpatialMaxUnpooling.c
+++ b/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -11,7 +11,7 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o
   long k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
-  {    
+  {
     real *output_p_k = output_p + k*owidth*oheight;
     real *input_p_k = input_p + k*iwidth*iheight;
     real *ind_p_k = ind_p + k*iwidth*iheight;
@@ -21,7 +21,7 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o
     {
       for(j = 0; j < iwidth; j++)
       {
-        maxp = ind_p_k[i*iwidth + j] - 1;  /* retrieve position of max */
+        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE;  /* retrieve position of max */
         if(maxp<0 || maxp>=owidth*oheight){
             THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
         }
@@ -52,9 +52,9 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
   THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
   if (!THTensor_(isSameSizeAs)(input, indices)){
     THError("Invalid input size w.r.t current indices size");
-  }  
+  }
 
-  if (input->nDimension == 4) 
+  if (input->nDimension == 4)
   {
     nbatch = input->size[0];
     dimw++;
@@ -131,11 +131,11 @@ static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p,
     for(i = 0; i < iheight; i++)
     {
       for(j = 0; j < iwidth; j++)
-      {        
-        maxp = ind_p_k[i*iwidth + j] - 1; /* retrieve position of max */         
+      {
+        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
         if(maxp<0 || maxp>=owidth*oheight){
             THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
-        }  
+        }
         gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
       }
     }
@@ -162,7 +162,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
 
   if (!THTensor_(isSameSizeAs)(input, indices)){
     THError("Invalid input size w.r.t current indices size");
-  } 
+  }
 
   /* get contiguous gradOutput and indices */
   gradOutput = THTensor_(newContiguous)(gradOutput);
diff --git a/lib/THNN/generic/SpatialUpSamplingBilinear.c b/lib/THNN/generic/SpatialUpSamplingBilinear.c
new file mode 100644
index 0000000..78290b6
--- /dev/null
+++ b/lib/THNN/generic/SpatialUpSamplingBilinear.c
@@ -0,0 +1,127 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
+#else
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output){
+  input = THTensor_(newContiguous)(input);
+  output = THTensor_(newContiguous)(output);
+  THTensor_(zero)(output);
+  real *idata = THTensor_(data)(input);
+  real *odata = THTensor_(data)(output);
+  int channels = THTensor_(size)(input, 0) * THTensor_(size)(input, 1);
+  int height1 = THTensor_(size)(input, 2);
+  int width1 = THTensor_(size)(input, 3);
+  int height2 = THTensor_(size)(output, 2);
+  int width2 = THTensor_(size)(output, 3);
+  THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  // special case: just copy
+  if (height1 == height2 && width1 == width2) {
+    for (int h2 = 0; h2 < height2; ++h2) {
+      const int h1 = h2;
+      for (int w2 = 0; w2 < width2; ++w2) {
+        const int w1 = w2;
+        const real* pos1 = &idata[h1 * width1 + w1];
+        real* pos2 = &odata[h2 * width2 + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos2[0] = pos1[0];
+          pos1 += width1 * height1;
+          pos2 += width2 * height2;
+        }
+      }
+    }
+    return;
+  }
+  const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
+  const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
+  for (int h2 = 0; h2 < height2; ++h2) {
+    const float h1r = rheight * h2;
+    const int h1 = h1r;
+    const int h1p = (h1 < height1 - 1) ? 1 : 0;
+    const real h1lambda = h1r - h1;
+    const real h0lambda = (real)1. - h1lambda;
+    for (int w2 = 0; w2 < width2; ++w2) {
+      const float w1r = rwidth * w2;
+      const int w1 = w1r;
+      const int w1p = (w1 < width1 - 1) ? 1 : 0;
+      const real w1lambda = w1r - w1;
+      const real w0lambda = (real)1. - w1lambda;
+      const real* pos1 = &idata[h1 * width1 + w1];
+      real* pos2 = &odata[h2 * width2 + w2];
+      for (int c = 0; c < channels; ++c) {
+        pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
+                  + h1lambda * (w0lambda * pos1[h1p * width1]
+                  + w1lambda * pos1[h1p * width1 + w1p]);
+        pos1 += width1 * height1;
+        pos2 += width2 * height2;
+      }
+    }
+  }
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+    THNNState *state,
+    THTensor *gradOutput,
+    THTensor *gradInput){
+  gradInput = THTensor_(newContiguous)(gradInput);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  THTensor_(zero)(gradInput);
+  real *data1 = THTensor_(data)(gradInput);
+  real *data2 = THTensor_(data)(gradOutput);
+  int channels = THTensor_(size)(gradInput, 0) * THTensor_(size)(gradInput, 1);
+  int height1 = THTensor_(size)(gradInput, 2);
+  int width1 = THTensor_(size)(gradInput, 3);
+  int height2 = THTensor_(size)(gradOutput, 2);
+  int width2 = THTensor_(size)(gradOutput, 3);
+  THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  // special case: same-size matching grids
+  if (height1 == height2 && width1 == width2) {
+    for (int h2 = 0; h2 < height2; ++h2) {
+      const int h1 = h2;
+      for (int w2 = 0; w2 < width2; ++w2) {
+        const int w1 = w2;
+        real* pos1 = &data1[h1 * width1 + w1];
+        const real* pos2 = &data2[h2 * width2 + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos1[0] += pos2[0];
+          pos1 += width1 * height1;
+          pos2 += width2 * height2;
+        }
+      }
+    }
+    return;
+  }
+  const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
+  const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f;
+  for (int h2 = 0; h2 < height2; ++h2) {
+    const float h1r = rheight * h2;
+    const int h1 = h1r;
+    const int h1p = (h1 < height1 - 1) ? 1 : 0;
+    const real h1lambda = h1r - h1;
+    const real h0lambda = (real)1. - h1lambda;
+    for (int w2 = 0; w2 < width2; ++w2) {
+      const float w1r = rwidth * w2;
+      const int w1 = w1r;
+      const int w1p = (w1 < width1 - 1) ? 1 : 0;
+      const real w1lambda = w1r - w1;
+      const real w0lambda = (real)1. - w1lambda;
+      real* pos1 = &data1[h1 * width1 + w1];
+      const real* pos2 = &data2[h2 * width2 + w2];
+      for (int c = 0; c < channels; ++c) {
+        pos1[0] += h0lambda * w0lambda * pos2[0];
+        pos1[w1p] += h0lambda * w1lambda * pos2[0];
+        pos1[h1p * width1] += h1lambda * w0lambda * pos2[0];
+        pos1[h1p * width1 + w1p] += h1lambda * w1lambda * pos2[0];
+        pos1 += width1 * height1;
+        pos2 += width2 * height2;
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 1600fb1..7ad6f70 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -106,14 +106,16 @@ TH_API void THNN_(HardTanh_updateOutput)(
           THTensor *input,             // input tensor
           THTensor *output,            // [OUT] output tensor
           real min_val,                // lower threshold
-          real max_val);               // upper threshold
+          real max_val,
+          bool inplace);               // upper threshold
 TH_API void THNN_(HardTanh_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *gradOutput,        // gradient w.r.t. module's output
           THTensor *gradInput,         // [OUT] gradient w.r.t. the input
           real min_val,                // lower threshold
-          real max_val);               // upper threshold
+          real max_val,
+          bool inplace);               // upper threshold
 
 TH_API void THNN_(L1Cost_updateOutput)(
           THNNState *state,            // library's state
@@ -122,7 +124,7 @@ TH_API void THNN_(L1Cost_updateOutput)(
 TH_API void THNN_(L1Cost_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
-          THTensor *gradOutput,        // gradient w.r.t module's output
+          THTensor *gradOutput,        // [OPTIONAL] gradient w.r.t module's output
           THTensor *gradInput);        // [OUT] gradient w.r.t the input
 
 TH_API void THNN_(LeakyReLU_updateOutput)(
@@ -168,8 +170,8 @@ TH_API void THNN_(LookupTable_accGradParameters)(
           THTensor *gradOutput,
           THTensor *gradWeight,
           THIntegerTensor *count,
-          THTensor *sorted,
-          THTensor *indices,
+          THTensor *sorted,            // [OPTIONAL]
+          THTensor *indices,           // [OPTIONAL]
           bool scaleGradByFreq,
           int paddingValue,
           real scale);
@@ -245,7 +247,7 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)(
           THTensor *output,
           bool sizeAverage,
           int p,
-          THTensor* weights,
+          THTensor* weights,      // [OPTIONAL]
           real margin);
 TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
           THNNState *state,
@@ -254,7 +256,7 @@ TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
           THTensor *gradInput,
           bool sizeAverage,
           int p,
-          THTensor *weights,
+          THTensor *weights,      // [OPTIONAL]
           real margin);
 
 TH_API void THNN_(PReLU_updateOutput)(
@@ -537,8 +539,8 @@ TH_API void THNN_(BatchNormalization_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          THTensor *weight,
-          THTensor *bias,
+          THTensor *weight,       // [OPTIONAL]
+          THTensor *bias,         // [OPTIONAL]
           THTensor *running_mean,
           THTensor *running_var,
           THTensor *save_mean,
@@ -550,10 +552,10 @@ TH_API void THNN_(BatchNormalization_backward)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
-          THTensor *gradInput,
-          THTensor *gradWeight,
-          THTensor *gradBias,
-          THTensor *weight,
+          THTensor *gradInput,    // [OPTIONAL]
+          THTensor *gradWeight,   // [OPTIONAL]
+          THTensor *gradBias,     // [OPTIONAL]
+          THTensor *weight,       // [OPTIONAL]
           THTensor *running_mean,
           THTensor *running_var,
           THTensor *save_mean,
@@ -600,7 +602,7 @@ TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
+          THTensor *bias,         // [OPTIONAL]
           THTensor *finput,
           THTensor *fgradInput,
           int kW, int kH,
@@ -622,7 +624,7 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
-          THTensor *gradBias,
+          THTensor *gradBias,     // [OPTIONAL]
           THTensor *finput,
           THTensor *fgradInput,
           int kW, int kH,
@@ -726,7 +728,7 @@ TH_API void THNN_(SpatialFullConvolution_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
+          THTensor *bias,         // [OPTIONAL]
           THTensor *columns,
           THTensor *ones,
           int kW, int kH,
@@ -749,7 +751,7 @@ TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
-          THTensor *gradBias,
+          THTensor *gradBias,     // [OPTIONAL]
           THTensor *columns,
           THTensor *ones,
           int kW, int kH,
@@ -792,43 +794,43 @@ TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
           real scale);            // scaling factor
 
 TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
-    THNNState *state,
-    THTensor *input,
-    THTensor *output,
-    THTensor *weight,
-    THTensor *bias,
-    THTensor *columns,
-    THTensor *ones,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int dilationW, int dilationH);
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,         // [OPTIONAL]
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int dilationW, int dilationH);
 
 TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
-    THNNState *state,
-    THTensor *input,
-    THTensor *gradOutput,
-    THTensor *gradInput,
-    THTensor *weight,
-    THTensor *gradColumns,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int dilationW, int dilationH);
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int dilationW, int dilationH);
 
 TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
-    THNNState *state,
-    THTensor *input,
-    THTensor *gradOutput,
-    THTensor *gradWeight,
-    THTensor *gradBias,
-    THTensor *columns,
-    THTensor *ones,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int dilationW, int dilationH,
-    real scale);
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,     // [OPTIONAL]
+          THTensor *columns,
+          THTensor *ones,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int dilationW, int dilationH,
+          real scale);
 
 TH_API void THNN_(SpatialMaxPooling_updateOutput)(
           THNNState *state,
@@ -838,6 +840,7 @@ TH_API void THNN_(SpatialMaxPooling_updateOutput)(
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
+          int dilationW, int dilationH,
           bool ceil_mode);
 TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
           THNNState *state,
@@ -848,6 +851,7 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
+          int dilationW, int dilationH,
           bool ceil_mode);
 
 TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
@@ -902,6 +906,15 @@ TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
           THTensor *gradInput,
           int scale_factor);
 
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output);
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+          THNNState *state,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+
 TH_API void THNN_(unfolded_acc)(
           THTensor *finput,
           THTensor *input,
@@ -1031,6 +1044,45 @@ TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
           int aT, int aW, int aH,   // extra output adjustment
           real scale);              // scaling factor
 
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH,
+          real scale);
+
 TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -1067,30 +1119,51 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
           int dT, int dW, int dH,
           int pT, int pW, int pH);
 
-TH_API void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
-                                                         THTensor *input,
-                                                         THTensor *output,
-                                                         int pad_l, int pad_r,
-                                                         int pad_t, int pad_b);
-
-TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
-                                                            THTensor *input,
-                                                            THTensor *gradOutput,
-                                                            THTensor *gradInput,
-                                                            int pad_l, int pad_r,
-                                                            int pad_t, int pad_b);
-
-TH_API void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
-                                                         THTensor *input,
-                                                         THTensor *output,
-                                                         int pad_l, int pad_r,
-                                                         int pad_t, int pad_b);
-
-TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
-                                                            THTensor *input,
-                                                            THTensor *gradOutput,
-                                                            THTensor *gradInput,
-                                                            int pad_l, int pad_r,
-                                                            int pad_t, int pad_b);
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pleft, int pright,
+          int ptop, int pbottom,
+          int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pleft, int pright,
+          int ptop, int pbottom,
+          int pfront, int pback);
 
 #endif
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
index a226350..8fef1cf 100644
--- a/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -395,6 +395,10 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
 
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
+  // depending on the BLAS library, fgradInput (result tensor) might
+  // be left uninitialized on zero alpha, which might lead to weird behavior
+  // hence, to be safe, zero it
+  THTensor_(zero)(fgradInput);  
   THTensor_(transpose)(weight, weight, 0, 1);
 
   if (input->nDimension == 4)
diff --git a/lib/THNN/generic/VolumetricDilatedConvolution.c b/lib/THNN/generic/VolumetricDilatedConvolution.c
new file mode 100644
index 0000000..1a9cc93
--- /dev/null
+++ b/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -0,0 +1,356 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
+#else
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH)
+{
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
+  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params:
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[0]);
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[1]);
+  }
+
+  long inputDepth  = input->size[2];
+  long inputHeight  = input->size[3];
+  long inputWidth   = input->size[4];
+  long outputDepth  = (inputDepth  + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+  THTensor_(zero)(output);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 3 ||
+      ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix mulitply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long n_ = outputDepth * outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        0,
+        THTensor_(data)(output_n), n_
+      );
+    } else {
+      THTensor_(zero)(output_n);
+    }
+
+    // Extract columns:
+    THNN_(vol2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kT*kH*kW;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      'n', 'n',
+      n, m, k,
+      1,
+      THTensor_(data)(columns), n,
+      THTensor_(data)(weight), k,
+      1,
+      THTensor_(data)(output_n), n
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH)
+{
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputDepth  = input->size[2];
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+  THTensor_(zero)(gradColumns);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THTensor_(select)(gradInput_n, gradInput, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    long m = nInputPlane*kT*kW*kH;
+    long n = gradColumns->size[1];
+    long k = nOutputPlane;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+        'n', 't',
+        n, m, k,
+        1,
+        THTensor_(data)(gradOutput_n), n,
+        THTensor_(data)(weight), m,
+        0,
+        THTensor_(data)(gradColumns), n
+    );
+
+    // Unpack columns back into input:
+    THNN_(col2vol)(
+      THTensor_(data)(gradColumns),
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THTensor_(data)(gradInput_n)
+    );
+  }
+
+  // Free
+  THTensor_(free)(gradInput_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+    THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH,
+          real scale)
+{
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+
+  // Params
+  int nInputPlane = gradWeight->size[1];
+  int nOutputPlane = gradWeight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch
+    batch = 0;
+    THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputDepth  = input->size[2];
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(vol2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = nInputPlane*kT*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+        't', 'n',
+        n, m, k,
+        scale,
+        THTensor_(data)(columns), k,
+        THTensor_(data)(gradOutput_n), k,
+        1,
+        THTensor_(data)(gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long k_ = outputDepth * outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    if (gradBias) {
+      THBlas_(gemv)(
+          't',
+          k_, m_,
+          scale,
+          THTensor_(data)(gradOutput_n), k_,
+          THTensor_(data)(ones), 1,
+          1,
+          THTensor_(data)(gradBias), 1
+      );
+    }
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  if (batch == 0) {
+    THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
index 5a6a1a7..4eb36c4 100644
--- a/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -8,12 +8,13 @@ static void THNN_(vol2col)(
   const int kT, const int kH, const int kW,
   const int pT, const int pH, const int pW,
   const int dT, const int dH, const int dW,
+  const int dilationT, const int dilationH, const int dilationW,
   real *data_col)
 {
   int c, t, h, w;
-  int depth_col  = (depth  + 2 * pT - kT) / dT + 1;
-  int height_col = (height + 2 * pH - kH) / dH + 1;
-  int width_col  = (width  + 2 * pW - kW) / dW + 1;
+  int depth_col  = (depth  + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  int width_col  = (width  + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
   int channels_col = channels * kT * kH * kW;
   for (c = 0; c < channels_col; ++c)
   {
@@ -27,10 +28,12 @@ static void THNN_(vol2col)(
       {
         for (w = 0; w < width_col; ++w)
         {
-          int t_pad = t * dT - pT + t_offset;
-          int h_pad = h * dH - pH + h_offset;
-          int w_pad = w * dW - pW + w_offset;
-          if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          int t_pad = t * dT - pT + t_offset * dilationT;
+          int h_pad = h * dH - pH + h_offset * dilationH;
+          int w_pad = w * dW - pW + w_offset * dilationW;
+          if (t_pad >= 0 && t_pad < depth &&
+              h_pad >= 0 && h_pad < height &&
+              w_pad >= 0 && w_pad < width)
             data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
               data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
           else
@@ -47,13 +50,14 @@ static void THNN_(col2vol)(
   const int kT, const int kH, const int kW,
   const int pT, const int pH, const int pW,
   const int dT, const int dH, const int dW,
+  const int dilationT, const int dilationH, const int dilationW,
   real* data_vol)
 {
   int c, t, h, w;
   memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
-  int depth_col = (depth + 2 * pT - kT) / dT + 1;
-  int height_col = (height + 2 * pH - kH) / dH + 1;
-  int width_col = (width + 2 * pW - kW) / dW + 1;
+  int depth_col  = (depth  + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  int width_col  = (width  + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
   int channels_col = channels * kT * kH * kW;
   for (c = 0; c < channels_col; ++c)
   {
@@ -67,10 +71,12 @@ static void THNN_(col2vol)(
       {
         for (w = 0; w < width_col; ++w)
         {
-          int t_pad = t * dT - pT + t_offset;
-          int h_pad = h * dH - pH + h_offset;
-          int w_pad = w * dW - pW + w_offset;
-          if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          int t_pad = t * dT - pT + t_offset * dilationT;
+          int h_pad = h * dH - pH + h_offset * dilationH;
+          int w_pad = w * dW - pW + w_offset * dilationW;
+          if (t_pad >= 0 && t_pad < depth &&
+              h_pad >= 0 && h_pad < height &&
+              w_pad >= 0 && w_pad < width)
             data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
               data_col[((c * depth_col + t) * height_col + h) * width_col + w];
         }
@@ -137,6 +143,7 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
 
   // Resize temporary columns
   THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+  THTensor_(zero)(columns);
 
   // Define a buffer of ones, for bias accumulation
   // Note: this buffer can be shared with other modules, it only ever gets increased,
@@ -184,6 +191,7 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
       kT, kH, kW,
       pT, pH, pW,
       dT, dH, dW,
+       1,  1,  1,
       THTensor_(data)(output_n)
     );
 
@@ -268,6 +276,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
 
   // Resize output
   THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+  THTensor_(zero)(gradInput);
 
   // Resize temporary columns
   THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
@@ -291,6 +300,7 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
       kT, kH, kW,
       pT, pH, pW,
       dT, dH, dW,
+       1,  1,  1,
       THTensor_(data)(gradColumns)
     );
 
@@ -405,6 +415,7 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
       kT, kH, kW,
       pT, pH, pW,
       dT, dH, dW,
+       1,  1,  1,
       THTensor_(data)(columns)
     );
 
diff --git a/lib/THNN/generic/VolumetricReplicationPadding.c b/lib/THNN/generic/VolumetricReplicationPadding.c
new file mode 100644
index 0000000..c4ab02e
--- /dev/null
+++ b/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -0,0 +1,301 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
+#else
+
+static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+  real *input_p, real *output_p,
+  long nslices,
+  long iwidth, long iheight, long idepth,
+  long owidth, long oheight, long odepth,
+  int pleft, int pright,
+  int ptop, int pbottom,
+  int pfront, int pback)
+{
+  int iStartX = fmax(0, -pleft);
+  int iStartY = fmax(0, -ptop);
+  int iStartZ = fmax(0, -pfront);
+  int oStartX = fmax(0, pleft);
+  int oStartY = fmax(0, ptop);
+  int oStartZ = fmax(0, pfront);
+
+  long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+  for (k = 0; k < nslices; k++) {
+    long i, j, z;
+    for (z = 0; z < odepth; z++) {
+      for (i = 0; i < oheight; i++) {
+        for (j = 0; j < owidth; j++) {
+          if (j < pleft) {
+            ip_x = pleft;
+          } else if (j >= pleft && j < iwidth + pleft) {
+            ip_x = j;
+          } else {
+            ip_x = iwidth + pleft - 1;
+          }
+          ip_x = ip_x - oStartX + iStartX;
+
+          if (i < ptop) {
+            ip_y = ptop;
+          } else if (i >= ptop && i < iheight + ptop) {
+            ip_y = i;
+          } else {
+            ip_y = iheight + ptop - 1;
+          }
+          ip_y = ip_y - oStartY + iStartY;
+
+          if (z < pfront) {
+            ip_z = pfront;
+          } else if (z >= pfront && z < idepth + pfront) {
+            ip_z = z;
+          } else {
+            ip_z = idepth + pfront - 1;
+          }
+          ip_z = ip_z - oStartZ + iStartZ;
+
+          real *dest_p = output_p + k * owidth * oheight * odepth +
+              z * owidth * oheight + i * owidth + j;
+          real *src_p = input_p + k * iwidth * iheight * idepth +
+              ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+          *dest_p = *src_p;
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
+                                                      THTensor *input,
+                                                      THTensor *output,
+                                                      int pleft, int pright,
+                                                      int ptop, int pbottom,
+                                                      int pfront, int pback)
+{
+  int dimw = 3;
+  int dimh = 2;
+  int dimd = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long idepth;
+  long iheight;
+  long iwidth;
+  long odepth;
+  long oheight;
+  long owidth;
+  real *input_data;
+  real *output_data;
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5,
+             2, "input must be 4 or 5-dimensional");
+
+  if (input->nDimension == 5)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimd++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  idepth = input->size[dimd];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  odepth = idepth + pfront + pback;
+  oheight = iheight + ptop + pbottom;
+  owidth  = iwidth + pleft + pright;
+
+  THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1 , 2,
+             "input is too small");
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  /* resize output */
+  if (input->nDimension == 4)
+  {
+    THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+         input_data, output_data, nslices, iwidth, iheight, idepth,
+         owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
+         pback);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+        input_data + p * nslices * iwidth * iheight * idepth,
+        output_data + p * nslices * owidth * oheight * odepth,
+        nslices,
+        iwidth, iheight, idepth,
+        owidth, oheight, odepth,
+        pleft, pright,
+        ptop, pbottom,
+        pfront, pback);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+  real *ginput_p, real *goutput_p,
+  long nslices,
+  long iwidth, long iheight, long idepth,
+  long owidth, long oheight, long odepth,
+  int pleft, int pright,
+  int ptop, int pbottom,
+  int pfront, int pback)
+{
+  int iStartX = fmax(0, -pleft);
+  int iStartY = fmax(0, -ptop);
+  int iStartZ = fmax(0, -pfront);
+  int oStartX = fmax(0, pleft);
+  int oStartY = fmax(0, ptop);
+  int oStartZ = fmax(0, pfront);
+
+  long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+  for (k = 0; k < nslices; k++) {
+    long i, j, z;
+    for (z = 0; z < odepth; z++) {
+      for (i = 0; i < oheight; i++) {
+        for (j = 0; j < owidth; j++) {
+          if (j < pleft) {
+            ip_x = pleft;
+          } else if (j >= pleft && j < iwidth + pleft) {
+            ip_x = j;
+          } else {
+            ip_x = iwidth + pleft - 1;
+          }
+          ip_x = ip_x - oStartX + iStartX;
+
+          if (i < ptop) {
+            ip_y = ptop;
+          } else if (i >= ptop && i < iheight + ptop) {
+            ip_y = i;
+          } else {
+            ip_y = iheight + ptop - 1;
+          }
+          ip_y = ip_y - oStartY + iStartY;
+
+          if (z < pfront) {
+            ip_z = pfront;
+          } else if (z >= pfront && z < idepth + pfront) {
+            ip_z = z;
+          } else {
+            ip_z = idepth + pfront - 1;
+          }
+          ip_z = ip_z - oStartZ + iStartZ;
+
+          real *src_p = goutput_p + k * owidth * oheight * odepth +
+              z * owidth * oheight + i * owidth + j;
+          real *dest_p = ginput_p + k * iwidth * iheight * idepth +
+              ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+          *dest_p += *src_p;
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
+                                                         THTensor *input,
+                                                         THTensor *gradOutput,
+                                                         THTensor *gradInput,
+                                                         int pleft, int pright,
+                                                         int ptop, int pbottom,
+                                                         int pfront, int pback)
+{
+  int dimw = 3;
+  int dimh = 2;
+  int dimd = 1;
+  int dimslices = 0;
+  long nbatch = 1;
+  long nslices;
+  long idepth;
+  long iheight;
+  long iwidth;
+  long odepth;
+  long oheight;
+  long owidth;
+
+  if (input->nDimension == 5)
+  {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+    dimd++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  idepth = input->size[dimd];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  odepth = idepth + pfront + pback;
+  oheight = iheight + ptop + pbottom;
+  owidth  = iwidth + pleft + pright;
+
+  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+                "gradOutput width unexpected");
+  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+                "gradOutput height unexpected");
+  THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
+                "gradOutput depth unexpected");
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* backprop */
+  if (input->nDimension == 4) {
+    THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+      THTensor_(data)(gradInput),
+      THTensor_(data)(gradOutput),
+      nslices,
+      iwidth, iheight, idepth,
+      owidth, oheight, odepth,
+      pleft, pright,
+      ptop, pbottom,
+      pfront, pback);
+  } else {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++) {
+      THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+        THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth,
+        THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth,
+        nslices,
+        iwidth, iheight, idepth,
+        owidth, oheight, odepth,
+        pleft, pright,
+        ptop, pbottom,
+        pfront, pback);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index 7c0de94..739706c 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -148,6 +148,9 @@
 #include "generic/SpatialUpSamplingNearest.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/SpatialUpSamplingBilinear.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricAveragePooling.c"
 #include "THGenerateFloatTypes.h"
 
@@ -160,6 +163,9 @@
 #include "generic/VolumetricFullConvolution.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/VolumetricDilatedConvolution.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
@@ -171,3 +177,6 @@
 
 #include "generic/SpatialReplicationPadding.c"
 #include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricReplicationPadding.c"
+#include "THGenerateFloatTypes.h"
diff --git a/test.lua b/test.lua
index 8bf98ec..e288e25 100644
--- a/test.lua
+++ b/test.lua
@@ -85,6 +85,42 @@ function nntest.Add()
    end
 end
 
+function nntest.Bottle()
+   local ini = 2
+   local inj = 3
+   local ink = 4
+   local out = 5
+   local input = torch.Tensor(ini,inj,ink):normal()
+   local linear = nn.Linear(ink, out)
+   local module1 = nn.Bottle(linear)
+   local module2 = nn.Sequential()
+   module2:add(nn.View(ini*inj, ink))
+   module2:add(linear)
+   module2:add(nn.View(ini, inj, out))
+   local output1 = module1:forward(input)
+   local output2 = module2:forward(input)
+   mytester:eq(output1, output2, 0.0001, 'Bottle output not the same as Module')
+
+   local shape = {4, 5, 6, 7, 8, 1, 3}
+   local input = torch.Tensor(table.unpack(shape)):normal()
+   local module = nn.Sequential()
+   module:add(nn.Squeeze(2))
+   module:add(nn.Linear(3, 3))
+   local module1 = nn.Bottle(module, 3, 2)
+   local outShape = {4, 5, 6, 7, 8, 3}
+   local module2 = nn.Sequential()
+   module2:add(nn.View(4*5*6*7*8, 1, 3))
+   module2:add(module)
+   module2:add(nn.View(table.unpack(outShape)))
+   local output1 = module1:forward(input)
+   local grad = torch.Tensor(output1:size()):normal()
+   local gradOutput1 = module1:backward(input, grad):clone()
+   local output2 = module2:forward(input)
+   local gradOutput2 = module2:backward(input, grad):clone()
+   mytester:eq(output1, output2, 0.0001, 'Bottle output not the same as Module')
+   mytester:eq(gradOutput1, gradOutput2, 0.0001, 'Bottle gradOutput not the same as Module')
+end
+
 function nntest.CMul()
    local ini = math.random(3,5)
    local inj = math.random(3,5)
@@ -263,6 +299,23 @@ function nntest.ReLU()
    mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ReLU gradInput')
 end
 
+function nntest.ReLU6()
+   for inplace = 0, 1 do
+      local input = torch.randn(3, 4):mul(6)
+      local gradOutput = torch.randn(3,4)
+      local module = nn.ReLU6(inplace == 1)
+      local output = module:forward(input:clone())
+      local gt = input:clone():gt(input, 0)
+      local lt = input:clone():lt(input, 6)
+      local output2 = gt:clone():cmul(lt):cmul(input)
+      output2:add(6, input:clone():gt(input, 6))
+      mytester:assertTensorEq(output, output2, 0.000001, 'ReLU6 output '..(inplace and '(inplace)' or '') )
+      local gradInput = module:backward(input, gradOutput:clone())
+      local gradInput2 = gt:clone():cmul(lt):cmul(gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ReLU gradInput '..(inplace and '(inplace)' or '') )
+   end
+end
+
 function nntest.Exp()
    local ini = math.random(3,5)
    local inj = math.random(3,5)
@@ -1321,19 +1374,19 @@ function nntest.MarginRankingCriterion()
    mytester:assert(torch.type(gradInput2[2]) == 'torch.FloatTensor', "MRC:type() error 2")
 
    -- batch, sizeAverage true, jacobian
-   local margin = math.random()*2-1
-   local batch_size = math.random(2,10)
+   local margin = math.random() * 2 - 1
+   local batch_size = math.random(1,10)
    local crit = nn.MarginRankingCriterion(margin)
    crit.sizeAverage = true
-   local v = torch.rand(2,batch_size)
+   local v = torch.rand(2, batch_size)
    local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
    criterionJacobianTest1DTable(crit,v,t)
 
    -- batch, sizeAverage false, jacobian
-   local margin = math.random()*2-1
+   local margin = math.random() * 2 - 1
    local crit = nn.MarginRankingCriterion(margin)
    crit.sizeAverage = false
-   local v = torch.rand(2,batch_size)
+   local v = torch.rand(2, batch_size)
    local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
    criterionJacobianTest1DTable(crit,v,t)
 
@@ -1609,7 +1662,7 @@ function nntest.LogSoftmax()
    local ferr,berr = jac.testIO(module,input)
    mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
-   
+
    -- test logsoftmax when gradOutput is non-contiguous
    local layer = nn.LogSoftMax()
    layer:zeroGradParameters()
@@ -1622,13 +1675,13 @@ function nntest.LogSoftmax()
    gradOutput = gradOutput:clone()
    local gradInput2 = layer:backward(input, gradOutput):clone()
 
-   mytester:assertlt(gradInput1:add(-1, gradInput2):abs():max(), 
-		     1e-10, 
-		     torch.typename(layer) 
-			.. ' non-contiguous gradOutput check')
-   
-   
-   
+   mytester:assertlt(gradInput1:add(-1, gradInput2):abs():max(),
+           1e-10,
+           torch.typename(layer)
+         .. ' non-contiguous gradOutput check')
+
+
+
 
 end
 
@@ -2471,76 +2524,99 @@ function nntest.SpatialFullConvolution()
    local module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
    local input = torch.Tensor(from, inj, ini):zero()
 
-   -- stochastic
+   local function jacTests(module)
+      -- stochastic
 
-   local err = jac.testJacobian(module, input)
-   mytester:assertlt(err, precision, 'error on state ')
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error on state ')
 
-   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-   mytester:assertlt(err , precision, 'error on weight ')
+      local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+      mytester:assertlt(err , precision, 'error on weight ')
 
-   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
-   mytester:assertlt(err , precision, 'error on bias ')
+      if module.bias then
+         local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+         mytester:assertlt(err , precision, 'error on bias ')
+      end
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
-   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+      local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+      mytester:assertlt(err , precision, 'error on weight [direct update] ')
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
-   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+      if module.bias then
+         local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+         mytester:assertlt(err , precision, 'error on bias [direct update] ')
+      end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on weight [%s]', t))
-   end
+      for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+         mytester:assertlt(err, precision, string.format(
+                            'error on weight [%s]', t))
+      end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on bias [%s]', t))
-   end
+      if module.bias then
+         for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+            mytester:assertlt(err, precision, string.format(
+                               'error on bias [%s]', t))
+         end
+      end
 
-   -- batch
+      -- batch
 
-   --verbose = true
-   local batch = math.random(2,5)
+      --verbose = true
+      local batch = math.random(2,5)
 
-   module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
-   input = torch.Tensor(batch,from,inj,ini):zero()
+      module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+      input = torch.Tensor(batch,from,inj,ini):zero()
 
-   -- Check that the required output size matches the actual output size
-   local output = module:forward(input)
-   mytester:asserteq(output:size(3), outj, 'output height error')
-   mytester:asserteq(output:size(4), outi, 'output width error')
+      -- Check that the required output size matches the actual output size
+      local output = module:forward(input)
+      mytester:asserteq(output:size(3), outj, 'output height error')
+      mytester:asserteq(output:size(4), outi, 'output width error')
 
-   local err = jac.testJacobian(module, input)
-   mytester:assertlt(err, precision, 'batch error on state ')
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'batch error on state ')
 
-   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-   mytester:assertlt(err , precision, 'batch error on weight ')
+      local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+      mytester:assertlt(err , precision, 'batch error on weight ')
 
-   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
-   mytester:assertlt(err , precision, 'batch error on bias ')
+      if module.bias then
+         local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+         mytester:assertlt(err , precision, 'batch error on bias ')
+      end
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
-   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+      local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+      mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
-   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+      if module.bias then
+         local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+         mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+      end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on weight [%s]', t))
-   end
+      for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+         mytester:assertlt(err, precision, string.format(
+                            'error on weight [%s]', t))
+      end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
-      mytester:assertlt(err, precision, string.format(
-                         'batch error on bias [%s]', t))
+      if module.bias then
+         for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+            mytester:assertlt(err, precision, string.format(
+                               'batch error on bias [%s]', t))
+         end
+      end
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
    end
 
-   local ferr, berr = jac.testIO(module, input)
-   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
-   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+   jacTests(module)
+   module:noBias()
+   jacTests(module)
+   module.bias = torch.Tensor(module.nOutputPlane):zero()
+   module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+   module:reset()
+   jacTests(module)
 
    -- non-contiguous
+   local batch = math.random(2,5)
    local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
    local inputc = input:contiguous() -- contiguous
    local output = module:forward(input)
@@ -2604,8 +2680,8 @@ function nntest.SpatialDilatedConvolution()
    local padH = math.random(0,2)
    local outi = math.random(5,9)
    local outj = math.random(5,9)
-   local dilationW = math.random(0,10)
-   local dilationH = math.random(0,10)
+   local dilationW = math.random(1,10)
+   local dilationH = math.random(1,10)
    local ini = (outi - 1) * di - 2 * padW + dilationW * (ki-1) + 1
    local inj = (outj - 1) * dj - 2 * padH + dilationH * (kj-1) + 1
 
@@ -3134,6 +3210,49 @@ function nntest.SpatialMaxUnpooling()
   end
 end
 
+function nntest.SpatialDilatedMaxPooling()
+   for _,ceil_mode in pairs({true,false}) do -- exercise both rounding modes of the output-size computation
+      local from = math.random(1,5)
+      local ki = math.random(1,4)
+      local kj = math.random(1,4)
+      local si = math.random(1,3)
+      local sj = math.random(1,3)
+      local outi = math.random(4,5)
+      local outj = math.random(4,5)
+      local padW = math.min(math.random(0,1),math.floor(ki/2)) -- padding must not exceed half the kernel size
+      local padH =  math.min(math.random(0,1),math.floor(kj/2))
+      local dilationW = math.random(1,5)
+      local dilationH = math.random(1,5)
+      local ini = (outi-1)*si+(dilationW*(ki-1)+1)-2*padW -- input width that yields exactly outi output columns
+      local inj = (outj-1)*sj+(dilationH*(kj-1)+1)-2*padH -- input height that yields exactly outj output rows
+
+      local ceil_string = ceil_mode and 'ceil' or 'floor'
+      local module = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padW,padH,dilationW, dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+      local input = torch.rand(from,inj,ini)
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+      -- batch
+      local nbatch = math.random(2,5)
+      input = torch.rand(nbatch,from,inj,ini)
+      module = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padW,padH,dilationW,dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+  end
+end
+
 function nntest.SpatialFractionalMaxPooling()
     local batch = math.random(1, 3)
     local plane = math.random(1, 3)
@@ -3819,6 +3938,112 @@ function nntest.VolumetricConvolution()
    mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
 end
 
+function nntest.VolumetricDilatedConvolution()
+   local from = math.random(1,5)
+   local to = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local kk = math.random(1,5)
+   local di =  math.random(1,4)
+   local dj =  math.random(1,4)
+   local dk =  math.random(1,4)
+   local padW = 0 -- math.random(0,2)
+   local padH = 0 -- math.random(0,2)
+   local padT = 0 -- math.random(0,2)
+   local outi = math.random(2,3)
+   local outj = math.random(2,5)
+   local outk = math.random(2,5)
+   local dilationW = math.random(1,3)
+   local dilationH = math.random(1,3)
+   local dilationT = math.random(1,3)
+   local ini = (outi - 1) * di - 2 * padW + dilationW * (ki-1) + 1 -- input width giving exactly outi outputs
+   local inj = (outj - 1) * dj - 2 * padH + dilationH * (kj-1) + 1 -- input height giving exactly outj outputs
+   local ink = (outk - 1) * dk - 2 * padT + dilationT * (kk-1) + 1 -- input depth giving exactly outk outputs
+
+   local module = nn.VolumetricDilatedConvolution(from, to, kk, ki, kj, dk, di, dj, padT, padW, padH, dilationT, dilationW, dilationH)
+   local input = torch.Tensor(from, ink, inj, ini):zero()
+
+   -- stochastic
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on bias [%s]', t))
+   end
+
+   -- batch
+
+   --verbose = true
+   local batch = math.random(2,5)
+
+   module = nn.VolumetricDilatedConvolution(from, to, kk, ki, kj, dk, di, dj, padT, padW, padH, dilationT, dilationW, dilationH)
+   input = torch.Tensor(batch,from,ink,inj,ini):zero()
+
+   -- Check that the required output size matches the actual output size
+   local output = module:forward(input)
+   mytester:asserteq(output:size(3), outk, 'output depth error') -- size(3) is the depth/time dim, not width
+   mytester:asserteq(output:size(4), outj, 'output height error')
+   mytester:asserteq(output:size(5), outi, 'output width error')
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'batch error on state ')
+
+   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+   mytester:assertlt(err , precision, 'batch error on weight ')
+
+   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+   mytester:assertlt(err , precision, 'batch error on bias ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+   mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+   mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+      mytester:assertlt(err, precision, string.format(
+                         'error on weight [%s]', t))
+   end
+
+   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+      mytester:assertlt(err, precision, string.format(
+                         'batch error on bias [%s]', t))
+   end
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- non-contiguous
+   local input = torch.randn(batch,from,ink,ini,inj):transpose(4,5) -- non-contiguous
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input)
+   local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+   local gradInput = module:backward(input, output)
+   local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - contiguous err ')
+end
+
 function nntest.VolumetricConvolutionBatchCompare()
    local from = math.random(2,3)
    local to = math.random(2,3)
@@ -4339,6 +4564,16 @@ function nntest.Index()
     local gradOutput = torch.Tensor{{1, 2}, {1, 2}}
     local gradInput = net:backward(input, gradOutput)
     equal(gradInput[1], torch.Tensor{{2, 4}, {0, 0}}, "error in 2D backward pass")
+
+    -- test clearState
+    local m = nn.Index(1)
+    local tensor = torch.Tensor(10, 3)
+    local indices = torch.LongTensor{ 2,3,4}
+
+    m:clearState()
+    m:forward({tensor, indices})
+    m:backward({tensor,indices}, torch.rand(3,3))
+
 end
 
 function nntest.Squeeze()
@@ -4739,6 +4974,8 @@ function nntest.Select()
   mytester:asserteq(nn.Select(1,-1):forward(input)[1], 8, "negative index")
   mytester:asserteq(nn.Select(1,-1):forward(input)[2], 0, "negative index")
   mytester:asserteq(nn.Select(1,-2):forward(input)[2], 6, "negative index")
+  mytester:asserteq(nn.Select(-1,-1):forward(input)[1], 7, "negative dim + negative index")
+  mytester:asserteq(nn.Select(-1,-1):forward(input)[2], 1, "negative dim + negative index")
 end
 
 function nntest.SelectTable()
@@ -5018,6 +5255,23 @@ function nntest.Narrow()
    mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #3 gradInput err")
    mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #3 negative output err")
    mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #3 negative gradInput err")
+
+   -- check basic narrow functionality #4
+   local input = torch.rand(3, 10, 4)
+   local output = input:narrow(2, 5, 3)
+   local gradOutput = torch.rand(3, 3, 4)
+   local gradInput = torch.zeros(3, 10, 4)
+   gradInput:narrow(2, 5, 3):copy(gradOutput)
+   local module1 = nn.Narrow(-2, 5, 3)
+   local output1 = module1:forward(input)
+   local gradInput1 = module1:backward(input, gradOutput)
+   local module2 = nn.Narrow(-2, 5, -4)
+   local output2 = module2:forward(input)
+   local gradInput2 = module2:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #4 output err")
+   mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #4 gradInput err")
+   mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #4 negative output err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #4 negative gradInput err")
 end
 
 function nntest.NarrowTable()
@@ -5158,6 +5412,30 @@ function nntest.SpatialUpSamplingNearest()
   end
 end
 
+function nntest.SpatialUpSamplingBilinear()
+  for scale=2,4 do
+     for dim = 3,4 do -- 3D (CxHxW) and 4D (NxCxHxW) inputs
+       local m = nn.SpatialUpSamplingBilinear(scale)
+
+       -- Create a randomly sized dimD vector
+       local shape = {}
+       for i = 1, dim do
+         table.insert(shape, torch.random(2, 2+dim-1)) -- each side drawn from [2, dim+1]
+       end
+
+       -- Check that the gradient is correct by using finite elements
+       local input = torch.DoubleTensor(table.unpack(shape)):normal()
+
+       local err = jac.testJacobian(m, input)
+       mytester:assertlt(err, precision, ' error on state ')
+
+       local ferr, berr = jac.testIO(m, input)
+       mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+       mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+   end
+  end
+end
+
 function nntest.Concat()
    local input = torch.randn(4, 2)
    local num_modules = math.random(2, 5)
@@ -5939,6 +6217,7 @@ local function testBatchNormalization(moduleName, dim, k)
    jacTests(module, input, true)
    module:evaluate()
    jacTests(module, input, true)
+   jacTests(module, input[1], true)
 
    -- batch norm without affine transform
    module = nn[moduleName](planes, 1e-5, 0.1, false)
@@ -5946,6 +6225,7 @@ local function testBatchNormalization(moduleName, dim, k)
    jacTests(module, input, false)
    module:evaluate()
    jacTests(module, input, false)
+   jacTests(module, input[1], false)
 end
 
 function nntest.BatchNormalization()
@@ -6067,6 +6347,37 @@ function nntest.SpatialReplicationPadding()
    mytester:assertalmosteq(err, 0.0, 1e-7)
 end
 
+function nntest.VolumetricReplicationPadding()
+   for batch = 0, 1 do
+      local nbatch
+      if batch == 1 then
+         nbatch = math.random(1,3)
+      end
+      local plane = math.random(1,3)
+      local sizeZ = math.random(1,4)
+      local sizeY = math.random(7,11)
+      local sizeX = math.random(7,11)
+      local padLeft = math.random(-3,3)
+      local padRight = math.random(-3,3)
+      local padTop = math.random(-3,3)
+      local padBottom = math.random(-3,3) -- fixed typo: was declared 'padBotom', so the constructor got a nil global
+      local padFront = math.random(-3,3) -- fixed range: was math.random(3,3), a constant 3
+      local padBack = math.random(-3,3) -- fixed range: was math.random(3,3), a constant 3
+      local jac = nn.Jacobian
+      local layer =
+          nn.VolumetricReplicationPadding(padLeft, padRight, padTop,
+                                          padBottom, padFront, padBack)
+      local input
+      if batch == 1 then
+         input = torch.rand(nbatch, plane, sizeZ, sizeY, sizeX)
+      else
+         input = torch.rand(plane, sizeZ, sizeY, sizeX)
+      end
+      local err = jac.testJacobian(layer, input)
+      mytester:assertalmosteq(err, 0.0, 1e-7)
+   end
+end
+
 function nntest.Typecast()
   local function make_network()
     local seq = nn.Sequential()
@@ -6288,6 +6599,11 @@ function nntest.ErrorHandling()
    )
 end
 
+function nntest.GPU()
+   -- Intentionally empty: nn.GPU requires CUDA, so its real unit test
+   -- lives in the cunn package. This stub keeps the test list complete.
+end
+
 mytester:add(nntest)
 
 jac = nn.Jacobian

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git



More information about the debian-science-commits mailing list