[lasagne] 01/17: Imported Upstream version 0.1+git20160728.8b66737

Stephen Sinclair sinclairs-guest at moszumanska.debian.org
Thu Nov 2 23:24:38 UTC 2017


This is an automated email from the git hooks/post-receive script.

sinclairs-guest pushed a commit to branch master
in repository lasagne.

commit 14f47d8ad110401a0125fff50e4213c7d172fe97
Author: Daniel Stender <stender at debian.org>
Date:   Thu Aug 18 20:59:26 2016 +0200

    Imported Upstream version 0.1+git20160728.8b66737
---
 .coveragerc                                |    2 +
 .coveragerc-nogpu                          |    6 +
 .github/CONTRIBUTING.md                    |   16 +
 .github/ISSUE_TEMPLATE.md                  |   12 +
 .github/PULL_REQUEST_TEMPLATE.md           |   27 +
 .gitignore                                 |  102 ++
 .travis.yml                                |   25 +
 CHANGES.rst                                |   35 +
 LICENSE                                    |   28 +
 MANIFEST.in                                |   11 +
 README.rst                                 |  138 +++
 docs/Makefile                              |  177 ++++
 docs/_static/fix_rtd.css                   |    4 +
 docs/conf.py                               |  340 +++++++
 docs/index.rst                             |   50 +
 docs/modules/init.rst                      |   60 ++
 docs/modules/layers.rst                    |  212 ++++
 docs/modules/layers/base.rst               |   13 +
 docs/modules/layers/conv.rst               |   37 +
 docs/modules/layers/corrmm.rst             |    8 +
 docs/modules/layers/cuda_convnet.rst       |    8 +
 docs/modules/layers/dense.rst              |   13 +
 docs/modules/layers/dnn.rst                |   11 +
 docs/modules/layers/embedding.rst          |   10 +
 docs/modules/layers/helper.rst             |   15 +
 docs/modules/layers/input.rst              |   10 +
 docs/modules/layers/merge.rst              |   18 +
 docs/modules/layers/noise.rst              |   15 +
 docs/modules/layers/normalization.rst      |   15 +
 docs/modules/layers/pool.rst               |   34 +
 docs/modules/layers/recurrent.rst          |   22 +
 docs/modules/layers/shape.rst              |   29 +
 docs/modules/layers/special.rst            |   40 +
 docs/modules/nonlinearities.rst            |   38 +
 docs/modules/objectives.rst                |   27 +
 docs/modules/random.rst                    |    7 +
 docs/modules/regularization.rst            |   20 +
 docs/modules/updates.rst                   |   31 +
 docs/modules/utils.rst                     |   13 +
 docs/user/custom_layers.rst                |  159 +++
 docs/user/development.rst                  |  234 +++++
 docs/user/installation.rst                 |  249 +++++
 docs/user/layers.rst                       |  203 ++++
 docs/user/tutorial.rst                     |  620 ++++++++++++
 examples/mnist.py                          |  362 +++++++
 examples/recurrent.py                      |  171 ++++
 lasagne/__init__.py                        |   34 +
 lasagne/conftest.py                        |   12 +
 lasagne/init.py                            |  367 +++++++
 lasagne/layers/__init__.py                 |   13 +
 lasagne/layers/base.py                     |  328 ++++++
 lasagne/layers/conv.py                     |  934 ++++++++++++++++++
 lasagne/layers/corrmm.py                   |  147 +++
 lasagne/layers/cuda_convnet.py             |  634 ++++++++++++
 lasagne/layers/dense.py                    |  192 ++++
 lasagne/layers/dnn.py                      |  593 +++++++++++
 lasagne/layers/embedding.py                |   69 ++
 lasagne/layers/helper.py                   |  520 ++++++++++
 lasagne/layers/input.py                    |   75 ++
 lasagne/layers/merge.py                    |  403 ++++++++
 lasagne/layers/noise.py                    |  136 +++
 lasagne/layers/normalization.py            |  375 +++++++
 lasagne/layers/pool.py                     |  639 ++++++++++++
 lasagne/layers/recurrent.py                | 1480 ++++++++++++++++++++++++++++
 lasagne/layers/shape.py                    |  397 ++++++++
 lasagne/layers/special.py                  | 1155 ++++++++++++++++++++++
 lasagne/nonlinearities.py                  |  305 ++++++
 lasagne/objectives.py                      |  379 +++++++
 lasagne/random.py                          |   36 +
 lasagne/regularization.py                  |  189 ++++
 lasagne/tests/conftest.py                  |   10 +
 lasagne/tests/layers/conftest.py           |   13 +
 lasagne/tests/layers/test_base.py          |  180 ++++
 lasagne/tests/layers/test_conv.py          |  781 +++++++++++++++
 lasagne/tests/layers/test_dense.py         |  361 +++++++
 lasagne/tests/layers/test_embedding.py     |   56 ++
 lasagne/tests/layers/test_helper.py        |  791 +++++++++++++++
 lasagne/tests/layers/test_input.py         |   41 +
 lasagne/tests/layers/test_merge.py         |  256 +++++
 lasagne/tests/layers/test_noise.py         |  127 +++
 lasagne/tests/layers/test_normalization.py |  327 ++++++
 lasagne/tests/layers/test_pool.py          |  905 +++++++++++++++++
 lasagne/tests/layers/test_recurrent.py     | 1101 +++++++++++++++++++++
 lasagne/tests/layers/test_shape.py         |  291 ++++++
 lasagne/tests/layers/test_special.py       |  793 +++++++++++++++
 lasagne/tests/test_examples.py             |   38 +
 lasagne/tests/test_init.py                 |  351 +++++++
 lasagne/tests/test_nonlinearities.py       |   69 ++
 lasagne/tests/test_objectives.py           |  236 +++++
 lasagne/tests/test_regularization.py       |   99 ++
 lasagne/tests/test_theano_extensions.py    |  155 +++
 lasagne/tests/test_updates.py              |  227 +++++
 lasagne/tests/test_utils.py                |  308 ++++++
 lasagne/theano_extensions/__init__.py      |    0
 lasagne/theano_extensions/conv.py          |  273 +++++
 lasagne/theano_extensions/padding.py       |   53 +
 lasagne/updates.py                         |  819 +++++++++++++++
 lasagne/utils.py                           |  450 +++++++++
 requirements-dev.txt                       |   10 +
 requirements.txt                           |    1 +
 setup.cfg                                  |   10 +
 setup.py                                   |   67 ++
 102 files changed, 22288 insertions(+)

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..3aa94fb
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = lasagne/tests/*
diff --git a/.coveragerc-nogpu b/.coveragerc-nogpu
new file mode 100644
index 0000000..6feb141
--- /dev/null
+++ b/.coveragerc-nogpu
@@ -0,0 +1,6 @@
+[run]
+omit =
+    lasagne/tests/*
+    lasagne/layers/corrmm.py
+    lasagne/layers/cuda_convnet.py
+    lasagne/layers/dnn.py
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000..ffffa7a
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,16 @@
+- **If you have a question or need help using Lasagne**, please post on [our mailing list](https://groups.google.com/forum/#!forum/lasagne-users) instead of creating an issue. Make sure to check the [Lasagne documentation](http://lasagne.readthedocs.org/en/latest/) and the [Theano documentation](http://deeplearning.net/software/theano/) first! You can search the mailing list as well to see if your question has come up before.
+
+- **If you would like to report a bug**, feel free to open an issue. Please verify first that the problem is not in your own code by reviewing the documentation. If you are able to provide a minimal code example that reproduces the bug, this will greatly speed up the process of tracking down the problem.
+
+- **If you would like to contribute**, feel free to open a pull request. Please review our documentation on [what to contribute](http://lasagne.readthedocs.org/en/latest/user/development.html#what-to-contribute) and [how to contribute](http://lasagne.readthedocs.org/en/latest/user/development.html#how-to-contribute). Some contributions may be better suited for our [Recipes repository](https://github.com/Lasagne/Recipes), where we collect examples, tutorials, trained models, utilities and [...]
+
+Links
+-----
+
+- Mailing list: https://groups.google.com/forum/#!forum/lasagne-users
+- Lasagne documentation: http://lasagne.readthedocs.org/en/latest/
+- Theano documentation: http://deeplearning.net/software/theano/
+
+- What to contribute: http://lasagne.readthedocs.org/en/latest/user/development.html#what-to-contribute
+- How to contribute: http://lasagne.readthedocs.org/en/latest/user/development.html#how-to-contribute
+- Recipes repository: https://github.com/Lasagne/Recipes
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000..3034f40
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,12 @@
+Before submitting your issue, please check these hints!
+
+- **If you have a usage question**, please post on [our mailing list](https://groups.google.com/forum/#!forum/lasagne-users) instead of creating an issue.
+  Make sure to check the [Lasagne documentation](http://lasagne.readthedocs.org/en/latest/) and the [Theano documentation](http://deeplearning.net/software/theano/) first!
+  You can search the mailing list as well to see if your question has come up before.
+
+- **If you suspect you have found a bug**, please first try [updating to the bleeding-edge versions of Theano and Lasagne](http://lasagne.readthedocs.io/en/latest/user/installation.html#bleeding-edge-version). It may have been fixed already.
+  If you are not sure whether the problem lies within your code, Theano, or Lasagne, first post on [our mailing list](https://groups.google.com/forum/#!forum/lasagne-users).
+  In any case, try to provide a minimal code example that reproduces the bug; this will greatly speed up the process of tracking down the problem.
+
+- **If you have a feature request or idea**, please include a clear description of the use case(s) it would enable, referencing research papers if applicable, and indicate whether you would be willing to implement the feature yourself.
+  We are happy to discuss your suggestion, help refine it, and decide upfront whether it would fit the main library or our [Lasagne/Recipes](https://github.com/Lasagne/Recipes) repository.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..194342c
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,27 @@
+Before submitting your pull request, please check these hints!
+
+- If you are not familiar with the github workflow, have a look:
+  https://guides.github.com/introduction/flow/
+  In particular, note that in order to update your pull request to include any
+  changes we asked for, you just need to push to your branch again.
+- If your pull request addresses a particular issue from our issue tracker,
+  reference it in your pull request description on github (not the commit
+  message) using the syntax `Closes #123` or `Fixes #123`.
+  
+Pull request check list:
+
+- Install Lasagne in editable mode to be able to run tests locally:
+  http://lasagne.readthedocs.io/en/latest/user/development.html#development-setup
+- Make sure PEP8 is followed:
+  `python -m pep8 lasagne/`
+- Make sure the test suite runs through:
+  `python -m py.test`
+  (or, to only run tests that include the substring `foo` in their name:
+  `python -m py.test -k foo`)
+- At the end of the test run output, check if coverage is at 100%. If not (or
+  not for the files you changed), you will need to add tests covering the code
+  you added.
+- It is fine to submit a PR without tests to get initial feedback on the
+  implementation, but we cannot merge it without tests.
+- If you added/changed any documentation, verify that it renders correctly:
+  http://lasagne.readthedocs.io/en/latest/user/development.html#documentation
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b0f5aff
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,102 @@
+# Virtualenv
+bin/
+include/
+src/
+
+# Dataset used in examples
+train-images-idx3-ubyte.gz
+train-labels-idx1-ubyte.gz
+t10k-images-idx3-ubyte.gz
+t10k-labels-idx1-ubyte.gz
+# Dataset used in earlier versions
+mnist.pkl.gz
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+### vim ###
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+*~
+
+
+### OSX ###
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# Directories from IDE
+.idea
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..a2698fe
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,25 @@
+language: python
+sudo: false
+python:
+  - "2.7"
+  - "3.4"
+addons:
+  apt:
+    packages:
+    - libblas-dev
+    - liblapack-dev
+    - gfortran
+before_install:
+  - pip install -U pip
+install:
+  - travis_wait travis_retry pip install -r requirements-dev.txt
+  - travis_retry pip install python-coveralls
+  - travis_retry python setup.py dev
+script: py.test --runslow --cov-config=.coveragerc-nogpu
+after_success:
+  - coveralls
+cache:
+  - apt
+  - directories:
+    - $HOME/.cache/pip
+    - $HOME/.theano
diff --git a/CHANGES.rst b/CHANGES.rst
new file mode 100644
index 0000000..8ad9658
--- /dev/null
+++ b/CHANGES.rst
@@ -0,0 +1,35 @@
+Changelog
+---------
+
+0.1 (2015-08-13)
+~~~~~~~~~~~~~~~~
+
+First release.
+
+* core contributors, in alphabetical order:
+
+  * Eric Battenberg (@ebattenberg)
+  * Sander Dieleman (@benanne)
+  * Daniel Nouri (@dnouri)
+  * Eben Olson (@ebenolson)
+  * Aäron van den Oord (@avdnoord)
+  * Colin Raffel (@craffel)
+  * Jan Schlüter (@f0k)
+  * Søren Kaae Sønderby (@skaae)
+
+* extra contributors, in chronological order:
+
+  * Daniel Maturana (@dimatura): documentation, cuDNN layers, LRN
+  * Jonas Degrave (@317070): get_all_param_values() fix
+  * Jack Kelly (@JackKelly): help with recurrent layers
+  * Gábor Takács (@takacsg84): support broadcastable parameters in lasagne.updates
+  * Diogo Moitinho de Almeida (@diogo149): MNIST example fixes
+  * Brian McFee (@bmcfee): MaxPool2DLayer fix
+  * Martin Thoma (@MartinThoma): documentation
+  * Jeffrey De Fauw (@JeffreyDF): documentation, ADAM fix
+  * Michael Heilman (@mheilman): NonlinearityLayer, lasagne.random
+  * Gregory Sanders (@instagibbs): documentation fix
+  * Jon Crall (@erotemic): check for non-positive input shapes
+  * Hendrik Weideman (@hjweide): set_all_param_values() test, MaxPool2DCCLayer fix
+  * Kashif Rasul (@kashif): ADAM simplification
+  * Peter de Rivaz (@peterderivaz): documentation fix
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..940a5d0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,28 @@
+The MIT License (MIT)
+
+Copyright (c) 2014-2015 Lasagne contributors
+
+Lasagne uses a shared copyright model: each contributor holds copyright over
+their contributions to Lasagne. The project versioning records all such
+contribution and copyright details.
+By contributing to the Lasagne repository through pull-request, comment,
+or otherwise, the contributor releases their content to the license and
+copyright terms herein.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..044ffc6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,11 @@
+include *.rst
+include *.txt
+include LICENSE
+
+recursive-include lasagne/tests *.py
+include .coveragerc
+recursive-include examples *.py
+recursive-include docs *.rst conf.py *.css Makefile
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..a1846e1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,138 @@
+.. image:: https://readthedocs.org/projects/lasagne/badge/
+    :target: http://lasagne.readthedocs.org/en/latest/
+
+.. image:: https://travis-ci.org/Lasagne/Lasagne.svg
+    :target: https://travis-ci.org/Lasagne/Lasagne
+
+.. image:: https://img.shields.io/coveralls/Lasagne/Lasagne.svg
+    :target: https://coveralls.io/r/Lasagne/Lasagne
+
+.. image:: https://img.shields.io/badge/license-MIT-blue.svg
+    :target: https://github.com/Lasagne/Lasagne/blob/master/LICENSE
+
+.. image:: https://zenodo.org/badge/16974/Lasagne/Lasagne.svg
+   :target: https://zenodo.org/badge/latestdoi/16974/Lasagne/Lasagne
+
+Lasagne
+=======
+
+Lasagne is a lightweight library to build and train neural networks in Theano.
+Its main features are:
+
+* Supports feed-forward networks such as Convolutional Neural Networks (CNNs),
+  recurrent networks including Long Short-Term Memory (LSTM), and any
+  combination thereof
+* Allows architectures with multiple inputs and multiple outputs, including
+  auxiliary classifiers
+* Many optimization methods including Nesterov momentum, RMSprop and ADAM
+* Freely definable cost function and no need to derive gradients due to
+  Theano's symbolic differentiation
+* Transparent support of CPUs and GPUs due to Theano's expression compiler
+
+Its design is governed by `six principles
+<http://lasagne.readthedocs.org/en/latest/user/development.html#philosophy>`_:
+
+* Simplicity: Be easy to use, easy to understand and easy to extend, to
+  facilitate use in research
+* Transparency: Do not hide Theano behind abstractions, directly process and
+  return Theano expressions or Python / numpy data types
+* Modularity: Allow all parts (layers, regularizers, optimizers, ...) to be
+  used independently of Lasagne
+* Pragmatism: Make common use cases easy, do not overrate uncommon cases
+* Restraint: Do not obstruct users with features they decide not to use
+* Focus: "Do one thing and do it well"
+
+
+Installation
+------------
+
+In short, you can install a known compatible version of Theano and the latest
+Lasagne development version via:
+
+.. code-block:: bash
+
+  pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/master/requirements.txt
+  pip install https://github.com/Lasagne/Lasagne/archive/master.zip
+
+For more details and alternatives, please see the `Installation instructions
+<http://lasagne.readthedocs.org/en/latest/user/installation.html>`_.
+
+
+Documentation
+-------------
+
+Documentation is available online: http://lasagne.readthedocs.org/
+
+For support, please refer to the `lasagne-users mailing list
+<https://groups.google.com/forum/#!forum/lasagne-users>`_.
+
+
+Example
+-------
+
+.. code-block:: python
+
+  import lasagne
+  import theano
+  import theano.tensor as T
+
+  # create Theano variables for input and target minibatch
+  input_var = T.tensor4('X')
+  target_var = T.ivector('y')
+
+  # create a small convolutional neural network
+  from lasagne.nonlinearities import leaky_rectify, softmax
+  network = lasagne.layers.InputLayer((None, 3, 32, 32), input_var)
+  network = lasagne.layers.Conv2DLayer(network, 64, (3, 3),
+                                       nonlinearity=leaky_rectify)
+  network = lasagne.layers.Conv2DLayer(network, 32, (3, 3),
+                                       nonlinearity=leaky_rectify)
+  network = lasagne.layers.Pool2DLayer(network, (3, 3), stride=2, mode='max')
+  network = lasagne.layers.DenseLayer(lasagne.layers.dropout(network, 0.5),
+                                      128, nonlinearity=leaky_rectify,
+                                      W=lasagne.init.Orthogonal())
+  network = lasagne.layers.DenseLayer(lasagne.layers.dropout(network, 0.5),
+                                      10, nonlinearity=softmax)
+
+  # create loss function
+  prediction = lasagne.layers.get_output(network)
+  loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
+  loss = loss.mean() + 1e-4 * lasagne.regularization.regularize_network_params(
+          network, lasagne.regularization.l2)
+
+  # create parameter update expressions
+  params = lasagne.layers.get_all_params(network, trainable=True)
+  updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01,
+                                              momentum=0.9)
+
+  # compile training function that updates parameters and returns training loss
+  train_fn = theano.function([input_var, target_var], loss, updates=updates)
+
+  # train network (assuming you've got some training data in numpy arrays)
+  for epoch in range(100):
+      loss = 0
+      for input_batch, target_batch in training_data:
+          loss += train_fn(input_batch, target_batch)
+      print("Epoch %d: Loss %g" % (epoch + 1, loss / len(training_data)))
+
+  # use trained network for predictions
+  test_prediction = lasagne.layers.get_output(network, deterministic=True)
+  predict_fn = theano.function([input_var], T.argmax(test_prediction, axis=1))
+  print("Predicted class for first test input: %r" % predict_fn(test_data[0]))
+
+For a fully-functional example, see `examples/mnist.py <examples/mnist.py>`_,
+and check the `Tutorial
+<http://lasagne.readthedocs.org/en/latest/user/tutorial.html>`_ for in-depth
+explanations of the same. More examples, code snippets and reproductions of
+recent research papers are maintained in the separate `Lasagne Recipes
+<https://github.com/Lasagne/Recipes>`_ repository.
+
+
+Development
+-----------
+
+Lasagne is a work in progress; input is welcome.
+
+Please see the `Contribution instructions
+<http://lasagne.readthedocs.org/en/latest/user/development.html>`_ for details
+on how you can contribute!
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..a454085
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/lasagne.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/lasagne.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/lasagne"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/lasagne"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/_static/fix_rtd.css b/docs/_static/fix_rtd.css
new file mode 100644
index 0000000..11e85fa
--- /dev/null
+++ b/docs/_static/fix_rtd.css
@@ -0,0 +1,4 @@
+/* work around https://github.com/snide/sphinx_rtd_theme/issues/149 */
+.rst-content table.field-list .field-body {
+    padding-top: 8px;
+}
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..a641f9d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,340 @@
+# -*- coding: utf-8 -*-
+#
+# Lasagne documentation build configuration file, created by
+# sphinx-quickstart on Sat Nov  8 11:00:12 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.autosummary',
+    'sphinx.ext.doctest',
+    'sphinx.ext.mathjax',
+#    'sphinx.ext.viewcode',  # create HTML file of source code and link to it
+    'sphinx.ext.linkcode',  # link to github, see linkcode_resolve() below
+    'numpydoc',
+#    'sphinx.ext.napoleon',  # alternative to numpydoc -- looks a bit worse.
+]
+
+# See https://github.com/rtfd/readthedocs.org/issues/283
+mathjax_path = ('https://cdn.mathjax.org/mathjax/latest/MathJax.js?'
+                'config=TeX-AMS-MML_HTMLorMML')
+
+# see http://stackoverflow.com/q/12206334/562769
+numpydoc_show_class_members = False
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Lasagne'
+copyright = u'2014–2015, Lasagne contributors'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+import lasagne
+# The short X.Y version.
+version = '.'.join(lasagne.__version__.split('.', 2)[:2])
+# The full version, including alpha/beta/rc tags.
+release = lasagne.__version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+# language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+# today = ''
+# Else, today_fmt is used as the format for a strftime call.
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# Resolve function for the linkcode extension.
+def linkcode_resolve(domain, info):
+    def find_source():
+        # try to find the file and line number, based on code from numpy:
+        # https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L286
+        obj = sys.modules[info['module']]
+        for part in info['fullname'].split('.'):
+            obj = getattr(obj, part)
+        import inspect
+        import os
+        fn = inspect.getsourcefile(obj)
+        fn = os.path.relpath(fn, start=os.path.dirname(lasagne.__file__))
+        source, lineno = inspect.getsourcelines(obj)
+        return fn, lineno, lineno + len(source) - 1
+
+    if domain != 'py' or not info['module']:
+        return None
+    try:
+        filename = 'lasagne/%s#L%d-L%d' % find_source()
+    except Exception:
+        filename = info['module'].replace('.', '/') + '.py'
+    tag = 'master' if 'dev' in release else ('v' + release)
+    return "https://github.com/Lasagne/Lasagne/blob/%s/%s" % (tag, filename)
+
+
+# -- Options for HTML output ----------------------------------------------
+
+## Classic Python style:
+#html_theme = 'classic'
+#html_theme_options = {
+#    'stickysidebar': True,
+#}
+
+## Read the docs style:
+if os.environ.get('READTHEDOCS') != 'True':
+    try:
+        import sphinx_rtd_theme
+    except ImportError:
+        pass  # assume we have sphinx >= 1.3
+    else:
+        html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+    html_theme = 'sphinx_rtd_theme'
+def setup(app):
+    app.add_stylesheet("fix_rtd.css")
+
+## Bootstrap style:
+#import sphinx_bootstrap_theme
+#html_theme = 'bootstrap'
+#html_theme_options = {
+#    'bootswatch_theme': 'cosmo',  # see http://bootswatch.com/ for more
+#    'bootstrap_version': '3',
+#    'navbar_title': 'Lasagne',
+#    'source_link_position': 'footer',
+#}
+#html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+# html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+# html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+# html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+# html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+# html_domain_indices = True
+
+# If false, no index is generated.
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'lasagnedoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    # 'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    ('index', 'lasagne.tex', u'lasagne Documentation',
+     u'Lasagne contributors', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+# latex_appendices = []
+
+# If false, no module index is generated.
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'lasagne', u'Lasagne Documentation',
+     [u'Lasagne contributors'], 1)
+]
+
+# If true, show URL addresses after external links.
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    ('index', 'lasagne', u'Lasagne Documentation',
+     u'Lasagne contributors', 'Lasagne',
+     'One line description of project.', 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+# texinfo_no_detailmenu = False
+
+
+# fool rtd into thinking a GPU is available, so all modules are importable
+try:
+    from unittest.mock import Mock
+except ImportError:
+    from mock import Mock
+
+import theano
+import theano.sandbox.cuda
+
+theano.config = Mock(device='gpu')
+theano.sandbox.cuda.cuda_enabled = True
+theano.sandbox.cuda.dnn = Mock(dnn_available=lambda: True)
+
+import sys
+
+sys.modules['pylearn2'] = Mock()
+sys.modules['pylearn2.sandbox'] = Mock()
+sys.modules['pylearn2.sandbox.cuda_convnet'] = Mock()
+sys.modules['pylearn2.sandbox.cuda_convnet.filter_acts'] = \
+    Mock(FilterActs=None)
+
+sys.modules['theano.sandbox.cuda.blas'] = Mock(GpuCorrMM=None)
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..d975985
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,50 @@
+Welcome to Lasagne
+==================
+
+Lasagne is a lightweight library to build and train neural networks in Theano.
+
+Lasagne is a work in progress; input is welcome. The available documentation is
+limited for now. The project is on `GitHub`_.
+
+User Guide
+------------
+
+The Lasagne user guide explains how to install Lasagne, how to build and train
+neural networks using Lasagne, and how to contribute to the library as a
+developer.
+
+.. toctree::
+  :maxdepth: 2
+
+  user/installation
+  user/tutorial
+  user/layers
+  user/custom_layers
+  user/development
+
+API Reference
+-------------
+
+If you are looking for information on a specific function, class or
+method, this part of the documentation is for you.
+
+.. toctree::
+  :maxdepth: 2
+
+  modules/layers
+  modules/updates
+  modules/init
+  modules/nonlinearities
+  modules/objectives
+  modules/regularization
+  modules/random
+  modules/utils
+
+Indices and tables
+------------------
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
+.. _GitHub: https://github.com/Lasagne/Lasagne
\ No newline at end of file
diff --git a/docs/modules/init.rst b/docs/modules/init.rst
new file mode 100644
index 0000000..45a6a90
--- /dev/null
+++ b/docs/modules/init.rst
@@ -0,0 +1,60 @@
+:mod:`lasagne.init`
+===================
+
+.. automodule:: lasagne.init
+
+Initializers
+------------
+
+.. autosummary::
+
+   Constant
+   Normal
+   Uniform
+   Glorot
+   GlorotNormal
+   GlorotUniform
+   He
+   HeNormal
+   HeUniform
+   Orthogonal
+   Sparse
+
+Detailed description
+--------------------
+
+.. autoclass:: Initializer
+   :members:
+
+.. autoclass:: Constant
+   :members:
+
+.. autoclass:: Normal
+   :members:
+
+.. autoclass:: Uniform
+   :members:
+
+.. autoclass:: Glorot
+   :members:
+
+.. autoclass:: GlorotNormal
+   :members:
+
+.. autoclass:: GlorotUniform
+   :members:
+
+.. autoclass:: He
+   :members:
+
+.. autoclass:: HeNormal
+   :members:
+
+.. autoclass:: HeUniform
+   :members:
+
+.. autoclass:: Orthogonal
+   :members:
+
+.. autoclass:: Sparse
+   :members:
diff --git a/docs/modules/layers.rst b/docs/modules/layers.rst
new file mode 100644
index 0000000..9aa3e8d
--- /dev/null
+++ b/docs/modules/layers.rst
@@ -0,0 +1,212 @@
+:mod:`lasagne.layers`
+=====================
+
+.. automodule:: lasagne.layers
+
+.. toctree::
+    :hidden:
+
+    layers/helper
+    layers/base
+    layers/input
+    layers/dense
+    layers/conv
+    layers/pool
+    layers/recurrent
+    layers/noise
+    layers/shape
+    layers/merge
+    layers/normalization
+    layers/embedding
+    layers/special
+    layers/corrmm
+    layers/cuda_convnet
+    layers/dnn
+   
+
+.. rubric:: :doc:`layers/helper`
+
+.. autosummary::
+    :nosignatures:
+
+    get_output
+    get_output_shape
+    get_all_layers
+    get_all_params
+    count_params
+    get_all_param_values
+    set_all_param_values
+
+
+.. rubric:: :doc:`layers/base`
+
+.. autosummary::
+    :nosignatures:
+
+    Layer
+    MergeLayer
+
+
+.. rubric:: :doc:`layers/input`
+
+.. autosummary::
+    :nosignatures:
+
+    InputLayer
+
+
+.. rubric:: :doc:`layers/dense`
+
+.. autosummary::
+    :nosignatures:
+
+    DenseLayer
+    NINLayer
+
+
+.. rubric:: :doc:`layers/conv`
+
+.. autosummary::
+    :nosignatures:
+
+    Conv1DLayer
+    Conv2DLayer
+    TransposedConv2DLayer
+    Deconv2DLayer
+    DilatedConv2DLayer
+
+
+.. rubric:: :doc:`layers/pool`
+
+.. autosummary::
+    :nosignatures:
+
+    MaxPool1DLayer
+    MaxPool2DLayer
+    Pool1DLayer
+    Pool2DLayer
+    Upscale1DLayer
+    Upscale2DLayer
+    GlobalPoolLayer
+    FeaturePoolLayer
+    FeatureWTALayer
+
+
+.. rubric:: :doc:`layers/recurrent`
+
+.. autosummary::
+    :nosignatures:
+
+    CustomRecurrentLayer
+    RecurrentLayer
+    LSTMLayer
+    GRULayer
+    Gate
+
+
+.. rubric:: :doc:`layers/noise`
+
+.. autosummary::
+    :nosignatures:
+
+    DropoutLayer
+    dropout
+    GaussianNoiseLayer
+
+
+.. rubric:: :doc:`layers/shape`
+
+.. autosummary::
+    :nosignatures:
+
+    ReshapeLayer
+    reshape
+    FlattenLayer
+    flatten
+    DimshuffleLayer
+    dimshuffle
+    PadLayer
+    pad
+    SliceLayer
+
+
+.. rubric:: :doc:`layers/merge`
+
+.. autosummary::
+    :nosignatures:
+
+    ConcatLayer
+    concat
+    ElemwiseMergeLayer
+    ElemwiseSumLayer
+
+
+.. rubric:: :doc:`layers/normalization`
+
+.. autosummary::
+    :nosignatures:
+
+    LocalResponseNormalization2DLayer
+    BatchNormLayer
+    batch_norm
+
+
+.. rubric:: :doc:`layers/embedding`
+
+.. autosummary::
+    :nosignatures:
+
+    EmbeddingLayer
+
+
+.. rubric:: :doc:`layers/special`
+
+.. autosummary::
+    :nosignatures:
+
+    NonlinearityLayer
+    BiasLayer
+    ExpressionLayer
+    InverseLayer
+    TransformerLayer
+    ParametricRectifierLayer
+    prelu
+    RandomizedRectifierLayer
+    rrelu
+
+
+.. rubric:: :doc:`layers/corrmm`
+
+.. autosummary::
+    :nosignatures:
+
+    corrmm.Conv2DMMLayer
+
+
+.. rubric:: :doc:`layers/cuda_convnet`
+
+.. autosummary::
+    :nosignatures:
+
+    cuda_convnet.Conv2DCCLayer
+    cuda_convnet.MaxPool2DCCLayer
+    cuda_convnet.ShuffleBC01ToC01BLayer
+    cuda_convnet.bc01_to_c01b
+    cuda_convnet.ShuffleC01BToBC01Layer
+    cuda_convnet.c01b_to_bc01
+    cuda_convnet.NINLayer_c01b
+
+
+.. rubric:: :doc:`layers/dnn`
+
+.. autosummary::
+    :nosignatures:
+
+    dnn.Conv2DDNNLayer
+    dnn.Conv3DDNNLayer
+    dnn.MaxPool2DDNNLayer
+    dnn.Pool2DDNNLayer
+    dnn.MaxPool3DDNNLayer
+    dnn.Pool3DDNNLayer
+    dnn.SpatialPyramidPoolingDNNLayer
+
diff --git a/docs/modules/layers/base.rst b/docs/modules/layers/base.rst
new file mode 100644
index 0000000..55c1869
--- /dev/null
+++ b/docs/modules/layers/base.rst
@@ -0,0 +1,13 @@
+Layer base classes
+------------------
+
+.. automodule:: lasagne.layers.base
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: Layer
+   :members:
+
+.. autoclass:: MergeLayer
+    :members:
+
diff --git a/docs/modules/layers/conv.rst b/docs/modules/layers/conv.rst
new file mode 100644
index 0000000..9938d2b
--- /dev/null
+++ b/docs/modules/layers/conv.rst
@@ -0,0 +1,37 @@
+Convolutional layers
+--------------------
+
+.. automodule:: lasagne.layers.conv
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: Conv1DLayer
+    :members:
+
+.. autoclass:: Conv2DLayer
+    :members:
+
+.. note::
+    For experts: ``Conv2DLayer`` will create a convolutional layer using
+    ``T.nnet.conv2d``, Theano's default convolution. On compilation for GPU,
+    Theano replaces this with a `cuDNN`_-based implementation if available,
+    otherwise falls back to a gemm-based implementation. For details on this,
+    please see the `Theano convolution documentation`_.
+
+    Lasagne also provides convolutional layers directly enforcing a specific
+    implementation: :class:`lasagne.layers.dnn.Conv2DDNNLayer` to enforce
+    cuDNN, :class:`lasagne.layers.corrmm.Conv2DMMLayer` to enforce the
+    gemm-based one, :class:`lasagne.layers.cuda_convnet.Conv2DCCLayer` for
+    Krizhevsky's `cuda-convnet`_.
+
+.. _cuda-convnet: https://code.google.com/p/cuda-convnet/
+.. _cuDNN: https://developer.nvidia.com/cudnn
+.. _Theano convolution documentation: http://deeplearning.net/software/theano/library/tensor/nnet/conv.html
+
+.. autoclass:: TransposedConv2DLayer
+    :members:
+
+.. autoclass:: Deconv2DLayer
+
+.. autoclass:: DilatedConv2DLayer
+    :members:
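
A minimal sketch of the behaviour described in the note above, assuming Theano and Lasagne are installed: the default ``Conv2DLayer`` is built as usual, and Theano substitutes a cuDNN- or gemm-based kernel at GPU compile time. The shapes and names below are illustrative only and are not part of the imported upstream sources.

.. code-block:: python

  import theano
  import theano.tensor as T
  import lasagne

  # symbolic input: a minibatch of 3-channel 32x32 images
  input_var = T.tensor4('X')
  l_in = lasagne.layers.InputLayer((None, 3, 32, 32), input_var)

  # default convolution (T.nnet.conv2d); on GPU compilation Theano may
  # replace it with a cuDNN- or gemm-based implementation automatically
  l_conv = lasagne.layers.Conv2DLayer(
      l_in, num_filters=32, filter_size=(3, 3),
      nonlinearity=lasagne.nonlinearities.rectify)

  # compile a forward pass through the layer
  forward = theano.function([input_var], lasagne.layers.get_output(l_conv))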
diff --git a/docs/modules/layers/corrmm.rst b/docs/modules/layers/corrmm.rst
new file mode 100644
index 0000000..83fa795
--- /dev/null
+++ b/docs/modules/layers/corrmm.rst
@@ -0,0 +1,8 @@
+:mod:`lasagne.layers.corrmm`
+----------------------------
+
+This module houses layers that require a GPU to work. Its layers are not automatically imported into the :mod:`lasagne.layers` namespace: To use these layers, you need to ``import lasagne.layers.corrmm`` explicitly.
+
+.. automodule:: lasagne.layers.corrmm
+    :members:
+
diff --git a/docs/modules/layers/cuda_convnet.rst b/docs/modules/layers/cuda_convnet.rst
new file mode 100644
index 0000000..4e69a6f
--- /dev/null
+++ b/docs/modules/layers/cuda_convnet.rst
@@ -0,0 +1,8 @@
+:mod:`lasagne.layers.cuda_convnet`
+----------------------------------
+
+This module houses layers that require `pylearn2 <https://deeplearning.net/software/pylearn2>`_ to work. Its layers are not automatically imported into the :mod:`lasagne.layers` namespace: To use these layers, you need to ``import lasagne.layers.cuda_convnet`` explicitly.
+
+.. automodule:: lasagne.layers.cuda_convnet
+    :members:
+
diff --git a/docs/modules/layers/dense.rst b/docs/modules/layers/dense.rst
new file mode 100644
index 0000000..2f2fa97
--- /dev/null
+++ b/docs/modules/layers/dense.rst
@@ -0,0 +1,13 @@
+Dense layers
+------------
+
+.. automodule:: lasagne.layers.dense
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: DenseLayer
+   :members:
+
+.. autoclass:: NINLayer
+    :members:
+
diff --git a/docs/modules/layers/dnn.rst b/docs/modules/layers/dnn.rst
new file mode 100644
index 0000000..167abc5
--- /dev/null
+++ b/docs/modules/layers/dnn.rst
@@ -0,0 +1,11 @@
+:mod:`lasagne.layers.dnn`
+-------------------------
+
+This module houses layers that require `cuDNN <https://developer.nvidia.com/cudnn>`_ to work. Its layers are not automatically imported into the :mod:`lasagne.layers` namespace: To use these layers, you need to ``import lasagne.layers.dnn`` explicitly.
+
+Note that these layers are not required to use cuDNN: If cuDNN is available, Theano will use it for the default convolution and pooling layers anyway.
+However, they allow you to enforce the usage of cuDNN or use features not available in :mod:`lasagne.layers`.
+
+.. automodule:: lasagne.layers.dnn
+    :members:
+
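
A minimal sketch of the explicit-import behaviour described in dnn.rst above, assuming a CUDA-capable GPU with cuDNN (without one, importing ``lasagne.layers.dnn`` fails); the layer class is real, the surrounding names are illustrative.

.. code-block:: python

  import lasagne
  # not pulled in by "import lasagne": the dnn submodule must be imported
  # explicitly, and it requires GPU support with cuDNN
  from lasagne.layers.dnn import Conv2DDNNLayer

  l_in = lasagne.layers.InputLayer((None, 3, 32, 32))
  # drop-in replacement for Conv2DLayer that enforces the cuDNN backend
  l_conv = Conv2DDNNLayer(l_in, num_filters=32, filter_size=(3, 3))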
diff --git a/docs/modules/layers/embedding.rst b/docs/modules/layers/embedding.rst
new file mode 100644
index 0000000..da750ea
--- /dev/null
+++ b/docs/modules/layers/embedding.rst
@@ -0,0 +1,10 @@
+Embedding layers
+----------------
+
+.. automodule:: lasagne.layers.embedding
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: EmbeddingLayer
+    :members:
+
diff --git a/docs/modules/layers/helper.rst b/docs/modules/layers/helper.rst
new file mode 100644
index 0000000..c27eb86
--- /dev/null
+++ b/docs/modules/layers/helper.rst
@@ -0,0 +1,15 @@
+Helper functions
+----------------
+
+.. automodule:: lasagne.layers.helper
+
+.. currentmodule:: lasagne.layers
+
+.. autofunction:: get_output
+.. autofunction:: get_output_shape
+.. autofunction:: get_all_layers
+.. autofunction:: get_all_params
+.. autofunction:: count_params
+.. autofunction:: get_all_param_values
+.. autofunction:: set_all_param_values
+
diff --git a/docs/modules/layers/input.rst b/docs/modules/layers/input.rst
new file mode 100644
index 0000000..83a509c
--- /dev/null
+++ b/docs/modules/layers/input.rst
@@ -0,0 +1,10 @@
+Network input
+-------------
+
+.. automodule:: lasagne.layers.input
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: InputLayer
+   :members:
+
diff --git a/docs/modules/layers/merge.rst b/docs/modules/layers/merge.rst
new file mode 100644
index 0000000..ff79ef0
--- /dev/null
+++ b/docs/modules/layers/merge.rst
@@ -0,0 +1,18 @@
+Merge layers
+------------
+
+.. automodule:: lasagne.layers.merge
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: ConcatLayer
+    :members:
+
+.. autoclass:: concat
+
+.. autoclass:: ElemwiseMergeLayer
+    :members:
+
+.. autoclass:: ElemwiseSumLayer
+    :members:
+
diff --git a/docs/modules/layers/noise.rst b/docs/modules/layers/noise.rst
new file mode 100644
index 0000000..883bc9c
--- /dev/null
+++ b/docs/modules/layers/noise.rst
@@ -0,0 +1,15 @@
+Noise layers
+------------
+
+.. automodule:: lasagne.layers.noise
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: DropoutLayer
+    :members:
+
+.. autoclass:: dropout
+
+.. autoclass:: GaussianNoiseLayer
+    :members:
+
diff --git a/docs/modules/layers/normalization.rst b/docs/modules/layers/normalization.rst
new file mode 100644
index 0000000..9ca062c
--- /dev/null
+++ b/docs/modules/layers/normalization.rst
@@ -0,0 +1,15 @@
+Normalization layers
+--------------------
+
+.. automodule:: lasagne.layers.normalization
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: LocalResponseNormalization2DLayer
+    :members:
+
+.. autoclass:: BatchNormLayer
+    :members:
+
+.. autofunction:: batch_norm
+
diff --git a/docs/modules/layers/pool.rst b/docs/modules/layers/pool.rst
new file mode 100644
index 0000000..72a873e
--- /dev/null
+++ b/docs/modules/layers/pool.rst
@@ -0,0 +1,34 @@
+Pooling layers
+--------------
+
+.. automodule:: lasagne.layers.pool
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: MaxPool1DLayer
+    :members:
+
+.. autoclass:: MaxPool2DLayer
+    :members:
+
+.. autoclass:: Pool1DLayer
+    :members:
+
+.. autoclass:: Pool2DLayer
+    :members:
+
+.. autoclass:: Upscale1DLayer
+    :members:
+
+.. autoclass:: Upscale2DLayer
+    :members:
+
+.. autoclass:: GlobalPoolLayer
+    :members:
+
+.. autoclass:: FeaturePoolLayer
+    :members:
+
+.. autoclass:: FeatureWTALayer
+    :members:
+
diff --git a/docs/modules/layers/recurrent.rst b/docs/modules/layers/recurrent.rst
new file mode 100644
index 0000000..81fa90e
--- /dev/null
+++ b/docs/modules/layers/recurrent.rst
@@ -0,0 +1,22 @@
+Recurrent layers
+----------------
+
+.. automodule:: lasagne.layers.recurrent
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: CustomRecurrentLayer
+    :members:
+
+.. autoclass:: RecurrentLayer
+    :members:
+
+.. autoclass:: LSTMLayer
+    :members:
+
+.. autoclass:: GRULayer
+    :members:
+
+.. autoclass:: Gate
+    :members:
+
diff --git a/docs/modules/layers/shape.rst b/docs/modules/layers/shape.rst
new file mode 100644
index 0000000..5e7baac
--- /dev/null
+++ b/docs/modules/layers/shape.rst
@@ -0,0 +1,29 @@
+Shape layers
+------------
+
+.. automodule:: lasagne.layers.shape
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: ReshapeLayer
+    :members:
+
+.. autoclass:: reshape
+
+.. autoclass:: FlattenLayer
+    :members:
+
+.. autoclass:: flatten
+
+.. autoclass:: DimshuffleLayer
+    :members:
+
+.. autoclass:: dimshuffle
+
+.. autoclass:: PadLayer
+    :members:
+
+.. autoclass:: pad
+
+.. autoclass:: SliceLayer
+
diff --git a/docs/modules/layers/special.rst b/docs/modules/layers/special.rst
new file mode 100644
index 0000000..2e9998d
--- /dev/null
+++ b/docs/modules/layers/special.rst
@@ -0,0 +1,40 @@
+Special-purpose layers
+----------------------
+
+.. automodule:: lasagne.layers.special
+
+.. currentmodule:: lasagne.layers
+
+.. autoclass:: NonlinearityLayer
+   :members:
+
+.. autoclass:: BiasLayer
+   :members:
+
+.. autoclass:: ScaleLayer
+   :members:
+
+.. autofunction:: standardize
+
+.. autoclass:: ExpressionLayer
+   :members:
+
+.. autoclass:: InverseLayer
+    :members:
+
+.. autoclass:: TransformerLayer
+    :members:
+
+.. autoclass:: TPSTransformerLayer
+    :members:
+
+.. autoclass:: ParametricRectifierLayer
+    :members:
+
+.. autofunction:: prelu
+
+.. autoclass:: RandomizedRectifierLayer
+    :members:
+
+.. autofunction:: rrelu
+
diff --git a/docs/modules/nonlinearities.rst b/docs/modules/nonlinearities.rst
new file mode 100644
index 0000000..704c35f
--- /dev/null
+++ b/docs/modules/nonlinearities.rst
@@ -0,0 +1,38 @@
+:mod:`lasagne.nonlinearities`
+=============================
+
+.. automodule:: lasagne.nonlinearities
+
+.. autosummary::
+
+   sigmoid
+   softmax
+   tanh
+   ScaledTanH
+   rectify
+   LeakyRectify
+   leaky_rectify
+   very_leaky_rectify
+   elu
+   softplus
+   linear
+   identity
+
+Detailed description
+--------------------
+
+.. autofunction:: sigmoid
+.. autofunction:: softmax
+.. autofunction:: tanh
+.. autoclass:: ScaledTanH
+   :members:
+.. autoclass:: ScaledTanh
+.. autofunction:: rectify
+.. autoclass:: LeakyRectify
+   :members:
+.. autofunction:: leaky_rectify
+.. autofunction:: very_leaky_rectify
+.. autofunction:: elu
+.. autofunction:: softplus
+.. autofunction:: linear
+.. autofunction:: identity
diff --git a/docs/modules/objectives.rst b/docs/modules/objectives.rst
new file mode 100644
index 0000000..565dbaa
--- /dev/null
+++ b/docs/modules/objectives.rst
@@ -0,0 +1,27 @@
+:mod:`lasagne.objectives`
+=========================
+
+.. automodule:: lasagne.objectives
+
+
+Loss functions
+--------------
+
+.. autofunction:: binary_crossentropy
+.. autofunction:: categorical_crossentropy
+.. autofunction:: squared_error
+.. autofunction:: binary_hinge_loss
+.. autofunction:: multiclass_hinge_loss
+
+
+Aggregation functions
+---------------------
+
+.. autofunction:: aggregate
+
+
+Evaluation functions
+--------------------
+
+.. autofunction:: binary_accuracy
+.. autofunction:: categorical_accuracy
diff --git a/docs/modules/random.rst b/docs/modules/random.rst
new file mode 100644
index 0000000..ec0a101
--- /dev/null
+++ b/docs/modules/random.rst
@@ -0,0 +1,7 @@
+:mod:`lasagne.random`
+=====================
+
+.. automodule:: lasagne.random
+
+.. autofunction:: get_rng
+.. autofunction:: set_rng
diff --git a/docs/modules/regularization.rst b/docs/modules/regularization.rst
new file mode 100644
index 0000000..92f0f91
--- /dev/null
+++ b/docs/modules/regularization.rst
@@ -0,0 +1,20 @@
+:mod:`lasagne.regularization`
+=============================
+
+.. automodule:: lasagne.regularization
+
+Helper functions
+----------------
+
+.. autofunction:: apply_penalty
+.. autofunction:: regularize_layer_params
+.. autofunction:: regularize_layer_params_weighted
+.. autofunction:: regularize_network_params
+
+
+Penalty functions
+-----------------
+
+.. autofunction:: l1
+.. autofunction:: l2
+
diff --git a/docs/modules/updates.rst b/docs/modules/updates.rst
new file mode 100644
index 0000000..526aea4
--- /dev/null
+++ b/docs/modules/updates.rst
@@ -0,0 +1,31 @@
+:mod:`lasagne.updates`
+======================
+
+.. automodule:: lasagne.updates
+
+
+Update functions
+----------------
+
+.. autofunction:: sgd
+.. autofunction:: momentum
+.. autofunction:: nesterov_momentum
+.. autofunction:: adagrad
+.. autofunction:: rmsprop
+.. autofunction:: adadelta
+.. autofunction:: adam
+.. autofunction:: adamax
+
+
+Update modification functions
+-----------------------------
+
+.. autofunction:: apply_momentum
+.. autofunction:: apply_nesterov_momentum
+
+
+Helper functions
+----------------
+
+.. autofunction:: norm_constraint
+.. autofunction:: total_norm_constraint
diff --git a/docs/modules/utils.rst b/docs/modules/utils.rst
new file mode 100644
index 0000000..9c8c559
--- /dev/null
+++ b/docs/modules/utils.rst
@@ -0,0 +1,13 @@
+:mod:`lasagne.utils`
+====================
+
+.. automodule:: lasagne.utils
+
+.. autofunction:: floatX
+.. autofunction:: shared_empty
+.. autofunction:: as_theano_expression
+.. autofunction:: collect_shared_vars
+.. autofunction:: one_hot
+.. autofunction:: unique
+.. autofunction:: compute_norms
+.. autofunction:: create_param
diff --git a/docs/user/custom_layers.rst b/docs/user/custom_layers.rst
new file mode 100644
index 0000000..fc9ba11
--- /dev/null
+++ b/docs/user/custom_layers.rst
@@ -0,0 +1,159 @@
+Creating custom layers
+======================
+
+
+A simple layer
+--------------
+
+To implement a custom layer in Lasagne, you will have to write a Python class
+that subclasses :class:`Layer` and implement at least one method:
+`get_output_for()`. This method computes the output of the layer given its
+input. Note that both the output and the input are Theano expressions, so they
+are symbolic.
+
+The following is an example implementation of a layer that multiplies its input
+by 2:
+
+.. code:: python
+
+    class DoubleLayer(lasagne.layers.Layer):
+        def get_output_for(self, input, **kwargs):
+            return 2 * input
+
+This is all that's required to implement a functioning custom layer class in
+Lasagne.
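+
+For instance, the new layer can be stacked on an :class:`InputLayer` and used
+like any built-in layer (a minimal sketch, assuming ``lasagne`` is imported):
+
+>>> l_in = lasagne.layers.InputLayer((None, 10))
+>>> l_double = DoubleLayer(l_in)
+>>> y = lasagne.layers.get_output(l_double)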
+
+
+A layer that changes the shape
+------------------------------
+
+If the layer does not change the shape of the data (for example because it
+applies an elementwise operation), then implementing only this one method is
+sufficient. Lasagne will assume that the output of the layer has the same shape
+as its input.
+
+However, if the operation performed by the layer changes the shape of the data,
+you also need to implement `get_output_shape_for()`. This method computes the
+shape of the layer output given the shape of its input. Note that this shape
+computation should result in a tuple of integers, so it is *not* symbolic.
+
+This method exists because Lasagne needs a way to propagate shape information
+when a network is defined, so it can determine what sizes the parameter tensors
+should be, for example. This mechanism allows each layer to obtain the size of
+its input from the previous layer, which means you don't have to specify the
+input size manually. This also prevents errors stemming from inconsistencies
+between the layers' expected and actual shapes.
+
+We can implement a layer that computes the sum across the trailing axis of its
+input as follows:
+
+.. code:: python
+
+    class SumLayer(lasagne.layers.Layer):
+        def get_output_for(self, input, **kwargs):
+            return input.sum(axis=-1)
+
+        def get_output_shape_for(self, input_shape):
+            return input_shape[:-1]
+
+
+It is important that the shape computation is correct, as this shape
+information may be used to initialize other layers in the network.
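+
+Since a layer's ``output_shape`` property is computed from the declared input
+shape alone, you can check the shape propagation without compiling anything
+(a quick sketch using the ``SumLayer`` above):
+
+>>> l_in = lasagne.layers.InputLayer((2, 3, 4))
+>>> SumLayer(l_in).output_shape
+(2, 3)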
+
+
+A layer with parameters
+-----------------------
+
+If the layer has parameters, these should be initialized in the constructor.
+In Lasagne, parameters are represented by Theano shared variables. A method
+is provided to create and register parameter variables:
+:meth:`lasagne.layers.Layer.add_param()`.
+
+To show how this can be used, here is a layer that multiplies its input
+by a matrix ``W`` (much like a typical fully connected layer in a neural
+network would). This matrix is a parameter of the layer. The shape of the
+matrix will be ``(num_inputs, num_units)``, where ``num_inputs`` is the
+number of input features and ``num_units`` has to be specified when the layer
+is created.
+
+.. code:: python
+
+    class DotLayer(lasagne.layers.Layer):
+        def __init__(self, incoming, num_units, W=lasagne.init.Normal(0.01), **kwargs):
+            super(DotLayer, self).__init__(incoming, **kwargs)
+            num_inputs = self.input_shape[1]
+            self.num_units = num_units
+            self.W = self.add_param(W, (num_inputs, num_units), name='W')
+
+        def get_output_for(self, input, **kwargs):
+            return T.dot(input, self.W)
+
+        def get_output_shape_for(self, input_shape):
+            return (input_shape[0], self.num_units)
+
+A few things are worth noting here: when overriding the constructor, we need
+to call the superclass constructor on the first line. This is important to
+ensure the layer functions properly.
+Note that we pass ``**kwargs`` -- although this is not strictly necessary, it
+enables some other cool Lasagne features, such as making it possible to give
+the layer a name:
+
+>>> l_dot = DotLayer(l_in, num_units=50, name='my_dot_layer')
+
+The call to ``self.add_param()`` creates the Theano shared variable
+representing the parameter, and registers it so it can later be retrieved using
+:meth:`lasagne.layers.Layer.get_params()`. It returns the created variable,
+which we tuck away in ``self.W`` for easy access.
+
+Note that we've also made it possible to specify a custom initialization
+strategy for ``W`` by adding a constructor argument for it, e.g.:
+
+>>> l_dot = DotLayer(l_in, num_units=50, W=lasagne.init.Constant(0.0))
+
+This 'Lasagne idiom' of tucking away a created parameter variable in an
+attribute for easy access and adding a constructor argument with the same name
+to specify the initialization strategy is very common throughout the library.
+
+Finally, note that we used ``self.input_shape`` to determine the shape of the
+parameter matrix. This property is available in all Lasagne layers, once the
+superclass constructor has been called.
+
+
+A layer with multiple behaviors
+-------------------------------
+
+Some layers can have multiple behaviors. For example, a layer implementing
+dropout should be able to be switched on or off. During training, we want it
+to apply dropout noise to its input and scale up the remaining values, but
+during evaluation we don't want it to do anything.
+
+For this purpose, the `get_output_for()` method takes optional keyword
+arguments (``kwargs``). When `get_output()` is called to compute an expression
+for the output of a network, all specified keyword arguments are passed to the
+`get_output_for()` methods of all layers in the network.
+
+For layers that add noise for regularization purposes, such as dropout, the
+convention in Lasagne is to use the keyword argument ``deterministic`` to
+control its behavior.
+
+Lasagne's :class:`lasagne.layers.DropoutLayer` looks roughly like this
+(simplified implementation for illustration purposes):
+
+.. code:: python
+
+    import theano
+    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+    from lasagne.layers import Layer
+
+    _srng = RandomStreams()
+
+    class DropoutLayer(Layer):
+        def __init__(self, incoming, p=0.5, **kwargs):
+            super(DropoutLayer, self).__init__(incoming, **kwargs)
+            self.p = p
+
+        def get_output_for(self, input, deterministic=False, **kwargs):
+            if deterministic:  # do nothing in the deterministic case
+                return input
+            else:  # add dropout noise otherwise
+                retain_prob = 1 - self.p
+                input /= retain_prob
+                return input * _srng.binomial(input.shape, p=retain_prob,
+                                              dtype=theano.config.floatX)
diff --git a/docs/user/development.rst b/docs/user/development.rst
new file mode 100644
index 0000000..1c548c2
--- /dev/null
+++ b/docs/user/development.rst
@@ -0,0 +1,234 @@
+Development
+===========
+
+The Lasagne project was started by Sander Dieleman in September 2014. It is
+developed by a core team of eight people (in alphabetical order:
+`Eric Battenberg <http://ericbattenberg.com/>`_,
+`Sander Dieleman <http://benanne.github.io>`_,
+`Daniel Nouri <http://danielnouri.org>`_,
+`Eben Olson <https://github.com/ebenolson>`_,
+`Aäron van den Oord <https://twitter.com/avdnoord>`_,
+`Colin Raffel <http://colinraffel.com/>`_,
+`Jan Schlüter <http://www.ofai.at/~jan.schlueter/>`_,
+`Søren Kaae Sønderby <http://www1.bio.ku.dk/english/staff/?pure=en/persons/418078>`_)
+and `numerous additional contributors
+<https://github.com/Lasagne/Lasagne/graphs/contributors>`_ on GitHub:
+https://github.com/Lasagne/Lasagne
+
+As an open-source project by researchers for researchers, we highly welcome
+contributions! Every bit helps and will be credited.
+
+
+
+.. _lasagne-philosopy:
+
+Philosophy
+----------
+
+Lasagne grew out of a need to combine the flexibility of Theano with the availability of the right building blocks for training neural networks. Its development is guided by a number of design goals:
+
+* **Simplicity**: Be easy to use, easy to understand and easy to extend, to
+  facilitate use in research. Interfaces should be kept small, with as few
+  classes and methods as possible. Every added abstraction and feature should
+  be carefully scrutinized, to determine whether the added complexity is
+  justified.
+
+* **Transparency**: Do not hide Theano behind abstractions, directly process
+  and return Theano expressions or Python / numpy data types. Try to rely on
+  Theano's functionality where possible, and follow Theano's conventions.
+
+* **Modularity**: Allow all parts (layers, regularizers, optimizers, ...) to be
+  used independently of Lasagne. Make it easy to use components in isolation or
+  in conjunction with other frameworks.
+
+* **Pragmatism**: Make common use cases easy, do not overrate uncommon cases.
+  Ideally, everything should be possible, but common use cases shouldn't be
+  made more difficult just to cater for exotic ones.
+
+* **Restraint**: Do not obstruct users with features they decide not to use.
+  Both in using and in extending components, it should be possible for users to
+  be fully oblivious to features they do not need.
+
+* **Focus**: "Do one thing and do it well". Do not try to provide a library for
+  everything to do with deep learning.
+
+
+
+What to contribute
+------------------
+
+Give feedback
+~~~~~~~~~~~~~
+
+To send us general feedback, questions or ideas for improvement, please post on
+`our mailing list`_.
+
+If you have a very concrete feature proposal, add it to the `issue tracker on
+GitHub`_:
+
+* Explain how it would work, and link to a scientific paper if applicable.
+* Keep the scope as narrow as possible, to make it easier to implement.
+
+
+Report bugs
+~~~~~~~~~~~
+
+Report bugs at the `issue tracker on GitHub`_.
+If you are reporting a bug, please include:
+
+* your Lasagne and Theano version.
+* steps to reproduce the bug, ideally reduced to a few Python commands.
+* the results you obtain, and the results you expected instead.
+
+If you are unsure whether the behavior you experience is a bug, or if you are
+unsure whether it is related to Lasagne or Theano, please just ask on `our
+mailing list`_ first.
+
+
+Fix bugs
+~~~~~~~~
+
+Look through the GitHub issues for bug reports. Anything tagged with "bug" is
+open to whoever wants to implement it. If you discover a bug in Lasagne you can
+fix yourself, by all means feel free to just implement a fix and not report it
+first.
+
+
+Implement features
+~~~~~~~~~~~~~~~~~~
+
+Look through the GitHub issues for feature proposals. Anything tagged with
+"feature" or "enhancement" is open to whoever wants to implement it. If you
+have a feature in mind you want to implement yourself, please note that Lasagne
+has a fairly narrow focus and we strictly follow a set of :ref:`design
+principles <lasagne-philosopy>`, so we cannot guarantee upfront that your code
+will be included. Please do not hesitate to just propose your idea in a GitHub
+issue or on the mailing list first, so we can discuss it and/or guide you
+through the implementation.
+
+
+Write documentation
+~~~~~~~~~~~~~~~~~~~
+
+Whenever you find something not explained well, misleading, glossed over or
+just wrong, please update it! The *Edit on GitHub* link on the top right of
+every documentation page and the *[source]* link for every documented entity
+in the API reference will help you to quickly locate the origin of any text.
+
+
+
+How to contribute
+-----------------
+
+Edit on GitHub
+~~~~~~~~~~~~~~
+
+As a very easy way of just fixing issues in the documentation, use the *Edit
+on GitHub* link on the top right of a documentation page or the *[source]* link
+of an entity in the API reference to open the corresponding source file in
+GitHub, then click the *Edit this file* link to edit the file in your browser
+and send us a Pull Request. All you need for this is a free GitHub account.
+
+For any more substantial changes, please follow the steps below to setup
+Lasagne for development.
+
+
+Development setup
+~~~~~~~~~~~~~~~~~
+
+First, follow the instructions for performing a development installation of
+Lasagne (including forking on GitHub): :ref:`lasagne-development-install`
+
+To be able to run the tests and build the documentation locally, install
+additional requirements with: ``pip install -r requirements-dev.txt`` (adding
+``--user`` if you want to install to your home directory instead).
+
+If you use the bleeding-edge version of Theano, then instead of running that
+command (which would attempt to downgrade Theano to the known good version
+pinned in ``requirements.txt``), use ``pip install`` to manually install all
+dependencies listed in ``requirements-dev.txt`` at their correct versions.
+
+
+Documentation
+~~~~~~~~~~~~~
+
+The documentation is generated with `Sphinx
+<http://sphinx-doc.org/latest/index.html>`_. To build it locally, run the
+following commands:
+
+.. code:: bash
+
+    cd docs
+    make html
+
+Afterwards, open ``docs/_build/html/index.html`` to view the documentation as
+it would appear on `readthedocs <http://lasagne.readthedocs.org/>`_. If you
+changed a lot and seem to get misleading error messages or warnings, run
+``make clean html`` to force Sphinx to recreate all files from scratch.
+
+When writing docstrings, follow existing documentation as much as possible to
+ensure consistency throughout the library. For additional information on the
+syntax and conventions used, please refer to the following documents:
+
+* `reStructuredText Primer <http://sphinx-doc.org/rest.html>`_
+* `Sphinx reST markup constructs <http://sphinx-doc.org/markup/index.html>`_
+* `A Guide to NumPy/SciPy Documentation <https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt>`_
+
+
+Testing
+~~~~~~~
+
+Lasagne has a code coverage of 100%, which has proven very helpful in the past,
+but also creates some duties:
+
+* Whenever you change any code, you should test whether it breaks existing
+  features by just running the test suite. The test suite will also be run by
+  `Travis <https://travis-ci.org/>`_ for any Pull Request to Lasagne.
+* Any code you add needs to be accompanied by tests ensuring that nobody else
+  breaks it in future. `Coveralls <https://coveralls.io/>`_ will check whether
+  the code coverage stays at 100% for any Pull Request to Lasagne.
+* Every bug you fix indicates a missing test case, so a proposed bug fix should
+  come with a new test that fails without your fix.
+
+To run the full test suite, just do
+
+.. code:: bash
+
+    py.test
+
+Testing will take over 5 minutes for the first run, but less than a minute for
+subsequent runs when Theano can reuse compiled code. It will end with a code
+coverage report specifying which code lines are not covered by tests, if any.
+Furthermore, it will list any failed tests, and failed `PEP8
+<https://www.python.org/dev/peps/pep-0008/>`_ checks.
+
+To only run tests matching a certain name pattern, use the ``-k`` command line
+switch, e.g., ``-k pool`` will run the pooling layer tests only.
+
+To drop into a ``pdb`` debug prompt on a test failure so you can inspect it
+more closely, use the ``--pdb`` switch.
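+
+For example, to run only the pooling tests and get a debug prompt on the first
+failure, the two switches can be combined (just an illustration):
+
+.. code:: bash
+
+    py.test -k pool --pdb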
+
+Finally, for a loop-on-failing mode, do ``pip install pytest-xdist`` and run
+``py.test -f``. This will pause after the run, wait for any source file to
+change and run all previously failing tests again.
+
+
+Sending Pull Requests
+~~~~~~~~~~~~~~~~~~~~~
+
+When you're satisfied with your addition, the tests pass and the documentation
+looks good without any markup errors, commit your changes to a new branch, push
+that branch to your fork and send us a Pull Request via GitHub's web interface.
+
+All these steps are nicely explained on GitHub:
+https://guides.github.com/introduction/flow/
+
+When filing your Pull Request, please include a description of what it does to
+help us review it. If it fixes an open issue, say, issue #123, add
+*Fixes #123*, *Resolves #123* or *Closes #123* to the description text, so
+GitHub will close it when your request is merged.
+
+
+
+.. _issue tracker on GitHub: https://github.com/Lasagne/Lasagne/issues
+.. _our mailing list: https://groups.google.com/forum/#!forum/lasagne-users
diff --git a/docs/user/installation.rst b/docs/user/installation.rst
new file mode 100644
index 0000000..01d7abd
--- /dev/null
+++ b/docs/user/installation.rst
@@ -0,0 +1,249 @@
+.. _installation:
+
+============
+Installation
+============
+
+Lasagne has a couple of prerequisites that need to be installed first, but it
+is not very picky about versions. The single exception is Theano: Due to its
+tight coupling to Theano, you will have to install a recent version of Theano
+(usually more recent than the latest official release!) matching the version of
+Lasagne you choose to install.
+
+Most of the instructions below assume you are running a Linux or Mac system,
+but are otherwise very generic. For detailed step-by-step instructions for
+specific platforms including Windows, check our `From Zero to Lasagne
+<https://github.com/Lasagne/Lasagne/wiki/From-Zero-to-Lasagne>`_ guides.
+
+If you run into any trouble, please check the `Theano installation instructions
+<http://deeplearning.net/software/theano/install.html>`_ which cover installing
+the prerequisites for a range of operating systems, or ask for help on `our
+mailing list <https://groups.google.com/d/forum/lasagne-users>`_.
+
+
+Prerequisites
+=============
+
+Python + pip
+------------
+
+Lasagne currently requires Python 2.7 or 3.4 to run. Please install Python via
+the package manager of your operating system if it is not included already.
+
+Python includes ``pip`` for installing additional modules that are not shipped
+with your operating system, or only shipped in an old version; we will make use
+of it below. We recommend installing these modules into your home directory
+via ``--user``, or into a `virtual environment
+<http://www.dabapps.com/blog/introduction-to-pip-and-virtualenv-python/>`_
+via ``virtualenv``.
+
+C compiler
+----------
+
+Theano requires a working C compiler, and numpy/scipy require a compiler as
+well if you install them via ``pip``. On Linux, the default compiler is usually
+``gcc``, and on Mac OS, it's ``clang``. Again, please install them via the
+package manager of your operating system.
+
+numpy/scipy + BLAS
+------------------
+
+Lasagne requires numpy version 1.6.2 or above, and Theano also requires
+scipy 0.11 or above. Numpy/scipy rely on a BLAS library to provide fast linear
+algebra routines. They will work fine without one, but a lot slower, so it is
+worth getting this right (but this is less important if you plan to use a GPU).
+
+If you install numpy and scipy via your operating system's package manager,
+they should link to the BLAS library installed in your system. If you install
+numpy and scipy via ``pip install numpy`` and ``pip install scipy``, make sure
+to have development headers for your BLAS library installed (e.g., the
+``libopenblas-dev`` package on Debian/Ubuntu) while running the installation
+command. Please refer to the `numpy/scipy build instructions
+<http://www.scipy.org/scipylib/building/index.html>`_ if in doubt.
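+
+On Debian or Ubuntu, for example, one possible sequence (adjust the package
+name to your distribution) would be:
+
+.. code-block:: bash
+
+  sudo apt-get install libopenblas-dev
+  pip install --user numpy scipy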
+
+Theano
+------
+
+The version to install depends on the Lasagne version you choose, so this will
+be handled below.
+
+
+Stable Lasagne release
+======================
+
+Lasagne 0.1 requires a more recent version of Theano than the one available
+on PyPI. To install a version that is known to work, run the following command:
+
+.. code-block:: bash
+
+  pip install -r https://raw.githubusercontent.com/Lasagne/Lasagne/v0.1/requirements.txt
+
+.. warning::
+  An even more recent version of Theano will often work as well, but at the
+  time of writing, a simple ``pip install Theano`` will give you a version that
+  is too old.
+
+To install release 0.1 of Lasagne from PyPI, run the following command:
+
+.. code-block:: bash
+
+  pip install Lasagne==0.1
+
+If you do not use ``virtualenv``, add ``--user`` to both commands to install
+into your home directory instead. To upgrade from an earlier installation, add
+``--upgrade``.
+
+
+Bleeding-edge version
+=====================
+
+The latest development version of Lasagne usually works fine with the latest
+development version of Theano. To install both, run the following commands:
+
+.. code-block:: bash
+
+  pip install --upgrade https://github.com/Theano/Theano/archive/master.zip
+  pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip
+
+Again, add ``--user`` if you want to install to your home directory instead.
+
+
+.. _lasagne-development-install:
+
+Development installation
+========================
+
+Alternatively, you can install Lasagne (and optionally Theano) from source,
+in a way that any changes to your local copy of the source tree take effect
+without requiring a reinstall. This is often referred to as *editable* or
+*development* mode. Firstly, you will need to obtain a copy of the source tree:
+
+.. code-block:: bash
+
+  git clone https://github.com/Lasagne/Lasagne.git
+
+It will be cloned to a subdirectory called ``Lasagne``. Make sure to place it
+in some permanent location, as for an *editable* installation, Python will
+import the module directly from this directory and not copy over the files.
+Enter the directory and install the known good version of Theano:
+
+.. code-block:: bash
+
+  cd Lasagne
+  pip install -r requirements.txt
+
+Alternatively, install the bleeding-edge version of Theano as described in the
+previous section.
+
+To install the Lasagne package itself, in editable mode, run:
+
+.. code-block:: bash
+
+  pip install --editable .
+
+As always, add ``--user`` to install it to your home directory instead.
+
+**Optional**: If you plan to contribute to Lasagne, you will need to fork the
+Lasagne repository on GitHub. This will create a repository under your user
+account. Update your local clone to refer to the official repository as
+``upstream``, and your personal fork as ``origin``:
+
+.. code-block:: bash
+
+  git remote rename origin upstream
+  git remote add origin https://github.com/<your-github-name>/Lasagne.git
+
+If you set up an `SSH key <https://help.github.com/categories/ssh/>`_, use the
+SSH clone URL instead: ``git at github.com:<your-github-name>/Lasagne.git``.
+
+You can now use this installation to develop features and send us pull requests
+on GitHub, see :doc:`development`!
+
+
+GPU support
+===========
+
+Thanks to Theano, Lasagne transparently supports training your networks on a
+GPU, which may be 10 to 50 times faster than training them on a CPU. Currently,
+this requires an NVIDIA GPU with CUDA support, and some additional software for
+Theano to use it.
+
+CUDA
+----
+
+Install the latest CUDA Toolkit and possibly the corresponding driver available
+from NVIDIA: https://developer.nvidia.com/cuda-downloads
+
+Closely follow the *Getting Started Guide* linked underneath the download table
+to be sure you don't mess up your system by installing conflicting drivers.
+
+After installation, make sure ``/usr/local/cuda/bin`` is in your ``PATH``, so
+``nvcc --version`` works. Also make sure ``/usr/local/cuda/lib64`` is in your
+``LD_LIBRARY_PATH``, so the toolkit libraries can be found.
+
+Theano
+------
+
+If CUDA is set up correctly, the following should print some information on
+your GPU (the first CUDA-capable GPU in your system if you have multiple ones):
+
+.. code-block:: bash
+
+  THEANO_FLAGS=device=gpu python -c "import theano; print(theano.sandbox.cuda.device_properties(0))"
+
+To configure Theano to use the GPU by default, create a file ``.theanorc``
+directly in your home directory, with the following contents:
+
+.. code-block:: none
+
+  [global]
+  floatX = float32
+  device = gpu
+
+Optionally add ``allow_gc = False`` for some extra performance at the expense
+of (sometimes substantially) higher GPU memory usage.
+
+If you run into problems, please check Theano's instructions for `Using the GPU
+<http://deeplearning.net/software/theano/tutorial/using_gpu.html>`_.
+
+cuDNN
+-----
+
+NVIDIA provides a library for common neural network operations that especially
+speeds up Convolutional Neural Networks (CNNs). Again, it can be obtained from
+NVIDIA (after registering as a developer): https://developer.nvidia.com/cudnn
+
+Note that it requires a reasonably modern GPU with Compute Capability 3.0 or higher;
+see `NVIDIA's list of CUDA GPUs <https://developer.nvidia.com/cuda-gpus>`_.
+
+To install it, copy the ``*.h`` files to ``/usr/local/cuda/include`` and the
+``lib*`` files to ``/usr/local/cuda/lib64``.
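+
+Assuming the downloaded archive was extracted into a local ``cuda/`` directory
+containing ``include`` and ``lib64`` subfolders, the copy could look like this:
+
+.. code-block:: bash
+
+  sudo cp cuda/include/cudnn*.h /usr/local/cuda/include/
+  sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64/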
+
+To check whether it is found by Theano, run the following command:
+
+.. code-block:: bash
+
+  python -c "from theano.sandbox.cuda.dnn import dnn_available as d; print(d() or d.msg)"
+
+It will print ``True`` if everything is fine, or an error message otherwise.
+There are no additional steps required for Theano to make use of cuDNN.
+
+Docker
+======
+
+Instead of manually installing Theano and Lasagne on your machines as described above,
+you may want to use a pre-made `Docker <https://www.docker.com/what-docker>`_
+image: `Lasagne Docker (CPU) <https://hub.docker.com/r/kaixhin/lasagne/>`_ or
+`Lasagne Docker (CUDA) <https://hub.docker.com/r/kaixhin/cuda-lasagne/>`_. These
+are updated on a weekly basis with bleeding-edge builds of Theano and Lasagne.
+Examples of running bash in a Docker container are as follows:
+
+.. code-block:: bash
+
+  sudo docker run -it kaixhin/lasagne
+  sudo nvidia-docker run -it kaixhin/cuda-lasagne:7.0
+
+For a guide to Docker, see the `official docs <https://docs.docker.com>`_.
+CUDA support requires `NVIDIA Docker <https://github.com/NVIDIA/nvidia-docker>`_.
+For more details on how to use the Lasagne Docker images,
+consult the `source project <https://github.com/Kaixhin/dockerfiles>`_.
diff --git a/docs/user/layers.rst b/docs/user/layers.rst
new file mode 100644
index 0000000..a5b1f30
--- /dev/null
+++ b/docs/user/layers.rst
@@ -0,0 +1,203 @@
+Layers
+======
+
+
+The `lasagne.layers` module provides various classes representing the layers
+of a neural network. All of them are subclasses of the
+:class:`lasagne.layers.Layer` base class.
+
+Creating a layer
+----------------
+
+A layer can be created as an instance of a `Layer` subclass. For example, a
+dense layer can be created as follows:
+
+>>> import lasagne
+>>> l = lasagne.layers.DenseLayer(l_in, num_units=100) # doctest: +SKIP
+
+This will create a dense layer with 100 units, connected to another layer
+`l_in`.
+
+Creating a network
+------------------
+
+Note that for almost all types of layers, you will have to specify one or more
+other layers that the layer you are creating gets its input from. The main
+exception is :class:`InputLayer`, which can be used to represent the input of
+a network.
+
+Chaining layer instances together like this will allow you to specify your
+desired network structure. Note that the same layer can be used as input to
+multiple other layers, allowing for arbitrary tree and directed acyclic graph
+(DAG) structures.
+
+Here is an example of an MLP with a single hidden layer:
+
+>>> import theano.tensor as T
+>>> l_in = lasagne.layers.InputLayer((100, 50))
+>>> l_hidden = lasagne.layers.DenseLayer(l_in, num_units=200)
+>>> l_out = lasagne.layers.DenseLayer(l_hidden, num_units=10,
+...                                   nonlinearity=T.nnet.softmax)
+
+The first layer of the network is an `InputLayer`, which represents the input.
+When creating an input layer, you should specify the shape of the input data.
+In this example, the input is a matrix with shape (100, 50), representing a
+batch of 100 data points, where each data point is a vector of length 50.
+The first dimension of a tensor is usually the batch dimension, following the
+established Theano and scikit-learn conventions.
+
+The hidden layer of the network is a dense layer with 200 units, taking its
+input from the input layer. Note that we did not specify the nonlinearity of
+the hidden layer. A layer with rectified linear units will be created by
+default.
+
+The output layer of the network is a dense layer with 10 units and a softmax
+nonlinearity, allowing for 10-way classification of the input vectors.
+
+Note also that we did not create any object representing the entire network.
+Instead, the output layer instance `l_out` is also used to refer to the entire
+network in Lasagne.
+
+Naming layers
+-------------
+
+For convenience, you can name a layer by specifying the `name` keyword
+argument:
+
+>>> l_hidden = lasagne.layers.DenseLayer(l_in, num_units=200,
+...                                      name="hidden_layer")
+
+Initializing parameters
+-----------------------
+
+Many types of layers, such as :class:`DenseLayer`, have trainable parameters.
+These are referred to by short names that match the conventions used in modern
+deep learning literature. For example, a weight matrix will usually be called
+`W`, and a bias vector will usually be `b`.
+
+When creating a layer with trainable parameters, Theano shared variables will
+be created for them and initialized automatically. You can optionally specify
+your own initialization strategy by using keyword arguments that match the
+parameter variable names. For example:
+
+>>> l = lasagne.layers.DenseLayer(l_in, num_units=100,
+...                               W=lasagne.init.Normal(0.01))
+
+The weight matrix `W` of this dense layer will be initialized using samples
+from a normal distribution with standard deviation 0.01 (see `lasagne.init`
+for more information).
+
+There are several ways to manually initialize parameters:
+
+- Theano shared variable
+    If a shared variable instance is provided, this is used unchanged as the
+    parameter variable. For example:
+
+    >>> import theano
+    >>> import numpy as np
+    >>> W = theano.shared(np.random.normal(0, 0.01, (50, 100)))
+    >>> l = lasagne.layers.DenseLayer(l_in, num_units=100, W=W)
+
+- numpy array
+    If a numpy array is provided, a shared variable is created and initialized
+    using the array. For example:
+
+    >>> W_init = np.random.normal(0, 0.01, (50, 100))
+    >>> l = lasagne.layers.DenseLayer(l_in, num_units=100, W=W_init)
+
+- callable
+    If a callable is provided (e.g. a function or a
+    :class:`lasagne.init.Initializer` instance), a shared variable is created
+    and the callable is called with the desired shape to generate suitable
+    initial parameter values. The variable is then initialized with those
+    values. For example:
+
+    >>> l = lasagne.layers.DenseLayer(l_in, num_units=100,
+    ...                               W=lasagne.init.Normal(0.01))
+
+    Or, using a custom initialization function:
+
+    >>> def init_W(shape):
+    ...     return np.random.normal(0, 0.01, shape)
+    >>> l = lasagne.layers.DenseLayer(l_in, num_units=100, W=init_W)
+
+Some types of parameter variables can also be set to ``None`` at initialization
+(e.g. biases). In that case, the parameter variable will be omitted.
+For example, creating a dense layer without biases is done as follows:
+
+>>> l = lasagne.layers.DenseLayer(l_in, num_units=100, b=None)
+
+Parameter sharing
+-----------------
+
+Parameter sharing between multiple layers can be achieved by using the
+same Theano shared variable instance for their parameters. For example:
+
+>>> l1 = lasagne.layers.DenseLayer(l_in, num_units=100)
+>>> l2 = lasagne.layers.DenseLayer(l_in, num_units=100, W=l1.W)
+
+These two layers will now share weights (but have separate biases).
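+
+Because ``l2`` was handed the existing shared variable, both layers reference
+the very same parameter object, which you can verify directly (a quick sanity
+check using the layers above):
+
+>>> l1.W is l2.W
+True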
+
+Propagating data through layers
+-------------------------------
+
+To compute an expression for the output of a single layer given its input, the
+`get_output_for()` method can be used. To compute the output of a network, you
+should instead call :func:`lasagne.layers.get_output()` on it. This will
+traverse the network graph.
+
+You can call this function with the layer you want to compute the output
+expression for:
+
+>>> y = lasagne.layers.get_output(l_out)
+
+In that case, a Theano expression will be returned that represents the output
+as a function of the input variables associated with the
+:class:`lasagne.layers.InputLayer` instance (or instances) in the network,
+so given the example network from before, you could compile a Theano function
+to compute its output given an input as follows:
+
+>>> f = theano.function([l_in.input_var], lasagne.layers.get_output(l_out))
+
+You can also specify a Theano expression to use as input as a second argument
+to :func:`lasagne.layers.get_output()`:
+
+>>> x = T.matrix('x')
+>>> y = lasagne.layers.get_output(l_out, x)
+>>> f = theano.function([x], y)
+
+This only works when there is only a single :class:`InputLayer` in the network.
+If there is more than one, you can specify input expressions in a dictionary.
+For example, in a network with two input layers `l_in1` and `l_in2` and an
+output layer `l_out`:
+
+>>> x1 = T.matrix('x1')
+>>> x2 = T.matrix('x2')
+>>> y = lasagne.layers.get_output(l_out, { l_in1: x1, l_in2: x2 })
+
+Any keyword arguments passed to `get_output()` are propagated to all layers.
+This makes it possible to control the behavior of the entire network. The
+main use case for this is the ``deterministic`` keyword argument, which
+disables stochastic behavior such as dropout when set to ``True``. This is
+useful because a deterministic output is desirable at evaluation time.
+
+>>> y = lasagne.layers.get_output(l_out, deterministic=True)
+
+Some networks may have multiple output layers - or you may just want to
+compute output expressions for intermediate layers in the network. In that
+case, you can pass a list of layers. For example, in a network with two output
+layers `l_out1` and `l_out2`:
+
+>>> y1, y2 = lasagne.layers.get_output([l_out1, l_out2])
+
+You could also just call :func:`lasagne.layers.get_output()` twice:
+
+>>> y1 = lasagne.layers.get_output(l_out1)
+>>> y2 = lasagne.layers.get_output(l_out2)
+
+However, this is **not recommended**! Some network layers may have
+non-deterministic output, such as dropout layers. If you compute the network
+output expressions with separate calls to :func:`lasagne.layers.get_output()`,
+they will not use the same samples. Furthermore, this may lead to unnecessary
+computation because Theano is not always able to merge identical computations 
+properly. Calling `get_output()` only once prevents both of these issues.
\ No newline at end of file
diff --git a/docs/user/tutorial.rst b/docs/user/tutorial.rst
new file mode 100644
index 0000000..c11580c
--- /dev/null
+++ b/docs/user/tutorial.rst
@@ -0,0 +1,620 @@
+.. _tutorial:
+
+========
+Tutorial
+========
+
+This tutorial will walk you through building a handwritten digits classifier
+using the MNIST dataset, arguably the "Hello World" of neural networks.
+More tutorials and examples can be found in the `Lasagne Recipes`_ repository.
+
+
+Before we start
+===============
+
+The tutorial assumes that you are somewhat familiar with neural networks and
+Theano (the library which Lasagne is built on top of). You can try to learn
+both at once from the `Deeplearning Tutorial`_.
+
+For a slower-paced introduction to artificial neural networks, we recommend
+`Convolutional Neural Networks for Visual Recognition`_ by Andrej Karpathy et
+al., `Neural Networks and Deep Learning`_ by Michael Nielsen or a standard text
+book such as "Machine Learning" by Tom Mitchell.
+
+To learn more about Theano, have a look at the `Theano tutorial`_. You will not
+need all of it, but a basic understanding of how Theano works is required to be
+able to use Lasagne. If you're new to Theano, going through that tutorial up to
+(and including) "More Examples" should get you covered! `Graph Structures`_ is
+a good extra read if you're curious about its inner workings.
+
+
+Run the MNIST example
+=====================
+
+In this first part of the tutorial, we will just run the MNIST example that's
+included in the source distribution of Lasagne.
+
+We assume that you have already run through the :ref:`installation`. If you
+haven't done so already, get a copy of the source tree of Lasagne, and navigate
+to the folder in a terminal window. Enter the ``examples`` folder and run the
+``mnist.py`` example script:
+
+.. code-block:: bash
+
+  cd examples
+  python mnist.py
+
+If everything is set up correctly, you will get an output like the following:
+
+.. code-block:: text
+
+  Using gpu device 0: GeForce GT 640
+  Loading data...
+  Downloading train-images-idx3-ubyte.gz
+  Downloading train-labels-idx1-ubyte.gz
+  Downloading t10k-images-idx3-ubyte.gz
+  Downloading t10k-labels-idx1-ubyte.gz
+  Building model and compiling functions...
+  Starting training...
+
+  Epoch 1 of 500 took 1.858s
+    training loss:                1.233348
+    validation loss:              0.405868
+    validation accuracy:          88.78 %
+  Epoch 2 of 500 took 1.845s
+    training loss:                0.571644
+    validation loss:              0.310221
+    validation accuracy:          91.24 %
+  Epoch 3 of 500 took 1.845s
+    training loss:                0.471582
+    validation loss:              0.265931
+    validation accuracy:          92.35 %
+  Epoch 4 of 500 took 1.847s
+    training loss:                0.412204
+    validation loss:              0.238558
+    validation accuracy:          93.05 %
+  ...
+
+The example script allows you to try three different models, selected via the
+first command line argument. Run the script with ``python mnist.py --help`` for
+more information and feel free to play around with it some more before we have
+a look at the implementation.
+
+
+Understand the MNIST example
+============================
+
+Let's now investigate what's needed to make that happen! To follow along, open
+up the source code in your favorite editor (or online: `mnist.py`_).
+
+
+Preface
+-------
+
+The first thing you might notice is that besides Lasagne, we also import numpy
+and Theano:
+
+.. code-block:: python
+
+  import numpy as np
+  import theano
+  import theano.tensor as T
+  
+  import lasagne
+
+While Lasagne is built on top of Theano, it is meant as a supplement helping
+with some tasks, not as a replacement. You will always mix Lasagne with some
+vanilla Theano code.
+
+
+Loading data
+------------
+
+The first piece of code defines a function ``load_dataset()``. Its purpose is
+to download the MNIST dataset (if it hasn't been downloaded yet) and return it
+in the form of regular numpy arrays. There is no Lasagne involved at all, so
+for the purpose of this tutorial, we can regard it as:
+
+.. code-block:: python
+
+  def load_dataset():
+      ...
+      return X_train, y_train, X_val, y_val, X_test, y_test
+
+``X_train.shape`` is ``(50000, 1, 28, 28)``, to be interpreted as: 50,000
+images of 1 channel, 28 rows and 28 columns each. Note that the number of
+channels is 1 because we have monochrome input. Color images would have 3
+channels; spectrograms, like monochrome images, would have a single channel.
+``y_train.shape`` is simply ``(50000,)``, that is, it is a vector of the same
+length as ``X_train``, giving an integer class label for each image -- namely,
+the digit between 0 and 9 depicted in the image (according to the human
+annotator who drew that digit).
+
+
+Building the model
+------------------
+
+This is where Lasagne steps in. It allows you to define an arbitrarily
+structured neural network by creating and stacking or merging layers.
+Since every layer knows its immediate incoming layers, the output layer (or
+output layers) of a network double as a handle to the network as a whole, so
+usually this is the only thing we will pass on to the rest of the code.
+
+As mentioned above, ``mnist.py`` supports three types of models, and we
+implement that via three easily exchangeable functions of the same interface.
+First, we'll define a function that creates a Multi-Layer Perceptron (MLP) of
+a fixed architecture, explaining all the steps in detail. We'll then present
+a function generating an MLP of a custom architecture. Finally, we'll
+show how to create a Convolutional Neural Network (CNN).
+
+
+Multi-Layer Perceptron (MLP)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The first function, ``build_mlp()``, creates an MLP of two hidden layers of
+800 units each, followed by a softmax output layer of 10 units. It applies 20%
+dropout to the input data and 50% dropout to the hidden layers. It is similar,
+but not fully equivalent to the smallest MLP in [Hinton2012]_ (that paper uses
+different nonlinearities, weight initialization and training).
+
+The foundation of each neural network in Lasagne is an
+:class:`InputLayer <lasagne.layers.InputLayer>` instance (or multiple of those)
+representing the input data that will subsequently be fed to the network. Note
+that the ``InputLayer`` is not tied to any specific data yet, but only holds
+the shape of the data that will be passed to the network. In addition, it
+creates or can be linked to a `Theano variable
+<http://deeplearning.net/software/theano/glossary.html#term-variable>`_ that
+will represent the network input in the `Theano graph
+<http://deeplearning.net/software/theano/glossary.html#term-expression-graph>`_
+we'll build from the network later.
+Thus, our function starts like this:
+
+.. code-block:: python
+
+    def build_mlp(input_var=None):
+        l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                         input_var=input_var)
+
+The four numbers in the shape tuple represent, in order:
+``(batchsize, channels, rows, columns)``.
+Here we've set the batchsize to ``None``, which means the network will accept
+input data of arbitrary batchsize after compilation. If you know the batchsize
+beforehand and do not need this flexibility, you should give the batchsize
+here -- especially for convolutional layers, this can allow Theano to apply
+some optimizations.
+``input_var`` denotes the Theano variable we want to link the network's input
+layer to. If it is omitted (or set to ``None``), the layer will just create a
+suitable variable itself, but it can be handy to link an existing variable to
+the network at construction time -- especially if you're creating networks of
+multiple input layers. Here, we link it to a variable given as an argument to
+the ``build_mlp()`` function.
+
+Before adding the first hidden layer, we'll apply 20% dropout to the input
+data. This is realized via a :class:`DropoutLayer
+<lasagne.layers.DropoutLayer>` instance:
+
+.. code-block:: python
+
+    l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
+
+Note that the first constructor argument is the incoming layer, such that
+``l_in_drop`` is now stacked on top of ``l_in``. All layers work this way,
+except for layers that merge multiple inputs: those accept a list of incoming
+layers as their first constructor argument instead.
+
+We'll proceed with the first fully-connected hidden layer of 800 units. Note
+that when stacking a :class:`DenseLayer <lasagne.layers.DenseLayer>` on top of
+higher-order input tensors, the input will be flattened implicitly, so we don't
+need to take care of that ourselves. In this case, the 1x28x28 images will be
+flattened into 784-dimensional vectors.
+
+.. code-block:: python
+
+    l_hid1 = lasagne.layers.DenseLayer(
+            l_in_drop, num_units=800,
+            nonlinearity=lasagne.nonlinearities.rectify,
+            W=lasagne.init.GlorotUniform())
+
+Again, the first constructor argument means that we're stacking ``l_hid1`` on
+top of ``l_in_drop``.
+``num_units`` simply gives the number of units for this fully-connected layer.
+``nonlinearity`` takes a nonlinearity function, several of which are defined
+in :mod:`lasagne.nonlinearities`. Here we've chosen the linear rectifier, so
+we'll obtain ReLUs.
+Finally, :class:`lasagne.init.GlorotUniform()` gives the initializer for the
+weight matrix ``W``. This particular initializer samples weights from a uniform
+distribution of a carefully chosen range. Other initializers are available in
+:mod:`lasagne.init`, and alternatively, ``W`` could also have been initialized
+from a Theano shared variable or numpy array of the correct shape (784x800 in
+this case, as the input to this layer has 1*28*28=784 dimensions).
+Note that ``lasagne.init.GlorotUniform()`` is the default, so we'll omit it
+from here -- we just wanted to highlight that there is a choice.
+
+We'll now add dropout of 50%, another 800-unit dense layer and 50% dropout
+again:
+
+.. code-block:: python
+
+    l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)
+
+    l_hid2 = lasagne.layers.DenseLayer(
+            l_hid1_drop, num_units=800,
+            nonlinearity=lasagne.nonlinearities.rectify)
+
+    l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)
+
+Finally, we'll add the fully-connected output layer. The main difference is
+that it uses the softmax nonlinearity, as we're planning to solve a 10-class
+classification problem with this network.
+
+.. code-block:: python
+
+    l_out = lasagne.layers.DenseLayer(
+            l_hid2_drop, num_units=10,
+            nonlinearity=lasagne.nonlinearities.softmax)
+
+As mentioned above, each layer is linked to its incoming layer(s), so we only
+need the output layer(s) to access a network in Lasagne:
+
+.. code-block:: python
+
+    return l_out
+
+
+Custom MLP
+^^^^^^^^^^
+
+The second function has a slightly more extensive signature:
+
+.. code-block:: python
+
+    def build_custom_mlp(input_var=None, depth=2, width=800, drop_input=.2,
+                         drop_hidden=.5):
+
+By default, it creates the same network as ``build_mlp()`` described above, but
+it can be customized with respect to the number and size of hidden layers, as
+well as the amount of input and hidden dropout. This demonstrates how creating
+a network in Python code can be a lot more flexible than a configuration file.
+See for yourself:
+
+.. code-block:: python
+
+    # Input layer and dropout (with shortcut `dropout` for `DropoutLayer`):
+    network = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                        input_var=input_var)
+    if drop_input:
+        network = lasagne.layers.dropout(network, p=drop_input)
+    # Hidden layers and dropout:
+    nonlin = lasagne.nonlinearities.rectify
+    for _ in range(depth):
+        network = lasagne.layers.DenseLayer(
+                network, width, nonlinearity=nonlin)
+        if drop_hidden:
+            network = lasagne.layers.dropout(network, p=drop_hidden)
+    # Output layer:
+    softmax = lasagne.nonlinearities.softmax
+    network = lasagne.layers.DenseLayer(network, 10, nonlinearity=softmax)
+    return network
+
+With two ``if`` clauses and a ``for`` loop, this network definition allows
+varying the architecture in a way that would be impossible for a ``.yaml`` file
+in `Pylearn2`_ or a ``.cfg`` file in `cuda-convnet`_.
+
+Note that to keep the code simple, all the layers are just called ``network``
+here -- there is no need to give them different names if all we return is the
+last one we created anyway; we just used different names before for clarity.
+
+
+Convolutional Neural Network (CNN)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Finally, the ``build_cnn()`` function creates a CNN of two convolution and
+pooling stages, a fully-connected hidden layer and a fully-connected output
+layer.
+The function begins like the others:
+
+.. code-block:: python
+
+    def build_cnn(input_var=None):
+        network = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                            input_var=input_var)
+
+We don't apply dropout to the inputs, as this tends to work less well for
+convolutional layers. Instead of a :class:`DenseLayer
+<lasagne.layers.DenseLayer>`, we now add a :class:`Conv2DLayer
+<lasagne.layers.Conv2DLayer>` with 32 filters of size 5x5 on top:
+
+.. code-block:: python
+
+    network = lasagne.layers.Conv2DLayer(
+            network, num_filters=32, filter_size=(5, 5),
+            nonlinearity=lasagne.nonlinearities.rectify,
+            W=lasagne.init.GlorotUniform())
+
+The nonlinearity and weight initializer can be given just as for the
+``DenseLayer`` (and again, ``GlorotUniform()`` is the default, we'll omit it
+from now). Strided and padded convolutions are supported as well; see the
+:class:`Conv2DLayer <lasagne.layers.Conv2DLayer>` docstring.
+
+.. note::
+    For experts: ``Conv2DLayer`` will create a convolutional layer using
+    ``T.nnet.conv2d``, Theano's default convolution. On compilation for GPU,
+    Theano replaces this with a `cuDNN`_-based implementation if available,
+    otherwise falls back to a gemm-based implementation. For details on this,
+    please see the `Theano convolution documentation`_.
+
+    Lasagne also provides convolutional layers directly enforcing a specific
+    implementation: :class:`lasagne.layers.dnn.Conv2DDNNLayer` to enforce
+    cuDNN, :class:`lasagne.layers.corrmm.Conv2DMMLayer` to enforce the
+    gemm-based one, :class:`lasagne.layers.cuda_convnet.Conv2DCCLayer` for
+    Krizhevsky's `cuda-convnet`_.
+
+We then apply max-pooling of factor 2 in both dimensions, using a
+:class:`MaxPool2DLayer <lasagne.layers.MaxPool2DLayer>` instance:
+
+.. code-block:: python
+
+    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
+
+We add another convolution and pooling stage like the ones before:
+
+.. code-block:: python
+
+    network = lasagne.layers.Conv2DLayer(
+            network, num_filters=32, filter_size=(5, 5),
+            nonlinearity=lasagne.nonlinearities.rectify)
+    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
+
+Then a fully-connected layer of 256 units with 50% dropout on its inputs
+(using the :class:`lasagne.layers.dropout` shortcut directly inline):
+
+.. code-block:: python
+
+    network = lasagne.layers.DenseLayer(
+            lasagne.layers.dropout(network, p=.5),
+            num_units=256,
+            nonlinearity=lasagne.nonlinearities.rectify)
+
+And finally a 10-unit softmax output layer, again with 50% dropout:
+
+.. code-block:: python
+
+    network = lasagne.layers.DenseLayer(
+            lasagne.layers.dropout(network, p=.5),
+            num_units=10,
+            nonlinearity=lasagne.nonlinearities.softmax)
+
+    return network
+
+
+Training the model
+------------------
+
+The remaining part of the ``mnist.py`` script copes with setting up and running
+a training loop over the MNIST dataset.
+
+
+Dataset iteration
+^^^^^^^^^^^^^^^^^
+
+It first defines a short helper function for synchronously iterating over two
+numpy arrays of input data and targets, respectively, in mini-batches of a
+given number of items. For the purpose of this tutorial, we can shorten it to:
+
+.. code-block:: python
+
+    def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
+        if shuffle:
+            ...
+        for ...:
+            yield inputs[...], targets[...]
+
+All that's relevant is that it is a generator function that serves one batch of
+inputs and targets at a time until the given dataset (in ``inputs`` and
+``targets``) is exhausted, either in sequence or in random order. Below we will
+plug this function into our training loop, validation loop and test loop.
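+
+If you are curious, one possible implementation (a sketch along the lines of
+the full version in ``mnist.py``, not necessarily identical to it) could look
+like this:
+
+.. code-block:: python
+
+    def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
+        assert len(inputs) == len(targets)
+        indices = np.arange(len(inputs))
+        if shuffle:
+            # visit the examples in a different order on every pass
+            np.random.shuffle(indices)
+        for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
+            excerpt = indices[start_idx:start_idx + batchsize]
+            yield inputs[excerpt], targets[excerpt]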
+
+
+Preparation
+^^^^^^^^^^^
+
+Let's now focus on the ``main()`` function. A bit simplified, it begins like
+this:
+
+.. code-block:: python
+
+    # Load the dataset
+    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
+    # Prepare Theano variables for inputs and targets
+    input_var = T.tensor4('inputs')
+    target_var = T.ivector('targets')
+    # Create neural network model
+    network = build_mlp(input_var)
+
+The first line loads the inputs and targets of the MNIST dataset as numpy
+arrays, split into training, validation and test data.
+The next two statements define symbolic Theano variables that will represent
+a mini-batch of inputs and targets in all the Theano expressions we will
+generate for network training and inference. They are not tied to any data yet,
+but their dimensionality and data type is fixed already and matches the actual
+inputs and targets we will process later.
+Finally, we call one of the three functions for building the Lasagne network,
+depending on the first command line argument -- we've just removed command line
+handling here for clarity. Note that we hand the symbolic input variable to
+``build_mlp()`` so it will be linked to the network's input layer.
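+
+As a quick illustration (a hypothetical check, not part of the script), the
+loaded numpy arrays must match the dimensionality of these symbolic variables:
+
+.. code-block:: python
+
+    assert X_train.ndim == 4  # matches T.tensor4('inputs')
+    assert y_train.ndim == 1  # matches T.ivector('targets')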
+
+
+Loss and update expressions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Continuing, we create a loss expression to be minimized in training:
+
+.. code-block:: python
+
+    prediction = lasagne.layers.get_output(network)
+    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
+    loss = loss.mean()
+
+The first step generates a Theano expression for the network output given the
+input variable linked to the network's input layer(s). The second step defines
+a Theano expression for the categorical cross-entropy loss between said network
+output and the targets. Finally, as we need a scalar loss, we simply take the
+mean over the mini-batch. Depending on the problem you are solving, you may
+need a different loss function; see :mod:`lasagne.objectives` for more.
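+
+As the comments in ``mnist.py`` note, a weight decay term could also be added
+to this loss; a minimal sketch using :mod:`lasagne.regularization` (the
+coefficient ``1e-4`` is an arbitrary example value):
+
+.. code-block:: python
+
+    # Optional L2 weight decay on all regularizable parameters:
+    l2_penalty = lasagne.regularization.regularize_network_params(
+            network, lasagne.regularization.l2)
+    loss = loss + 1e-4 * l2_penalty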
+
+Having the model and the loss function defined, we create update expressions
+for training the network. An update expression describes how to change the
+trainable parameters of the network at each presented mini-batch. We will use
+Stochastic Gradient Descent (SGD) with Nesterov momentum here, but the
+:mod:`lasagne.updates` module offers several others you can plug in instead:
+
+.. code-block:: python
+
+    params = lasagne.layers.get_all_params(network, trainable=True)
+    updates = lasagne.updates.nesterov_momentum(
+            loss, params, learning_rate=0.01, momentum=0.9)
+
+The first step collects all Theano ``SharedVariable`` instances making up the
+trainable parameters of the network, and the second step generates an update
+expression for each parameter.
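+
+For example, switching to the Adam optimizer would only require replacing the
+second statement (a hypothetical variant; ``mnist.py`` itself sticks with
+Nesterov momentum):
+
+.. code-block:: python
+
+    updates = lasagne.updates.adam(loss, params, learning_rate=0.001)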
+
+For monitoring progress during training, after each epoch, we evaluate the
+network on the validation set. We need a slightly different loss expression
+for that:
+
+.. code-block:: python
+
+    test_prediction = lasagne.layers.get_output(network, deterministic=True)
+    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
+                                                            target_var)
+    test_loss = test_loss.mean()
+
+The crucial difference is that we pass ``deterministic=True`` to the
+:func:`get_output <lasagne.layers.get_output>` call. This causes all
+nondeterministic layers to switch to a deterministic implementation, so in our
+case, it disables the dropout layers.
+As an additional monitoring quantity, we create an expression for the
+classification accuracy:
+
+.. code-block:: python
+
+    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
+                      dtype=theano.config.floatX)
+
+It also builds on the deterministic ``test_prediction`` expression.
+
+
+Compilation
+^^^^^^^^^^^
+
+Equipped with all the necessary Theano expressions, we're now ready to compile
+a function performing a training step:
+
+.. code-block:: python
+
+    train_fn = theano.function([input_var, target_var], loss, updates=updates)
+
+This tells Theano to generate and compile a function taking two inputs -- a
+mini-batch of images and a vector of corresponding targets -- and returning a
+single output: the training loss. Additionally, each time it is invoked, it
+applies all parameter updates in the ``updates`` dictionary, thus performing a
+gradient descent step with Nesterov momentum.
+
+For validation, we compile a second function:
+
+.. code-block:: python
+
+    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
+
+This one also takes a mini-batch of images and targets, then returns the
+(deterministic) loss and classification accuracy without performing any
+updates.
+
+
+Training loop
+^^^^^^^^^^^^^
+
+We're finally ready to write the training loop. In essence, we just need to do
+the following:
+
+.. code-block:: python
+
+    for epoch in range(num_epochs):
+        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
+            inputs, targets = batch
+            train_fn(inputs, targets)
+
+This uses our dataset iteration helper function to iterate over the training
+data in random order, in mini-batches of 500 items each, for ``num_epochs``
+epochs, and calls the training function we compiled to perform an update step
+of the network parameters.
+
+To monitor training progress, however, we also capture the training loss,
+compute the validation loss and accuracy, and print some information to the
+console every time an epoch finishes:
+
+.. code-block:: python
+
+    for epoch in range(num_epochs):
+        # In each epoch, we do a full pass over the training data:
+        train_err = 0
+        train_batches = 0
+        start_time = time.time()
+        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
+            inputs, targets = batch
+            train_err += train_fn(inputs, targets)
+            train_batches += 1
+
+        # And a full pass over the validation data:
+        val_err = 0
+        val_acc = 0
+        val_batches = 0
+        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
+            inputs, targets = batch
+            err, acc = val_fn(inputs, targets)
+            val_err += err
+            val_acc += acc
+            val_batches += 1
+
+        # Then we print the results for this epoch:
+        print("Epoch {} of {} took {:.3f}s".format(
+            epoch + 1, num_epochs, time.time() - start_time))
+        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
+        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
+        print("  validation accuracy:\t\t{:.2f} %".format(
+            val_acc / val_batches * 100))
+
+At the very end, we reuse ``val_fn()`` to compute the loss and accuracy on the
+test set, finishing the script.
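+
+If you want to keep the trained model, the comments at the end of ``mnist.py``
+sketch how its parameters can be saved to and restored from a file (assuming
+``numpy`` is imported as ``np``):
+
+.. code-block:: python
+
+    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
+    # ... and later, after rebuilding the same architecture:
+    with np.load('model.npz') as f:
+        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
+    lasagne.layers.set_all_param_values(network, param_values)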
+
+
+
+Where to go from here
+=====================
+
+This finishes our introductory tutorial. For more information on what you can
+do with Lasagne's layers, just continue reading through :doc:`layers` and
+:doc:`custom_layers`.
+More tutorials, examples and code snippets can be found in the `Lasagne
+Recipes`_ repository.
+Finally, the reference lists and explains all layers (:mod:`lasagne.layers`),
+weight initializers (:mod:`lasagne.init`), nonlinearities
+(:mod:`lasagne.nonlinearities`), loss expressions (:mod:`lasagne.objectives`),
+training methods (:mod:`lasagne.updates`) and regularizers
+(:mod:`lasagne.regularization`) included in the library, and should also make
+it simple to create your own.
+
+
+
+.. _Lasagne Recipes: https://github.com/Lasagne/Recipes
+.. _Deeplearning Tutorial: http://deeplearning.net/tutorial/
+.. _Convolutional Neural Networks for Visual Recognition: http://cs231n.github.io/
+.. _Neural Networks and Deep Learning: http://neuralnetworksanddeeplearning.com/
+.. _Theano tutorial: http://deeplearning.net/software/theano/tutorial/
+.. _Graph Structures: http://deeplearning.net/software/theano/extending/graphstructures.html
+.. _mnist.py: https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py
+.. [Hinton2012] Improving neural networks by preventing co-adaptation
+   of feature detectors. http://arxiv.org/abs/1207.0580
+.. _Pylearn2: http://deeplearning.net/software/pylearn2/
+.. _cuda-convnet: https://code.google.com/p/cuda-convnet/
+.. _cuDNN: https://developer.nvidia.com/cudnn
+.. _Theano convolution documentation: http://deeplearning.net/software/theano/library/tensor/nnet/conv.html
diff --git a/examples/mnist.py b/examples/mnist.py
new file mode 100755
index 0000000..1ce6192
--- /dev/null
+++ b/examples/mnist.py
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+
+"""
+Usage example employing Lasagne for digit recognition using the MNIST dataset.
+
+This example is deliberately structured as a long flat file, focusing on how
+to use Lasagne, instead of focusing on writing maximally modular and reusable
+code. It is used as the foundation for the introductory Lasagne tutorial:
+http://lasagne.readthedocs.org/en/latest/user/tutorial.html
+
+More in-depth examples and reproductions of paper results are maintained in
+a separate repository: https://github.com/Lasagne/Recipes
+"""
+
+from __future__ import print_function
+
+import sys
+import os
+import time
+
+import numpy as np
+import theano
+import theano.tensor as T
+
+import lasagne
+
+
+# ################## Download and prepare the MNIST dataset ##################
+# This is just some way of getting the MNIST dataset from an online location
+# and loading it into numpy arrays. It doesn't involve Lasagne at all.
+
+def load_dataset():
+    # We first define a download function, supporting both Python 2 and 3.
+    if sys.version_info[0] == 2:
+        from urllib import urlretrieve
+    else:
+        from urllib.request import urlretrieve
+
+    def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
+        print("Downloading %s" % filename)
+        urlretrieve(source + filename, filename)
+
+    # We then define functions for loading MNIST images and labels.
+    # For convenience, they also download the requested files if needed.
+    import gzip
+
+    def load_mnist_images(filename):
+        if not os.path.exists(filename):
+            download(filename)
+        # Read the inputs in Yann LeCun's binary format.
+        with gzip.open(filename, 'rb') as f:
+            data = np.frombuffer(f.read(), np.uint8, offset=16)
+        # The inputs are vectors now, we reshape them to monochrome 2D images,
+        # following the shape convention: (examples, channels, rows, columns)
+        data = data.reshape(-1, 1, 28, 28)
+        # The inputs come as bytes, we convert them to float32 in range [0,1].
+        # (Actually to range [0, 255/256], for compatibility to the version
+        # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
+        return data / np.float32(256)
+
+    def load_mnist_labels(filename):
+        if not os.path.exists(filename):
+            download(filename)
+        # Read the labels in Yann LeCun's binary format.
+        with gzip.open(filename, 'rb') as f:
+            data = np.frombuffer(f.read(), np.uint8, offset=8)
+        # The labels are vectors of integers now, that's exactly what we want.
+        return data
+
+    # We can now download and read the training and test set images and labels.
+    X_train = load_mnist_images('train-images-idx3-ubyte.gz')
+    y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
+    X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
+    y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
+
+    # We reserve the last 10000 training examples for validation.
+    X_train, X_val = X_train[:-10000], X_train[-10000:]
+    y_train, y_val = y_train[:-10000], y_train[-10000:]
+
+    # We just return all the arrays in order, as expected in main().
+    # (It doesn't matter how we do this as long as we can read them again.)
+    return X_train, y_train, X_val, y_val, X_test, y_test
+
+
+# ##################### Build the neural network model #######################
+# This script supports three types of models. For each one, we define a
+# function that takes a Theano variable representing the input and returns
+# the output layer of a neural network model built in Lasagne.
+
+def build_mlp(input_var=None):
+    # This creates an MLP of two hidden layers of 800 units each, followed by
+    # a softmax output layer of 10 units. It applies 20% dropout to the input
+    # data and 50% dropout to the hidden layers.
+
+    # Input layer, specifying the expected input shape of the network
+    # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and
+    # linking it to the given Theano variable `input_var`, if any:
+    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                     input_var=input_var)
+
+    # Apply 20% dropout to the input data:
+    l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
+
+    # Add a fully-connected layer of 800 units, using the linear rectifier, and
+    # initializing weights with Glorot's scheme (which is the default anyway):
+    l_hid1 = lasagne.layers.DenseLayer(
+            l_in_drop, num_units=800,
+            nonlinearity=lasagne.nonlinearities.rectify,
+            W=lasagne.init.GlorotUniform())
+
+    # We'll now add dropout of 50%:
+    l_hid1_drop = lasagne.layers.DropoutLayer(l_hid1, p=0.5)
+
+    # Another 800-unit layer:
+    l_hid2 = lasagne.layers.DenseLayer(
+            l_hid1_drop, num_units=800,
+            nonlinearity=lasagne.nonlinearities.rectify)
+
+    # 50% dropout again:
+    l_hid2_drop = lasagne.layers.DropoutLayer(l_hid2, p=0.5)
+
+    # Finally, we'll add the fully-connected output layer, of 10 softmax units:
+    l_out = lasagne.layers.DenseLayer(
+            l_hid2_drop, num_units=10,
+            nonlinearity=lasagne.nonlinearities.softmax)
+
+    # Each layer is linked to its incoming layer(s), so we only need to pass
+    # the output layer to give access to a network in Lasagne:
+    return l_out
+
+
+def build_custom_mlp(input_var=None, depth=2, width=800, drop_input=.2,
+                     drop_hidden=.5):
+    # By default, this creates the same network as `build_mlp`, but it can be
+    # customized with respect to the number and size of hidden layers. This
+    # mostly showcases how creating a network in Python code can be a lot more
+    # flexible than a configuration file. Note that to make the code easier,
+    # all the layers are just called `network` -- there is no need to give them
+    # different names if all we return is the last one we created anyway; we
+    # just used different names above for clarity.
+
+    # Input layer and dropout (with shortcut `dropout` for `DropoutLayer`):
+    network = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                        input_var=input_var)
+    if drop_input:
+        network = lasagne.layers.dropout(network, p=drop_input)
+    # Hidden layers and dropout:
+    nonlin = lasagne.nonlinearities.rectify
+    for _ in range(depth):
+        network = lasagne.layers.DenseLayer(
+                network, width, nonlinearity=nonlin)
+        if drop_hidden:
+            network = lasagne.layers.dropout(network, p=drop_hidden)
+    # Output layer:
+    softmax = lasagne.nonlinearities.softmax
+    network = lasagne.layers.DenseLayer(network, 10, nonlinearity=softmax)
+    return network
+
+
+def build_cnn(input_var=None):
+    # As a third model, we'll create a CNN of two convolution + pooling stages
+    # and a fully-connected hidden layer in front of the output layer.
+
+    # Input layer, as usual:
+    network = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
+                                        input_var=input_var)
+    # This time we do not apply input dropout, as it tends to work less well
+    # for convolutional layers.
+
+    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
+    # convolutions are supported as well; see the docstring.
+    network = lasagne.layers.Conv2DLayer(
+            network, num_filters=32, filter_size=(5, 5),
+            nonlinearity=lasagne.nonlinearities.rectify,
+            W=lasagne.init.GlorotUniform())
+    # Expert note: Lasagne provides alternative convolutional layers that
+    # override Theano's choice of which implementation to use; for details
+    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.
+
+    # Max-pooling layer of factor 2 in both dimensions:
+    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
+
+    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
+    network = lasagne.layers.Conv2DLayer(
+            network, num_filters=32, filter_size=(5, 5),
+            nonlinearity=lasagne.nonlinearities.rectify)
+    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))
+
+    # A fully-connected layer of 256 units with 50% dropout on its inputs:
+    network = lasagne.layers.DenseLayer(
+            lasagne.layers.dropout(network, p=.5),
+            num_units=256,
+            nonlinearity=lasagne.nonlinearities.rectify)
+
+    # And, finally, the 10-unit output layer with 50% dropout on its inputs:
+    network = lasagne.layers.DenseLayer(
+            lasagne.layers.dropout(network, p=.5),
+            num_units=10,
+            nonlinearity=lasagne.nonlinearities.softmax)
+
+    return network
+
+
+# ############################# Batch iterator ###############################
+# This is just a simple helper function iterating over training data in
+# mini-batches of a particular size, optionally in random order. It assumes
+# data is available as numpy arrays. For big datasets, you could load numpy
+# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
+# own custom data iteration function. For small datasets, you can also copy
+# them to GPU at once for slightly improved performance. This would involve
+# several changes in the main program, though, and is not demonstrated here.
+# Notice that this function returns only mini-batches of size `batchsize`.
+# If the size of the data is not a multiple of `batchsize`, it will not
+# return the last (remaining) mini-batch.
+
+def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
+    assert len(inputs) == len(targets)
+    if shuffle:
+        indices = np.arange(len(inputs))
+        np.random.shuffle(indices)
+    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
+        if shuffle:
+            excerpt = indices[start_idx:start_idx + batchsize]
+        else:
+            excerpt = slice(start_idx, start_idx + batchsize)
+        yield inputs[excerpt], targets[excerpt]
+
+
+# ############################## Main program ################################
+# Everything else will be handled in our main program now. We could pull out
+# more functions to better separate the code, but it wouldn't make it any
+# easier to read.
+
+def main(model='mlp', num_epochs=500):
+    # Load the dataset
+    print("Loading data...")
+    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()
+
+    # Prepare Theano variables for inputs and targets
+    input_var = T.tensor4('inputs')
+    target_var = T.ivector('targets')
+
+    # Create neural network model (depending on first command line parameter)
+    print("Building model and compiling functions...")
+    if model == 'mlp':
+        network = build_mlp(input_var)
+    elif model.startswith('custom_mlp:'):
+        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
+        network = build_custom_mlp(input_var, int(depth), int(width),
+                                   float(drop_in), float(drop_hid))
+    elif model == 'cnn':
+        network = build_cnn(input_var)
+    else:
+        print("Unrecognized model type %r." % model)
+        return
+
+    # Create a loss expression for training, i.e., a scalar objective we want
+    # to minimize (for our multi-class problem, it is the cross-entropy loss):
+    prediction = lasagne.layers.get_output(network)
+    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
+    loss = loss.mean()
+    # We could add some weight decay as well here, see lasagne.regularization.
+
+    # Create update expressions for training, i.e., how to modify the
+    # parameters at each training step. Here, we'll use Stochastic Gradient
+    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
+    params = lasagne.layers.get_all_params(network, trainable=True)
+    updates = lasagne.updates.nesterov_momentum(
+            loss, params, learning_rate=0.01, momentum=0.9)
+
+    # Create a loss expression for validation/testing. The crucial difference
+    # here is that we do a deterministic forward pass through the network,
+    # disabling dropout layers.
+    test_prediction = lasagne.layers.get_output(network, deterministic=True)
+    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
+                                                            target_var)
+    test_loss = test_loss.mean()
+    # As a bonus, also create an expression for the classification accuracy:
+    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
+                      dtype=theano.config.floatX)
+
+    # Compile a function performing a training step on a mini-batch (by giving
+    # the updates dictionary) and returning the corresponding training loss:
+    train_fn = theano.function([input_var, target_var], loss, updates=updates)
+
+    # Compile a second function computing the validation loss and accuracy:
+    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
+
+    # Finally, launch the training loop.
+    print("Starting training...")
+    # We iterate over epochs:
+    for epoch in range(num_epochs):
+        # In each epoch, we do a full pass over the training data:
+        train_err = 0
+        train_batches = 0
+        start_time = time.time()
+        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
+            inputs, targets = batch
+            train_err += train_fn(inputs, targets)
+            train_batches += 1
+
+        # And a full pass over the validation data:
+        val_err = 0
+        val_acc = 0
+        val_batches = 0
+        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
+            inputs, targets = batch
+            err, acc = val_fn(inputs, targets)
+            val_err += err
+            val_acc += acc
+            val_batches += 1
+
+        # Then we print the results for this epoch:
+        print("Epoch {} of {} took {:.3f}s".format(
+            epoch + 1, num_epochs, time.time() - start_time))
+        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
+        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
+        print("  validation accuracy:\t\t{:.2f} %".format(
+            val_acc / val_batches * 100))
+
+    # After training, we compute and print the test error:
+    test_err = 0
+    test_acc = 0
+    test_batches = 0
+    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
+        inputs, targets = batch
+        err, acc = val_fn(inputs, targets)
+        test_err += err
+        test_acc += acc
+        test_batches += 1
+    print("Final results:")
+    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
+    print("  test accuracy:\t\t{:.2f} %".format(
+        test_acc / test_batches * 100))
+
+    # Optionally, you could now dump the network weights to a file like this:
+    # np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
+    #
+    # And load them again later on like this:
+    # with np.load('model.npz') as f:
+    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
+    # lasagne.layers.set_all_param_values(network, param_values)
+
+
+if __name__ == '__main__':
+    if ('--help' in sys.argv) or ('-h' in sys.argv):
+        print("Trains a neural network on MNIST using Lasagne.")
+        print("Usage: %s [MODEL [EPOCHS]]" % sys.argv[0])
+        print()
+        print("MODEL: 'mlp' for a simple Multi-Layer Perceptron (MLP),")
+        print("       'custom_mlp:DEPTH,WIDTH,DROP_IN,DROP_HID' for an MLP")
+        print("       with DEPTH hidden layers of WIDTH units, DROP_IN")
+        print("       input dropout and DROP_HID hidden dropout,")
+        print("       'cnn' for a simple Convolutional Neural Network (CNN).")
+        print("EPOCHS: number of training epochs to perform (default: 500)")
+    else:
+        kwargs = {}
+        if len(sys.argv) > 1:
+            kwargs['model'] = sys.argv[1]
+        if len(sys.argv) > 2:
+            kwargs['num_epochs'] = int(sys.argv[2])
+        main(**kwargs)
diff --git a/examples/recurrent.py b/examples/recurrent.py
new file mode 100755
index 0000000..274e83b
--- /dev/null
+++ b/examples/recurrent.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+'''
+Recurrent network example.  Trains a bidirectional vanilla RNN to output the
+sum of two numbers in a sequence of random numbers sampled uniformly from
+[0, 1] based on a separate marker sequence.
+'''
+
+from __future__ import print_function
+
+
+import numpy as np
+import theano
+import theano.tensor as T
+import lasagne
+
+
+# Min/max sequence length
+MIN_LENGTH = 50
+MAX_LENGTH = 55
+# Number of units in the hidden (recurrent) layer
+N_HIDDEN = 100
+# Number of training sequences in each batch
+N_BATCH = 100
+# Optimization learning rate
+LEARNING_RATE = .001
+# All gradients above this will be clipped
+GRAD_CLIP = 100
+# How often should we check the output?
+EPOCH_SIZE = 100
+# Number of epochs to train the net
+NUM_EPOCHS = 10
+
+
+def gen_data(min_length=MIN_LENGTH, max_length=MAX_LENGTH, n_batch=N_BATCH):
+    '''
+    Generate a batch of sequences for the "add" task, e.g. the target for the
+    following
+
+    ``| 0.5 | 0.7 | 0.3 | 0.1 | 0.2 | ... | 0.5 | 0.9 | ... | 0.8 | 0.2 |
+      |  0  |  0  |  1  |  0  |  0  |     |  0  |  1  |     |  0  |  0  |``
+
+    would be 0.3 + .9 = 1.2.  This task was proposed in [1]_ and explored in
+    e.g. [2]_.
+
+    Parameters
+    ----------
+    min_length : int
+        Minimum sequence length.
+    max_length : int
+        Maximum sequence length.
+    n_batch : int
+        Number of samples in the batch.
+
+    Returns
+    -------
+    X : np.ndarray
+        Input to the network, of shape (n_batch, max_length, 2), where the last
+        dimension corresponds to the two sequences shown above.
+    y : np.ndarray
+        Correct output for each sample, shape (n_batch,).
+    mask : np.ndarray
+        A binary matrix of shape (n_batch, max_length) where ``mask[i, j] = 1``
+        when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j >
+        (length of sequence i)``.
+
+    References
+    ----------
+    .. [1] Hochreiter, Sepp, and Jürgen Schmidhuber. "Long short-term memory."
+    Neural computation 9.8 (1997): 1735-1780.
+
+    .. [2] Sutskever, Ilya, et al. "On the importance of initialization and
+    momentum in deep learning." Proceedings of the 30th international
+    conference on machine learning (ICML-13). 2013.
+    '''
+    # Generate X - we'll fill the last dimension later
+    X = np.concatenate([np.random.uniform(size=(n_batch, max_length, 1)),
+                        np.zeros((n_batch, max_length, 1))],
+                       axis=-1)
+    mask = np.zeros((n_batch, max_length))
+    y = np.zeros((n_batch,))
+    # Compute masks and correct values
+    for n in range(n_batch):
+        # Randomly choose the sequence length
+        length = np.random.randint(min_length, max_length)
+        # Make the mask for this sample 1 within the range of length
+        mask[n, :length] = 1
+        # Zero out X after the end of the sequence
+        X[n, length:, 0] = 0
+        # Set the second dimension to 1 at the indices to add
+        X[n, np.random.randint(length/10), 1] = 1
+        X[n, np.random.randint(length/2, length), 1] = 1
+        # Multiply and sum the dimensions of X to get the target value
+        y[n] = np.sum(X[n, :, 0]*X[n, :, 1])
+    # Center the inputs and outputs
+    X -= X.reshape(-1, 2).mean(axis=0)
+    y -= y.mean()
+    return (X.astype(theano.config.floatX), y.astype(theano.config.floatX),
+            mask.astype(theano.config.floatX))
+
+
+def main(num_epochs=NUM_EPOCHS):
+    print("Building network ...")
+    # First, we build the network, starting with an input layer
+    # Recurrent layers expect input of shape
+    # (batch size, max sequence length, number of features)
+    l_in = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
+    # The network also needs a way to provide a mask for each sequence.  We'll
+    # use a separate input layer for that.  Since the mask only determines
+    # which indices are part of the sequence for each batch entry, they are
+    # supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH)
+    l_mask = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
+    # We're using a bidirectional network, which means we will combine two
+    # RecurrentLayers, one with the backwards=True keyword argument.
+    # Setting a value for grad_clipping will clip the gradients in the layer
+    # Setting only_return_final=True makes the layers only return their output
+    # for the final time step, which is all we need for this task
+    l_forward = lasagne.layers.RecurrentLayer(
+        l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
+        W_in_to_hid=lasagne.init.HeUniform(),
+        W_hid_to_hid=lasagne.init.HeUniform(),
+        nonlinearity=lasagne.nonlinearities.tanh, only_return_final=True)
+    l_backward = lasagne.layers.RecurrentLayer(
+        l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
+        W_in_to_hid=lasagne.init.HeUniform(),
+        W_hid_to_hid=lasagne.init.HeUniform(),
+        nonlinearity=lasagne.nonlinearities.tanh,
+        only_return_final=True, backwards=True)
+    # Now, we'll concatenate the outputs to combine them.
+    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward])
+    # Our output layer is a simple dense connection, with 1 output unit
+    l_out = lasagne.layers.DenseLayer(
+        l_concat, num_units=1, nonlinearity=lasagne.nonlinearities.tanh)
+
+    target_values = T.vector('target_output')
+
+    # lasagne.layers.get_output produces a variable for the output of the net
+    network_output = lasagne.layers.get_output(l_out)
+    # The network output will have shape (n_batch, 1); let's flatten to get a
+    # 1-dimensional vector of predicted values
+    predicted_values = network_output.flatten()
+    # Our cost will be mean-squared error
+    cost = T.mean((predicted_values - target_values)**2)
+    # Retrieve all parameters from the network
+    all_params = lasagne.layers.get_all_params(l_out)
+    # Compute SGD updates for training
+    print("Computing updates ...")
+    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
+    # Theano functions for training and computing cost
+    print("Compiling functions ...")
+    train = theano.function([l_in.input_var, target_values, l_mask.input_var],
+                            cost, updates=updates)
+    compute_cost = theano.function(
+        [l_in.input_var, target_values, l_mask.input_var], cost)
+
+    # We'll use this "validation set" to periodically check progress
+    X_val, y_val, mask_val = gen_data()
+
+    print("Training ...")
+    try:
+        for epoch in range(num_epochs):
+            for _ in range(EPOCH_SIZE):
+                X, y, m = gen_data()
+                train(X, y, m)
+            cost_val = compute_cost(X_val, y_val, mask_val)
+            print("Epoch {} validation cost = {}".format(epoch, cost_val))
+    except KeyboardInterrupt:
+        pass
+
+if __name__ == '__main__':
+    main()
diff --git a/lasagne/__init__.py b/lasagne/__init__.py
new file mode 100644
index 0000000..107f711
--- /dev/null
+++ b/lasagne/__init__.py
@@ -0,0 +1,34 @@
+"""
+Tools to train neural nets in Theano
+"""
+
+try:
+    install_instr = """
+
+Please make sure you install a recent enough version of Theano. Note that a
+simple 'pip install theano' will usually give you a version that is too old
+for Lasagne. See the installation docs for more details:
+http://lasagne.readthedocs.org/en/latest/user/installation.html#theano"""
+    import theano
+except ImportError:  # pragma: no cover
+    raise ImportError("Could not import Theano." + install_instr)
+else:
+    try:
+        import theano.tensor.signal.pool
+    except ImportError:  # pragma: no cover
+        raise ImportError("Your Theano version is too old." + install_instr)
+    del install_instr
+    del theano
+
+
+from . import nonlinearities
+from . import init
+from . import layers
+from . import objectives
+from . import random
+from . import regularization
+from . import updates
+from . import utils
+
+
+__version__ = "0.2.dev1"
diff --git a/lasagne/conftest.py b/lasagne/conftest.py
new file mode 100644
index 0000000..39ffa9f
--- /dev/null
+++ b/lasagne/conftest.py
@@ -0,0 +1,12 @@
+ignore_test_paths = [
+    "*/layers/corrmm.py",
+    "*/layers/cuda_convnet.py",
+    "*/layers/dnn.py",
+    ]
+
+
+def pytest_ignore_collect(path, config):
+    """Ignore paths that would otherwise be collceted by the doctest
+    plugin and lead to ImportError due to missing dependencies.
+    """
+    return any(path.fnmatch(ignore) for ignore in ignore_test_paths)
diff --git a/lasagne/init.py b/lasagne/init.py
new file mode 100644
index 0000000..348ddc1
--- /dev/null
+++ b/lasagne/init.py
@@ -0,0 +1,367 @@
+"""
+Functions to create initializers for parameter variables.
+
+Examples
+--------
+>>> from lasagne.layers import DenseLayer
+>>> from lasagne.init import Constant, GlorotUniform
+>>> l1 = DenseLayer((100,20), num_units=50,
+...                 W=GlorotUniform('relu'), b=Constant(0.0))
+"""
+
+import numpy as np
+
+from .utils import floatX
+from .random import get_rng
+
+
+class Initializer(object):
+    """Base class for parameter tensor initializers.
+
+    The :class:`Initializer` class represents a weight initializer used
+    to initialize weight parameters in a neural network layer. It should be
+    subclassed when implementing new types of weight initializers.
+
+    """
+    def __call__(self, shape):
+        """
+        Makes :class:`Initializer` instances callable like a function, invoking
+        their :meth:`sample()` method.
+        """
+        return self.sample(shape)
+
+    def sample(self, shape):
+        """
+        Sample should return a numpy array of the given shape and data type
+        ``theano.config.floatX``.
+
+        Parameters
+        ----------
+        shape : tuple or int
+            Integer or tuple specifying the size of the returned
+            array.
+
+        Returns
+        -------
+        numpy.ndarray
+            Array of size ``shape`` and dtype ``theano.config.floatX``.
+        """
+        raise NotImplementedError()
+
+
+class Normal(Initializer):
+    """Sample initial weights from the Gaussian distribution.
+
+    Initial weight parameters are sampled from N(mean, std).
+
+    Parameters
+    ----------
+    std : float
+        Std of initial parameters.
+    mean : float
+        Mean of initial parameters.
+    """
+    def __init__(self, std=0.01, mean=0.0):
+        self.std = std
+        self.mean = mean
+
+    def sample(self, shape):
+        return floatX(get_rng().normal(self.mean, self.std, size=shape))
+
+
+class Uniform(Initializer):
+    """Sample initial weights from the uniform distribution.
+
+    Parameters are sampled from U(a, b).
+
+    Parameters
+    ----------
+    range : float or tuple
+        When std is None then range determines a, b. If range is a float the
+        weights are sampled from U(-range, range). If range is a tuple the
+        weights are sampled from U(range[0], range[1]).
+    std : float or None
+        If std is a float then the weights are sampled from
+        U(mean - np.sqrt(3) * std, mean + np.sqrt(3) * std).
+    mean : float
+        see std for description.
+    """
+    def __init__(self, range=0.01, std=None, mean=0.0):
+        if std is not None:
+            a = mean - np.sqrt(3) * std
+            b = mean + np.sqrt(3) * std
+        else:
+            try:
+                a, b = range  # range is a tuple
+            except TypeError:
+                a, b = -range, range  # range is a number
+
+        self.range = (a, b)
+
+    def sample(self, shape):
+        return floatX(get_rng().uniform(
+            low=self.range[0], high=self.range[1], size=shape))
+
+
+class Glorot(Initializer):
+    """Glorot weight initialization.
+
+    This is also known as Xavier initialization [1]_.
+
+    Parameters
+    ----------
+    initializer : lasagne.init.Initializer
+        Initializer used to sample the weights, must accept `std` in its
+        constructor to sample from a distribution with a given standard
+        deviation.
+    gain : float or 'relu'
+        Scaling factor for the weights. Set this to ``1.0`` for linear and
+        sigmoid units, to 'relu' or ``sqrt(2)`` for rectified linear units, and
+        to ``sqrt(2/(1+alpha**2))`` for leaky rectified linear units with
+        leakiness ``alpha``. Other transfer functions may need different
+        factors.
+    c01b : bool
+        For a :class:`lasagne.layers.cuda_convnet.Conv2DCCLayer` constructed
+        with ``dimshuffle=False``, `c01b` must be set to ``True`` to compute
+        the correct fan-in and fan-out.
+
+    References
+    ----------
+    .. [1] Xavier Glorot and Yoshua Bengio (2010):
+           Understanding the difficulty of training deep feedforward neural
+           networks. International conference on artificial intelligence and
+           statistics.
+
+    Notes
+    -----
+    For a :class:`DenseLayer <lasagne.layers.DenseLayer>`, if ``gain='relu'``
+    and ``initializer=Uniform``, the weights are initialized as
+
+    .. math::
+       a &= \\sqrt{\\frac{12}{fan_{in}+fan_{out}}}\\\\
+       W &\\sim U[-a, a]
+
+    If ``gain=1`` and ``initializer=Normal``, the weights are initialized as
+
+    .. math::
+       \\sigma &= \\sqrt{\\frac{2}{fan_{in}+fan_{out}}}\\\\
+       W &\\sim N(0, \\sigma)
+
+    See Also
+    --------
+    GlorotNormal  : Shortcut with Gaussian initializer.
+    GlorotUniform : Shortcut with uniform initializer.
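+
+    Examples
+    --------
+    For example, to use Glorot initialization scaled for rectified linear
+    units in a :class:`lasagne.layers.DenseLayer`:
+
+    >>> from lasagne.layers import DenseLayer
+    >>> from lasagne.init import GlorotUniform
+    >>> l1 = DenseLayer((100, 20), num_units=50, W=GlorotUniform('relu'))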
+    """
+    def __init__(self, initializer, gain=1.0, c01b=False):
+        if gain == 'relu':
+            gain = np.sqrt(2)
+
+        self.initializer = initializer
+        self.gain = gain
+        self.c01b = c01b
+
+    def sample(self, shape):
+        if self.c01b:
+            if len(shape) != 4:
+                raise RuntimeError(
+                    "If c01b is True, only shapes of length 4 are accepted")
+
+            n1, n2 = shape[0], shape[3]
+            receptive_field_size = shape[1] * shape[2]
+        else:
+            if len(shape) < 2:
+                raise RuntimeError(
+                    "This initializer only works with shapes of length >= 2")
+
+            n1, n2 = shape[:2]
+            receptive_field_size = np.prod(shape[2:])
+
+        std = self.gain * np.sqrt(2.0 / ((n1 + n2) * receptive_field_size))
+        return self.initializer(std=std).sample(shape)
+
+
+class GlorotNormal(Glorot):
+    """Glorot with weights sampled from the Normal distribution.
+
+    See :class:`Glorot` for a description of the parameters.
+    """
+    def __init__(self, gain=1.0, c01b=False):
+        super(GlorotNormal, self).__init__(Normal, gain, c01b)
+
+
+class GlorotUniform(Glorot):
+    """Glorot with weights sampled from the Uniform distribution.
+
+    See :class:`Glorot` for a description of the parameters.
+    """
+    def __init__(self, gain=1.0, c01b=False):
+        super(GlorotUniform, self).__init__(Uniform, gain, c01b)
+
+
+class He(Initializer):
+    """He weight initialization.
+
+    Weights are initialized with a standard deviation of
+    :math:`\\sigma = gain \\sqrt{\\frac{1}{fan_{in}}}` [1]_.
+
+    Parameters
+    ----------
+    initializer : lasagne.init.Initializer
+        Initializer used to sample the weights, must accept `std` in its
+        constructor to sample from a distribution with a given standard
+        deviation.
+    gain : float or 'relu'
+        Scaling factor for the weights. Set this to ``1.0`` for linear and
+        sigmoid units, to 'relu' or ``sqrt(2)`` for rectified linear units, and
+        to ``sqrt(2/(1+alpha**2))`` for leaky rectified linear units with
+        leakiness ``alpha``. Other transfer functions may need different
+        factors.
+    c01b : bool
+        For a :class:`lasagne.layers.cuda_convnet.Conv2DCCLayer` constructed
+        with ``dimshuffle=False``, `c01b` must be set to ``True`` to compute
+        the correct fan-in and fan-out.
+
+    References
+    ----------
+    .. [1] Kaiming He et al. (2015):
+           Delving deep into rectifiers: Surpassing human-level performance on
+           imagenet classification. arXiv preprint arXiv:1502.01852.
+
+    See Also
+    --------
+    HeNormal  : Shortcut with Gaussian initializer.
+    HeUniform : Shortcut with uniform initializer.
+    """
+    def __init__(self, initializer, gain=1.0, c01b=False):
+        if gain == 'relu':
+            gain = np.sqrt(2)
+
+        self.initializer = initializer
+        self.gain = gain
+        self.c01b = c01b
+
+    def sample(self, shape):
+        if self.c01b:
+            if len(shape) != 4:
+                raise RuntimeError(
+                    "If c01b is True, only shapes of length 4 are accepted")
+
+            fan_in = np.prod(shape[:3])
+        else:
+            if len(shape) == 2:
+                fan_in = shape[0]
+            elif len(shape) > 2:
+                fan_in = np.prod(shape[1:])
+            else:
+                raise RuntimeError(
+                    "This initializer only works with shapes of length >= 2")
+
+        std = self.gain * np.sqrt(1.0 / fan_in)
+        return self.initializer(std=std).sample(shape)
+
+
+class HeNormal(He):
+    """He initializer with weights sampled from the Normal distribution.
+
+    See :class:`He` for a description of the parameters.
+    """
+    def __init__(self, gain=1.0, c01b=False):
+        super(HeNormal, self).__init__(Normal, gain, c01b)
+
+
+class HeUniform(He):
+    """He initializer with weights sampled from the Uniform distribution.
+
+    See :class:`He` for a description of the parameters.
+    """
+    def __init__(self, gain=1.0, c01b=False):
+        super(HeUniform, self).__init__(Uniform, gain, c01b)
+
+
+class Constant(Initializer):
+    """Initialize weights with constant value.
+
+    Parameters
+    ----------
+    val : float
+        Constant value for weights.
+    """
+    def __init__(self, val=0.0):
+        self.val = val
+
+    def sample(self, shape):
+        return floatX(np.ones(shape) * self.val)
+
+
+class Sparse(Initializer):
+    """Initialize weights as sparse matrix.
+
+    Parameters
+    ----------
+    sparsity : float
+        Exact fraction of non-zero values per column. Larger values give less
+        sparsity.
+    std : float
+        Non-zero weights are sampled from N(0, std).
+    """
+    def __init__(self, sparsity=0.1, std=0.01):
+        self.sparsity = sparsity
+        self.std = std
+
+    def sample(self, shape):
+        if len(shape) != 2:
+            raise RuntimeError(
+                "sparse initializer only works with shapes of length 2")
+
+        w = floatX(np.zeros(shape))
+        n_inputs, n_outputs = shape
+        size = int(self.sparsity * n_inputs)  # fraction of number of inputs
+
+        for k in range(n_outputs):
+            indices = np.arange(n_inputs)
+            get_rng().shuffle(indices)
+            indices = indices[:size]
+            values = floatX(get_rng().normal(0.0, self.std, size=size))
+            w[indices, k] = values
+
+        return w
+
+
+class Orthogonal(Initializer):
+    """Intialize weights as Orthogonal matrix.
+
+    Orthogonal matrix initialization [1]_. For n-dimensional shapes where
+    n > 2, the n-1 trailing axes are flattened. For convolutional layers, this
+    corresponds to the fan-in, so this makes the initialization usable for
+    both dense and convolutional layers.
+
+    Parameters
+    ----------
+    gain : float or 'relu'
+        Scaling factor for the weights. Set this to ``1.0`` for linear and
+        sigmoid units, to 'relu' or ``sqrt(2)`` for rectified linear units, and
+        to ``sqrt(2/(1+alpha**2))`` for leaky rectified linear units with
+        leakiness ``alpha``. Other transfer functions may need different
+        factors.
+
+    References
+    ----------
+    .. [1] Saxe, Andrew M., James L. McClelland, and Surya Ganguli.
+           "Exact solutions to the nonlinear dynamics of learning in deep
+           linear neural networks." arXiv preprint arXiv:1312.6120 (2013).
+    """
+    def __init__(self, gain=1.0):
+        if gain == 'relu':
+            gain = np.sqrt(2)
+
+        self.gain = gain
+
+    def sample(self, shape):
+        if len(shape) < 2:
+            raise RuntimeError("Only shapes of length 2 or more are "
+                               "supported.")
+
+        flat_shape = (shape[0], np.prod(shape[1:]))
+        a = get_rng().normal(0.0, 1.0, flat_shape)
+        u, _, v = np.linalg.svd(a, full_matrices=False)
+        # pick the one with the correct shape
+        q = u if u.shape == flat_shape else v
+        q = q.reshape(shape)
+        return floatX(self.gain * q)
diff --git a/lasagne/layers/__init__.py b/lasagne/layers/__init__.py
new file mode 100644
index 0000000..f11c37e
--- /dev/null
+++ b/lasagne/layers/__init__.py
@@ -0,0 +1,13 @@
+from .base import *
+from .helper import *
+from .input import *
+from .dense import *
+from .noise import *
+from .conv import *
+from .pool import *
+from .shape import *
+from .merge import *
+from .normalization import *
+from .embedding import *
+from .recurrent import *
+from .special import *
diff --git a/lasagne/layers/base.py b/lasagne/layers/base.py
new file mode 100644
index 0000000..868df84
--- /dev/null
+++ b/lasagne/layers/base.py
@@ -0,0 +1,328 @@
+from collections import OrderedDict
+
+import theano.tensor as T
+
+from .. import utils
+
+
+__all__ = [
+    "Layer",
+    "MergeLayer",
+]
+
+
+# Layer base class
+
+class Layer(object):
+    """
+    The :class:`Layer` class represents a single layer of a neural network. It
+    should be subclassed when implementing new types of layers.
+
+    Because each layer can keep track of the layer(s) feeding into it, a
+    network's output :class:`Layer` instance can double as a handle to the full
+    network.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    name : a string or None
+        An optional name to attach to this layer.
+    """
+    def __init__(self, incoming, name=None):
+        if isinstance(incoming, tuple):
+            self.input_shape = incoming
+            self.input_layer = None
+        else:
+            self.input_shape = incoming.output_shape
+            self.input_layer = incoming
+
+        self.name = name
+        self.params = OrderedDict()
+        self.get_output_kwargs = []
+
+        if any(d is not None and d <= 0 for d in self.input_shape):
+            raise ValueError((
+                "Cannot create Layer with a non-positive input_shape "
+                "dimension. input_shape=%r, self.name=%r") % (
+                    self.input_shape, self.name))
+
+    @property
+    def output_shape(self):
+        shape = self.get_output_shape_for(self.input_shape)
+        if any(isinstance(s, T.Variable) for s in shape):
+            raise ValueError("%s returned a symbolic output shape from its "
+                             "get_output_shape_for() method: %r. This is not "
+                             "allowed; shapes must be tuples of integers for "
+                             "fixed-size dimensions and Nones for variable "
+                             "dimensions." % (self.__class__.__name__, shape))
+        return shape
+
+    def get_params(self, unwrap_shared=True, **tags):
+        """
+        Returns a list of Theano shared variables or expressions that
+        parameterize the layer.
+
+        By default, all shared variables that participate in the forward pass
+        will be returned (in the order they were registered in the Layer's
+        constructor via :meth:`add_param()`). The list can optionally be
+        filtered by specifying tags as keyword arguments. For example,
+        ``trainable=True`` will only return trainable parameters, and
+        ``regularizable=True`` will only return parameters that can be
+        regularized (e.g., by L2 decay).
+
+        If any of the layer's parameters was set to a Theano expression instead
+        of a shared variable, `unwrap_shared` controls whether to return the
+        shared variables involved in that expression (``unwrap_shared=True``,
+        the default), or the expression itself (``unwrap_shared=False``). In
+        either case, tag filtering applies to the expressions, considering all
+        variables within an expression to be tagged the same.
+
+        Parameters
+        ----------
+        unwrap_shared : bool (default: True)
+            Affects only parameters that were set to a Theano expression. If
+            ``True`` the function returns the shared variables contained in
+            the expression, otherwise the Theano expression itself.
+
+        **tags (optional)
+            tags can be specified to filter the list. Specifying ``tag1=True``
+            will limit the list to parameters that are tagged with ``tag1``.
+            Specifying ``tag1=False`` will limit the list to parameters that
+            are not tagged with ``tag1``. Commonly used tags are
+            ``regularizable`` and ``trainable``.
+
+        Returns
+        -------
+        list of Theano shared variables or expressions
+            A list of variables that parameterize the layer
+
+        Notes
+        -----
+        For layers without any parameters, this will return an empty list.
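+
+        Examples
+        --------
+        For a typical layer with a weight matrix ``W`` (tagged ``trainable``
+        and ``regularizable``) and a bias vector ``b`` (tagged ``trainable``
+        only), tag filtering works as follows:
+
+        >>> from lasagne.layers import DenseLayer
+        >>> l = DenseLayer((10, 20), num_units=30)
+        >>> len(l.get_params())                    # W and b
+        2
+        >>> len(l.get_params(regularizable=True))  # only W
+        1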
+        """
+        result = list(self.params.keys())
+
+        only = set(tag for tag, value in tags.items() if value)
+        if only:
+            # retain all parameters that have all of the tags in `only`
+            result = [param for param in result
+                      if not (only - self.params[param])]
+
+        exclude = set(tag for tag, value in tags.items() if not value)
+        if exclude:
+            # retain all parameters that have none of the tags in `exclude`
+            result = [param for param in result
+                      if not (self.params[param] & exclude)]
+
+        if unwrap_shared:
+            return utils.collect_shared_vars(result)
+        else:
+            return result
+
+    def get_output_shape_for(self, input_shape):
+        """
+        Computes the output shape of this layer, given an input shape.
+
+        Parameters
+        ----------
+        input_shape : tuple
+            A tuple representing the shape of the input. The tuple should have
+            as many elements as there are input dimensions, and the elements
+            should be integers or `None`.
+
+        Returns
+        -------
+        tuple
+            A tuple representing the shape of the output of this layer. The
+            tuple has as many elements as there are output dimensions, and the
+            elements are all either integers or `None`.
+
+        Notes
+        -----
+        This method will typically be overridden when implementing a new
+        :class:`Layer` class. By default it simply returns the input
+        shape. This means that a layer that does not modify the shape
+        (e.g. because it applies an elementwise operation) does not need
+        to override this method.
+        """
+        return input_shape
+
+    def get_output_for(self, input, **kwargs):
+        """
+        Propagates the given input through this layer (and only this layer).
+
+        Parameters
+        ----------
+        input : Theano expression
+            The expression to propagate through this layer.
+
+        Returns
+        -------
+        output : Theano expression
+            The output of this layer given the input to this layer.
+
+
+        Notes
+        -----
+        This is called by :func:`lasagne.layers.get_output()`
+        to propagate data through a network.
+
+        This method should be overridden when implementing a new
+        :class:`Layer` class. By default it raises `NotImplementedError`.
+        """
+        raise NotImplementedError
+
+    def add_param(self, spec, shape, name=None, **tags):
+        """
+        Register and possibly initialize a parameter tensor for the layer.
+
+        When defining a layer class, this method is called in the constructor
+        to define which parameters the layer has, what their shapes are, how
+        they should be initialized and what tags are associated with them.
+        This allows layer classes to transparently support parameter
+        initialization from numpy arrays and callables, as well as setting
+        parameters to existing Theano shared variables or Theano expressions.
+
+        All registered parameters are stored along with their tags in the
+        ordered dictionary :attr:`Layer.params`, and can be retrieved with
+        :meth:`Layer.get_params()`, optionally filtered by their tags.
+
+        Parameters
+        ----------
+        spec : Theano shared variable, expression, numpy array or callable
+            initial value, expression or initializer for this parameter.
+            See :func:`lasagne.utils.create_param` for more information.
+
+        shape : tuple of int
+            a tuple of integers representing the desired shape of the
+            parameter tensor.
+
+        name : str (optional)
+            a descriptive name for the parameter variable. This will be passed
+            to ``theano.shared`` when the variable is created, prefixed by the
+            layer's name if any (in the form ``'layer_name.param_name'``). If
+            ``spec`` is already a shared variable or expression, this parameter
+            will be ignored to avoid overwriting an existing name.
+
+        **tags (optional)
+            tags associated with the parameter can be specified as keyword
+            arguments. To associate the tag ``tag1`` with the parameter, pass
+            ``tag1=True``.
+
+            By default, the tags ``regularizable`` and ``trainable`` are
+            associated with the parameter. Pass ``regularizable=False`` or
+            ``trainable=False`` respectively to prevent this.
+
+        Returns
+        -------
+        Theano shared variable or Theano expression
+            the resulting parameter variable or parameter expression
+
+        Notes
+        -----
+        It is recommended to assign the resulting parameter variable/expression
+        to an attribute of the layer for easy access, for example:
+
+        >>> self.W = self.add_param(W, (2, 3), name='W')  #doctest: +SKIP
+        """
+        # prefix the param name with the layer name if it exists
+        if name is not None:
+            if self.name is not None:
+                name = "%s.%s" % (self.name, name)
+        # create shared variable, or pass through given variable/expression
+        param = utils.create_param(spec, shape, name)
+        # parameters should be trainable and regularizable by default
+        tags['trainable'] = tags.get('trainable', True)
+        tags['regularizable'] = tags.get('regularizable', True)
+        self.params[param] = set(tag for tag, value in tags.items() if value)
+
+        return param
+
+
+class MergeLayer(Layer):
+    """
+    This class represents a layer that aggregates input from multiple layers.
+    It should be subclassed when implementing new types of layers that obtain
+    their input from multiple layers.
+
+    Parameters
+    ----------
+    incomings : a list of :class:`Layer` instances or tuples
+        The layers feeding into this layer, or expected input shapes.
+    name : a string or None
+        An optional name to attach to this layer.
+    """
+    def __init__(self, incomings, name=None):
+        self.input_shapes = [incoming if isinstance(incoming, tuple)
+                             else incoming.output_shape
+                             for incoming in incomings]
+        self.input_layers = [None if isinstance(incoming, tuple)
+                             else incoming
+                             for incoming in incomings]
+        self.name = name
+        self.params = OrderedDict()
+        self.get_output_kwargs = []
+
+    @Layer.output_shape.getter
+    def output_shape(self):
+        shape = self.get_output_shape_for(self.input_shapes)
+        if any(isinstance(s, T.Variable) for s in shape):
+            raise ValueError("%s returned a symbolic output shape from its "
+                             "get_output_shape_for() method: %r. This is not "
+                             "allowed; shapes must be tuples of integers for "
+                             "fixed-size dimensions and Nones for variable "
+                             "dimensions." % (self.__class__.__name__, shape))
+        return shape
+
+    def get_output_shape_for(self, input_shapes):
+        """
+        Computes the output shape of this layer, given a list of input shapes.
+
+        Parameters
+        ----------
+        input_shapes : list of tuple
+            A list of tuples, with each tuple representing the shape of one of
+            the inputs (in the correct order). These tuples should have as many
+            elements as there are input dimensions, and the elements should be
+            integers or `None`.
+
+        Returns
+        -------
+        tuple
+            A tuple representing the shape of the output of this layer. The
+            tuple has as many elements as there are output dimensions, and the
+            elements are all either integers or `None`.
+
+        Notes
+        -----
+        This method must be overridden when implementing a new
+        :class:`Layer` class with multiple inputs. By default it raises
+        `NotImplementedError`.
+        """
+        raise NotImplementedError
+
+    def get_output_for(self, inputs, **kwargs):
+        """
+        Propagates the given inputs through this layer (and only this layer).
+
+        Parameters
+        ----------
+        inputs : list of Theano expressions
+            The Theano expressions to propagate through this layer.
+
+        Returns
+        -------
+        Theano expression
+            The output of this layer given the inputs to this layer.
+
+        Notes
+        -----
+        This is called by :func:`lasagne.layers.get_output()` to propagate
+        data through a network.
+
+        This method should be overridden when implementing a new
+        :class:`Layer` class with multiple inputs. By default it raises
+        `NotImplementedError`.
+        """
+        raise NotImplementedError
diff --git a/lasagne/layers/conv.py b/lasagne/layers/conv.py
new file mode 100644
index 0000000..9288d23
--- /dev/null
+++ b/lasagne/layers/conv.py
@@ -0,0 +1,934 @@
+import theano.tensor as T
+
+from .. import init
+from .. import nonlinearities
+from ..utils import as_tuple
+from ..theano_extensions import conv, padding
+
+from .base import Layer
+
+
+__all__ = [
+    "Conv1DLayer",
+    "Conv2DLayer",
+    "TransposedConv2DLayer",
+    "Deconv2DLayer",
+    "DilatedConv2DLayer",
+]
+
+
+def conv_output_length(input_length, filter_size, stride, pad=0):
+    """Helper function to compute the output size of a convolution operation
+
+    This function computes the length along a single axis, which corresponds
+    to a 1D convolution. It can also be used for convolutions with higher
+    dimensionalities by using it individually for each axis.
+
+    Parameters
+    ----------
+    input_length : int or None
+        The size of the input.
+
+    filter_size : int
+        The size of the filter.
+
+    stride : int
+        The stride of the convolution operation.
+
+    pad : int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        both borders.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size on both sides (one less on
+        the second side for an even filter size). When ``stride=1``, this
+        results in an output size equal to the input size.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+    Returns
+    -------
+    int or None
+        The output size corresponding to the given convolution parameters, or
+        ``None`` if `input_length` is ``None``.
+
+    Raises
+    ------
+    ValueError
+        When an invalid padding is specified, a `ValueError` is raised.
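+
+    Examples
+    --------
+    As a quick illustration (the values follow directly from the
+    arithmetic described above):
+
+    >>> conv_output_length(10, 3, stride=1, pad=0)
+    8
+    >>> conv_output_length(10, 3, stride=1, pad='full')
+    12
+    >>> conv_output_length(10, 3, stride=2, pad='same')
+    5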
+    """
+    if input_length is None:
+        return None
+    if pad == 'valid':
+        output_length = input_length - filter_size + 1
+    elif pad == 'full':
+        output_length = input_length + filter_size - 1
+    elif pad == 'same':
+        output_length = input_length
+    elif isinstance(pad, int):
+        output_length = input_length + 2 * pad - filter_size + 1
+    else:
+        raise ValueError('Invalid pad: {0}'.format(pad))
+
+    # This is the integer arithmetic equivalent to
+    # np.ceil(output_length / stride)
+    output_length = (output_length + stride - 1) // stride
+
+    return output_length
+
+
+def conv_input_length(output_length, filter_size, stride, pad=0):
+    """Helper function to compute the input size of a convolution operation
+
+    This function computes the length along a single axis, which corresponds
+    to a 1D convolution. It can also be used for convolutions with higher
+    dimensionalities by using it individually for each axis.
+
+    Parameters
+    ----------
+    output_length : int or None
+        The size of the output.
+
+    filter_size : int
+        The size of the filter.
+
+    stride : int
+        The stride of the convolution operation.
+
+    pad : int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        both borders.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size on both sides (one less on
+        the second side for an even filter size). When ``stride=1``, this
+        results in an output size equal to the input size.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+    Returns
+    -------
+    int or None
+        The smallest input size corresponding to the given convolution
+        parameters for the given output size, or ``None`` if `output_length`
+        is ``None``. For a strided convolution, any input size of up to
+        ``stride - 1`` elements larger than returned will still give the
+        same output size.
+
+    Raises
+    ------
+    ValueError
+        When an invalid padding is specified, a `ValueError` is raised.
+
+    Notes
+    -----
+    This can be used to compute the output size of a convolution backward pass,
+    also called transposed convolution, fractionally-strided convolution or
+    (wrongly) deconvolution in the literature.
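+
+    Examples
+    --------
+    As a quick illustration (the values follow from the relation described
+    above and are the smallest input sizes that yield the given output):
+
+    >>> conv_input_length(8, 3, stride=1, pad=0)
+    10
+    >>> conv_input_length(5, 3, stride=2, pad='same')
+    9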
+    """
+    if output_length is None:
+        return None
+    if pad == 'valid':
+        pad = 0
+    elif pad == 'full':
+        pad = filter_size - 1
+    elif pad == 'same':
+        pad = filter_size // 2
+    if not isinstance(pad, int):
+        raise ValueError('Invalid pad: {0}'.format(pad))
+    return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+class BaseConvLayer(Layer):
+    """
+    lasagne.layers.BaseConvLayer(incoming, num_filters, filter_size,
+    stride=1, pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=True,
+    n=None, **kwargs)
+
+    Convolutional layer base class
+
+    Base class for performing an `n`-dimensional convolution on its input,
+    optionally adding a bias and applying an elementwise nonlinearity. Note
+    that this class cannot be used in a Lasagne network, only its subclasses
+    can (e.g., :class:`Conv1DLayer`, :class:`Conv2DLayer`).
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        input should be a tensor of 2+`n` dimensions:
+        ``(batch_size, num_input_channels, <n spatial dimensions>)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or an `n`-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or an `n`-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders, a tuple of `n` integers allows different symmetric padding
+        per dimension.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be an
+        `n`-dimensional tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a tensor of 2+`n` dimensions with shape
+        ``(num_filters, num_input_channels, <n spatial dimensions>)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, <n spatial dimensions>)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: True)
+        Whether to flip the filters before sliding them over the input,
+        performing a convolution (this is the default), or not to flip them and
+        perform a correlation. Note that for some other convolutional layers in
+        Lasagne, flipping incurs an overhead and is disabled by default --
+        check the documentation when using learned weights from another layer.
+
+    n : int or None
+        The dimensionality of the convolution (i.e., the number of spatial
+        dimensions of each feature map and each convolutional filter). If
+        ``None``, will be inferred from the input shape.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
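+
+    Examples
+    --------
+    A minimal sketch of a hypothetical subclass that only provides
+    :meth:`convolve`, here simply delegating to Theano's 2D convolution
+    (essentially what :class:`Conv2DLayer` does):
+
+    >>> import theano.tensor as T
+    >>> class SketchConv2DLayer(BaseConvLayer):
+    ...     def convolve(self, input, **kwargs):
+    ...         border_mode = 'half' if self.pad == 'same' else self.pad
+    ...         return T.nnet.conv2d(
+    ...             input, self.W, self.input_shape, self.get_W_shape(),
+    ...             subsample=self.stride, border_mode=border_mode,
+    ...             filter_flip=self.flip_filters)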
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=1, pad=0,
+                 untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, flip_filters=True,
+                 n=None, **kwargs):
+        super(BaseConvLayer, self).__init__(incoming, **kwargs)
+        if nonlinearity is None:
+            self.nonlinearity = nonlinearities.identity
+        else:
+            self.nonlinearity = nonlinearity
+
+        if n is None:
+            n = len(self.input_shape) - 2
+        elif n != len(self.input_shape) - 2:
+            raise ValueError("Tried to create a %dD convolution layer with "
+                             "input shape %r. Expected %d input dimensions "
+                             "(batchsize, channels, %d spatial dimensions)." %
+                             (n, self.input_shape, n+2, n))
+        self.n = n
+        self.num_filters = num_filters
+        self.filter_size = as_tuple(filter_size, n, int)
+        self.flip_filters = flip_filters
+        self.stride = as_tuple(stride, n, int)
+        self.untie_biases = untie_biases
+
+        if pad == 'same':
+            if any(s % 2 == 0 for s in self.filter_size):
+                raise NotImplementedError(
+                    '`same` padding requires odd filter size.')
+        if pad == 'valid':
+            self.pad = as_tuple(0, n)
+        elif pad in ('full', 'same'):
+            self.pad = pad
+        else:
+            self.pad = as_tuple(pad, n, int)
+
+        self.W = self.add_param(W, self.get_W_shape(), name="W")
+        if b is None:
+            self.b = None
+        else:
+            if self.untie_biases:
+                biases_shape = (num_filters,) + self.output_shape[2:]
+            else:
+                biases_shape = (num_filters,)
+            self.b = self.add_param(b, biases_shape, name="b",
+                                    regularizable=False)
+
+    def get_W_shape(self):
+        """Get the shape of the weight matrix `W`.
+
+        Returns
+        -------
+        tuple of int
+            The shape of the weight matrix.
+        """
+        num_input_channels = self.input_shape[1]
+        return (self.num_filters, num_input_channels) + self.filter_size
+
+    def get_output_shape_for(self, input_shape):
+        pad = self.pad if isinstance(self.pad, tuple) else (self.pad,) * self.n
+        batchsize = input_shape[0]
+        return ((batchsize, self.num_filters) +
+                tuple(conv_output_length(input, filter, stride, p)
+                      for input, filter, stride, p
+                      in zip(input_shape[2:], self.filter_size,
+                             self.stride, pad)))
+
+    def get_output_for(self, input, **kwargs):
+        conved = self.convolve(input, **kwargs)
+
+        if self.b is None:
+            activation = conved
+        elif self.untie_biases:
+            activation = conved + T.shape_padleft(self.b, 1)
+        else:
+            activation = conved + self.b.dimshuffle(('x', 0) + ('x',) * self.n)
+
+        return self.nonlinearity(activation)
+
+    def convolve(self, input, **kwargs):
+        """
+        Symbolically convolves `input` with ``self.W``, producing an output of
+        shape ``self.output_shape``. To be implemented by subclasses.
+
+        Parameters
+        ----------
+        input : Theano tensor
+            The input minibatch to convolve
+        **kwargs
+            Any additional keyword arguments from :meth:`get_output_for`
+
+        Returns
+        -------
+        Theano tensor
+            `input` convolved according to the configuration of this layer,
+            without any bias or nonlinearity applied.
+        """
+        raise NotImplementedError("BaseConvLayer does not implement the "
+                                  "convolve() method. You will want to "
+                                  "use a subclass such as Conv2DLayer.")
+
+
+class Conv1DLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv1DLayer(incoming, num_filters, filter_size, stride=1,
+    pad=0, untie_biases=False, W=lasagne.init.GlorotUniform(),
+    b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify,
+    flip_filters=True, convolution=lasagne.theano_extensions.conv.conv1d_mc0,
+    **kwargs)
+
+    1D convolutional layer
+
+    Performs a 1D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 3D tensor, with shape
+        ``(batch_size, num_input_channels, input_length)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 1-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 1-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        An integer or a 1-element tuple results in symmetric zero-padding of
+        the given size on both borders.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        matrix (2D).
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 3D tensor with shape
+        ``(num_filters, num_input_channels, filter_length)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, input_length)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: True)
+        Whether to flip the filters before sliding them over the input,
+        performing a convolution (this is the default), or not to flip them and
+        perform a correlation. Note that for some other convolutional layers in
+        Lasagne, flipping incurs an overhead and is disabled by default --
+        check the documentation when using learned weights from another layer.
+
+    convolution : callable
+        The convolution implementation to use. The
+        `lasagne.theano_extensions.conv` module provides some alternative
+        implementations for 1D convolutions, because the Theano API only
+        features a 2D convolution implementation. Usually it should be fine
+        to leave this at the default value. Note that not all implementations
+        support all settings for `pad` and `subsample`.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
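+
+    Examples
+    --------
+    A small usage sketch (the shapes below are only illustrative):
+
+    >>> from lasagne.layers import InputLayer, Conv1DLayer
+    >>> l_in = InputLayer((None, 8, 100))
+    >>> l_conv = Conv1DLayer(l_in, num_filters=16, filter_size=5)
+    >>> l_conv.output_shape
+    (None, 16, 96)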
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=1,
+                 pad=0, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, flip_filters=True,
+                 convolution=conv.conv1d_mc0, **kwargs):
+        super(Conv1DLayer, self).__init__(incoming, num_filters, filter_size,
+                                          stride, pad, untie_biases, W, b,
+                                          nonlinearity, flip_filters, n=1,
+                                          **kwargs)
+        self.convolution = convolution
+
+    def convolve(self, input, **kwargs):
+        border_mode = 'half' if self.pad == 'same' else self.pad
+        conved = self.convolution(input, self.W,
+                                  self.input_shape, self.get_W_shape(),
+                                  subsample=self.stride,
+                                  border_mode=border_mode,
+                                  filter_flip=self.flip_filters)
+        return conved
+
+
+class Conv2DLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv2DLayer(incoming, num_filters, filter_size,
+    stride=(1, 1), pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=True,
+    convolution=theano.tensor.nnet.conv2d, **kwargs)
+
+    2D convolutional layer
+
+    Performs a 2D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 2-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders, a tuple of two integers allows different symmetric padding
+        per dimension.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_filters, num_input_channels, filter_rows, filter_columns)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: True)
+        Whether to flip the filters before sliding them over the input,
+        performing a convolution (this is the default), or not to flip them and
+        perform a correlation. Note that for some other convolutional layers in
+        Lasagne, flipping incurs an overhead and is disabled by default --
+        check the documentation when using learned weights from another layer.
+
+    convolution : callable
+        The convolution implementation to use. Usually it should be fine to
+        leave this at the default value.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
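+
+    Examples
+    --------
+    A small usage sketch (the shapes below are only illustrative):
+
+    >>> from lasagne.layers import InputLayer, Conv2DLayer
+    >>> l_in = InputLayer((None, 3, 32, 32))
+    >>> l_conv = Conv2DLayer(l_in, 16, (3, 3), pad='same')
+    >>> l_conv.output_shape
+    (None, 16, 32, 32)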
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
+                 pad=0, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, flip_filters=True,
+                 convolution=T.nnet.conv2d, **kwargs):
+        super(Conv2DLayer, self).__init__(incoming, num_filters, filter_size,
+                                          stride, pad, untie_biases, W, b,
+                                          nonlinearity, flip_filters, n=2,
+                                          **kwargs)
+        self.convolution = convolution
+
+    def convolve(self, input, **kwargs):
+        border_mode = 'half' if self.pad == 'same' else self.pad
+        conved = self.convolution(input, self.W,
+                                  self.input_shape, self.get_W_shape(),
+                                  subsample=self.stride,
+                                  border_mode=border_mode,
+                                  filter_flip=self.flip_filters)
+        return conved
+
+# TODO: add Conv3DLayer
+
+
+class TransposedConv2DLayer(BaseConvLayer):
+    """
+    lasagne.layers.TransposedConv2DLayer(incoming, num_filters, filter_size,
+    stride=(1, 1), crop=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False, **kwargs)
+
+    2D transposed convolution layer
+
+    Performs the backward pass of a 2D convolution (also called transposed
+    convolution, fractionally-strided convolution or deconvolution in the
+    literature) on its input and optionally adds a bias and applies an
+    elementwise nonlinearity.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 2-element tuple specifying the stride of the
+        transposed convolution operation. For the transposed convolution, this
+        gives the dilation factor for the input -- increasing it increases the
+        output size.
+
+    crop : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the transposed convolution is computed where the input and
+        the filter overlap by at least one position (a full convolution). When
+        ``stride=1``, this yields an output that is larger than the input by
+        ``filter_size - 1``. It can be thought of as a valid convolution padded
+        with zeros. The `crop` argument allows you to decrease the amount of
+        this zero-padding, reducing the output size. It is the counterpart to
+        the `pad` argument in a non-transposed convolution.
+
+        A single integer results in symmetric cropping of the given size on all
+        borders, a tuple of two integers allows different symmetric cropping
+        per dimension.
+
+        ``'full'`` disables zero-padding. It is equivalent to computing the
+        convolution wherever the input and the filter fully overlap.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no cropping / a full convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_input_channels, num_filters, filter_rows, filter_columns)``.
+        Note that the first two dimensions are swapped compared to a
+        non-transposed convolution.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters before sliding them over the input,
+        performing a convolution, or not to flip them and perform a
+        correlation (this is the default). Note that this flag is inverted
+        compared to a non-transposed convolution.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
+
+    Notes
+    -----
+    The transposed convolution is implemented as the backward pass of a
+    corresponding non-transposed convolution. It can be thought of as dilating
+    the input (by adding ``stride - 1`` zeros between adjacent input elements),
+    padding it with ``filter_size - 1 - crop`` zeros, and cross-correlating it
+    with the filters. See [1]_ for more background.
+
+    Examples
+    --------
+    To transpose an existing convolution, with tied filter weights:
+
+    >>> from lasagne.layers import Conv2DLayer, TransposedConv2DLayer
+    >>> conv = Conv2DLayer((None, 1, 32, 32), 16, 3, stride=2, pad=2)
+    >>> deconv = TransposedConv2DLayer(conv, conv.input_shape[1],
+    ...         conv.filter_size, stride=conv.stride, crop=conv.pad,
+    ...         W=conv.W, flip_filters=not conv.flip_filters)
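+
+    The output shape of ``deconv`` is then the smallest input shape that
+    is compatible with the output shape of ``conv`` under this stride
+    (31 rather than 32 here):
+
+    >>> deconv.output_shape
+    (None, 1, 31, 31)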
+
+    References
+    ----------
+    .. [1] Vincent Dumoulin, Francesco Visin (2016):
+           A guide to convolution arithmetic for deep learning. arXiv.
+           http://arxiv.org/abs/1603.07285,
+           https://github.com/vdumoulin/conv_arithmetic
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
+                 crop=0, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, flip_filters=False,
+                 **kwargs):
+        super(TransposedConv2DLayer, self).__init__(
+                incoming, num_filters, filter_size, stride, crop, untie_biases,
+                W, b, nonlinearity, flip_filters, n=2, **kwargs)
+        # rename self.pad to self.crop:
+        self.crop = self.pad
+        del self.pad
+
+    def get_W_shape(self):
+        num_input_channels = self.input_shape[1]
+        # first two sizes are swapped compared to a forward convolution
+        return (num_input_channels, self.num_filters) + self.filter_size
+
+    def get_output_shape_for(self, input_shape):
+        # when called from the constructor, self.crop is still called self.pad:
+        crop = getattr(self, 'crop', getattr(self, 'pad', None))
+        crop = crop if isinstance(crop, tuple) else (crop,) * self.n
+        batchsize = input_shape[0]
+        return ((batchsize, self.num_filters) +
+                tuple(conv_input_length(input, filter, stride, p)
+                      for input, filter, stride, p
+                      in zip(input_shape[2:], self.filter_size,
+                             self.stride, crop)))
+
+    def convolve(self, input, **kwargs):
+        border_mode = 'half' if self.crop == 'same' else self.crop
+        op = T.nnet.abstract_conv.AbstractConv2d_gradInputs(
+            imshp=self.output_shape,
+            kshp=self.get_W_shape(),
+            subsample=self.stride, border_mode=border_mode,
+            filter_flip=not self.flip_filters)
+        output_size = self.output_shape[2:]
+        if any(s is None for s in output_size):
+            output_size = self.get_output_shape_for(input.shape)[2:]
+        conved = op(self.W, input, output_size)
+        return conved
+
+Deconv2DLayer = TransposedConv2DLayer
+
+
+class DilatedConv2DLayer(BaseConvLayer):
+    """
+    lasagne.layers.DilatedConv2DLayer(incoming, num_filters, filter_size,
+    dilation=(1, 1), pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False, **kwargs)
+
+    2D dilated convolution layer
+
+    Performs a 2D convolution with dilated filters, then optionally adds a bias
+    and applies an elementwise nonlinearity.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+
+    dilation : int or iterable of int
+        An integer or a 2-element tuple specifying the dilation factor of the
+        filters. A factor of :math:`x` corresponds to :math:`x - 1` zeros
+        inserted between adjacent filter elements.
+
+    pad : int, iterable of int, or 'valid' (default: 0)
+        The amount of implicit zero padding of the input.
+        This implementation does not support padding, the argument is provided
+        for compatibility to other convolutional layers only.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_input_channels, num_filters, filter_rows, filter_columns)``.
+        Note that the first two dimensions are swapped compared to a
+        non-dilated convolution.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters before sliding them over the input,
+        performing a convolution, or not to flip them and perform a
+        correlation (this is the default).
+        This implementation does not support flipped filters, the argument is
+        provided for compatibility to other convolutional layers only.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
+
+    Notes
+    -----
+    The dilated convolution is implemented as the backward pass of a
+    convolution wrt. weights, passing the filters as the output gradient.
+    It can be thought of as dilating the filters (by adding ``dilation - 1``
+    zeros between adjacent filter elements) and cross-correlating them with the
+    input. See [1]_ for more background.
+
+    References
+    ----------
+    .. [1] Fisher Yu, Vladlen Koltun (2016),
+           Multi-Scale Context Aggregation by Dilated Convolutions. ICLR 2016.
+           http://arxiv.org/abs/1511.07122, https://github.com/fyu/dilation
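+
+    Examples
+    --------
+    A small usage sketch (the shapes below are only illustrative); with
+    ``dilation=(2, 2)`` and ``filter_size=3``, the effective filter size
+    is ``(3 - 1) * 2 + 1 = 5``:
+
+    >>> from lasagne.layers import InputLayer, DilatedConv2DLayer
+    >>> l_in = InputLayer((None, 3, 32, 32))
+    >>> l_dil = DilatedConv2DLayer(l_in, 16, 3, dilation=(2, 2))
+    >>> l_dil.output_shape
+    (None, 16, 28, 28)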
+    """
+    def __init__(self, incoming, num_filters, filter_size, dilation=(1, 1),
+                 pad=0, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, flip_filters=False,
+                 **kwargs):
+        self.dilation = as_tuple(dilation, 2, int)
+        super(DilatedConv2DLayer, self).__init__(
+                incoming, num_filters, filter_size, 1, pad,
+                untie_biases, W, b, nonlinearity, flip_filters, n=2, **kwargs)
+        # remove self.stride:
+        del self.stride
+        # require valid convolution
+        if self.pad != (0, 0):
+            raise NotImplementedError(
+                    "DilatedConv2DLayer requires pad=0 / (0,0) / 'valid', but "
+                    "got %r. For a padded dilated convolution, add a PadLayer."
+                    % (pad,))
+        # require unflipped filters
+        if self.flip_filters:
+            raise NotImplementedError(
+                    "DilatedConv2DLayer requires flip_filters=False.")
+
+    def get_W_shape(self):
+        num_input_channels = self.input_shape[1]
+        # first two sizes are swapped compared to a forward convolution
+        return (num_input_channels, self.num_filters) + self.filter_size
+
+    def get_output_shape_for(self, input_shape):
+        batchsize = input_shape[0]
+        return ((batchsize, self.num_filters) +
+                tuple(conv_output_length(input, (filter-1) * dilate + 1, 1, 0)
+                      for input, filter, dilate
+                      in zip(input_shape[2:], self.filter_size,
+                             self.dilation)))
+
+    def convolve(self, input, **kwargs):
+        # we perform a convolution backward pass wrt weights,
+        # passing kernels as output gradient
+        imshp = self.input_shape
+        kshp = self.output_shape
+        # and swapping channels and batchsize
+        imshp = (imshp[1], imshp[0]) + imshp[2:]
+        kshp = (kshp[1], kshp[0]) + kshp[2:]
+        op = T.nnet.abstract_conv.AbstractConv2d_gradWeights(
+            imshp=imshp, kshp=kshp,
+            subsample=self.dilation, border_mode='valid',
+            filter_flip=False)
+        output_size = self.output_shape[2:]
+        if any(s is None for s in output_size):
+            output_size = self.get_output_shape_for(input.shape)[2:]
+        conved = op(input.transpose(1, 0, 2, 3), self.W, output_size)
+        return conved.transpose(1, 0, 2, 3)
diff --git a/lasagne/layers/corrmm.py b/lasagne/layers/corrmm.py
new file mode 100644
index 0000000..e487397
--- /dev/null
+++ b/lasagne/layers/corrmm.py
@@ -0,0 +1,147 @@
+import theano
+
+from .. import init
+from .. import nonlinearities
+
+from .base import Layer
+
+from .conv import conv_output_length, BaseConvLayer
+from ..utils import as_tuple
+
+from theano.sandbox.cuda.basic_ops import gpu_contiguous
+from theano.sandbox.cuda.blas import GpuCorrMM
+
+
+__all__ = [
+    "Conv2DMMLayer",
+]
+
+
+if not theano.sandbox.cuda.cuda_enabled:
+    raise ImportError(
+            "requires GPU support -- see http://lasagne.readthedocs.org/en/"
+            "latest/user/installation.html#gpu-support")  # pragma: no cover
+
+
+class Conv2DMMLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv2DMMLayer(incoming, num_filters, filter_size,
+    stride=(1, 1), pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False,
+    **kwargs)
+
+    2D convolutional layer
+
+    Performs a 2D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.  This is an alternative implementation
+    which uses ``theano.sandbox.cuda.blas.GpuCorrMM`` directly.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 2-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders, a tuple of two integers allows different symmetric padding
+        per dimension.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_filters, num_input_channels, filter_rows, filter_columns)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters and perform a convolution, or not to flip
+        them and perform a correlation. Flipping adds a bit of overhead, so it
+        is disabled by default. In most cases this does not make a difference
+        anyway because the filters are learnt. However, ``flip_filters``
+        should be set to ``True`` if this layer is loaded with weights that
+        were learnt using a regular :class:`lasagne.layers.Conv2DLayer`, for
+        example.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable
+        Variable representing the filter weights.
+
+    b : Theano shared variable
+        Variable representing the biases.
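+
+    Examples
+    --------
+    A small usage sketch (this requires a CUDA-capable Theano setup; the
+    shapes below are only illustrative):
+
+    >>> from lasagne.layers import InputLayer  #doctest: +SKIP
+    >>> from lasagne.layers import corrmm  #doctest: +SKIP
+    >>> l_in = InputLayer((None, 3, 32, 32))  #doctest: +SKIP
+    >>> l_conv = corrmm.Conv2DMMLayer(l_in, 16, 3)  #doctest: +SKIP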
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
+                 pad=0, untie_biases=False, W=init.GlorotUniform(),
+                 b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
+                 flip_filters=False, **kwargs):
+        super(Conv2DMMLayer, self).__init__(incoming, num_filters, filter_size,
+                                            stride, pad, untie_biases, W, b,
+                                            nonlinearity, flip_filters, n=2,
+                                            **kwargs)
+        border_mode = 'half' if self.pad == 'same' else self.pad
+        self.corr_mm_op = GpuCorrMM(subsample=self.stride,
+                                    border_mode=border_mode)
+
+    def convolve(self, input, **kwargs):
+        filters = self.W
+        if self.flip_filters:
+            filters = filters[:, :, ::-1, ::-1]  # flip top-down, left-right
+
+        contiguous_filters = gpu_contiguous(filters)
+        contiguous_input = gpu_contiguous(input)
+        conved = self.corr_mm_op(contiguous_input, contiguous_filters)
+        return conved
diff --git a/lasagne/layers/cuda_convnet.py b/lasagne/layers/cuda_convnet.py
new file mode 100644
index 0000000..092730c
--- /dev/null
+++ b/lasagne/layers/cuda_convnet.py
@@ -0,0 +1,634 @@
+import numpy as np
+import theano
+import theano.tensor as T
+
+from .. import init
+from .. import nonlinearities
+
+from .base import Layer
+
+from .conv import conv_output_length, BaseConvLayer
+from .pool import pool_output_length
+from ..utils import as_tuple
+
+from theano.sandbox.cuda.basic_ops import gpu_contiguous
+from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
+
+__all__ = [
+    "Conv2DCCLayer",
+    "MaxPool2DCCLayer",
+    "ShuffleBC01ToC01BLayer",
+    "bc01_to_c01b",
+    "ShuffleC01BToBC01Layer",
+    "c01b_to_bc01",
+    "NINLayer_c01b",
+]
+
+
+if not theano.sandbox.cuda.cuda_enabled:
+    raise ImportError(
+            "requires GPU support -- see http://lasagne.readthedocs.org/en/"
+            "latest/user/installation.html#gpu-support")  # pragma: no cover
+
+
+class Conv2DCCLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv2DCCLayer(incoming, num_filters, filter_size,
+    stride=(1, 1), pad=0, untie_biases=False, W=None,
+    b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify,
+    dimshuffle=True, flip_filters=False, partial_sum=1, **kwargs)
+
+    2D convolutional layer
+
+    Performs a 2D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.  This is an alternative implementation
+    which uses the cuda-convnet wrappers from pylearn2:
+    ``pylearn2.sandbox.cuda_convnet.filter_acts.FilterActs``.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. This
+        layer expects a 4D tensor as its input, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+        If automatic dimshuffling is disabled (see notes), the shape should be
+        ``(num_input_channels, input_rows, input_columns, batch_size)``
+        instead (c01b axis order).
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+        This layer does not support non-square filters.
+
+    stride : int or iterable of int
+        An integer or a 2-element tuple specifying the stride of the
+        convolution operation. This layer does not support using different
+        strides along both axes.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders. This layer does not support using different amounts of
+        padding along both axes, but for compatibility to other layers you can
+        still specify the padding as a tuple of two same-valued integers.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_filters, num_input_channels, filter_rows, filter_columns)``.
+        If automatic dimshuffling is disabled (see notes), the shape should be
+        ``(num_input_channels, input_rows, input_columns, num_filters)``
+        instead (c01b axis order).
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    dimshuffle : bool (default: True)
+        If ``True``, the layer will automatically apply the necessary
+        dimshuffle operations to deal with the fact that the cuda-convnet
+        implementation uses c01b (batch-size-last) axis order instead of bc01
+        (batch-size-first), which is the Lasagne/Theano default. This makes the
+        layer interoperable with other Lasagne layers.
+
+        If ``False``, this automatic dimshuffling is disabled and the layer
+        will expect its input and parameters to have c01b axis order. It is up
+        to the user to ensure this. :class:`ShuffleBC01ToC01BLayer` and
+        :class:`ShuffleC01BToBC01Layer` can be used to convert between bc01 and
+        c01b axis order.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters and perform a convolution, or not to flip
+        them and perform a correlation. Flipping adds a bit of overhead, so it
+        is disabled by default. In most cases this does not make a difference
+        anyway because the filters are learnt. However, ``flip_filters``
+        should be set to ``True`` if this layer is loaded with weights that
+        were learnt using a regular :class:`lasagne.layers.Conv2DLayer`, for
+        example.
+
+    partial_sum : int or None (default: 1)
+        This value tunes the trade-off between memory usage and performance.
+        You can specify any positive integer that is a divisor of the output
+        feature map size (i.e. output rows times output columns). Higher
+        values decrease memory usage, but also performance. Specifying 0 or
+        ``None`` means the highest possible value will be used. The Lasagne
+        default of ``1`` gives the best performance, but also the highest
+        memory usage.
+
+        More information about this parameter can be found in the
+        `cuda-convnet documentation
+        <https://code.google.com/p/cuda-convnet/wiki/LayerParams>`_.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
+
+    Notes
+    -----
+    The cuda-convnet convolution implementation has several limitations:
+
+    * only square filters are supported.
+    * only identical strides in the horizontal and vertical direction are
+      supported.
+    * the number of filters must be a multiple of 16.
+    * the number of input channels must be even, or less than or equal to
+      3.
+    * if the gradient w.r.t. the input is to be computed, the number of
+      channels must be divisible by 4.
+    * performance is optimal when the batch size is a multiple of 128 (but
+      other batch sizes are supported).
+    * this layer only works on the GPU.
+
+    The cuda-convnet convolution implementation uses c01b (batch-size-last)
+    axis order by default. The Theano/Lasagne default is bc01
+    (batch-size-first). This layer automatically adds the necessary dimshuffle
+    operations for the input and the parameters so that it is interoperable
+    with other layers that assume bc01 axis order. However, these additional
+    dimshuffle operations may sometimes negatively affect performance. For this
+    reason, it is possible to disable them by setting ``dimshuffle=False``. In
+    this case, the user is expected to manually ensure that the input and
+    parameters have the correct axis order. :class:`ShuffleBC01ToC01BLayer` and
+    :class:`ShuffleC01BToBC01Layer` can be used to convert between bc01 and
+    c01b axis order.
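+
+    Examples
+    --------
+    A minimal usage sketch (the shapes and hyperparameters are illustrative;
+    a CUDA-capable GPU with pylearn2's cuda-convnet wrappers is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.cuda_convnet import Conv2DCCLayer
+
+        # bc01 input: (batch size, channels, rows, columns)
+        l_in = InputLayer((None, 3, 32, 32))
+        # num_filters must be a multiple of 16; filters and strides are square
+        l_conv = Conv2DCCLayer(l_in, num_filters=16, filter_size=3, pad=1)
+        # l_conv.output_shape == (None, 16, 32, 32)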
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
+                 pad=0, untie_biases=False, W=None,
+                 b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
+                 dimshuffle=True, flip_filters=False, partial_sum=1,
+                 **kwargs):
+        if W is None:
+            if dimshuffle:
+                W = init.GlorotUniform()
+            else:
+                W = init.GlorotUniform(c01b=True)
+        self.dimshuffle = dimshuffle
+
+        super(Conv2DCCLayer, self).__init__(incoming, num_filters, filter_size,
+                                            stride, pad, untie_biases, W, b,
+                                            nonlinearity, flip_filters, n=2,
+                                            **kwargs)
+        self.partial_sum = partial_sum
+
+        if self.filter_size[0] != self.filter_size[1]:
+            raise RuntimeError("Conv2DCCLayer only supports square filters, "
+                               "but filter_size=(%d, %d)" % filter_size)
+
+        if self.stride[0] != self.stride[1]:
+            raise RuntimeError("Conv2DCCLayer only supports square strides, "
+                               "but stride=(%d, %d)" % stride)
+
+        if self.num_filters % 16 != 0:
+            raise RuntimeError("Conv2DCCLayer requires num_filters to be a "
+                               "multiple of 16, but num_filters is "
+                               "%d" % num_filters)
+
+        if not (self.num_input_channels < 4 or
+                self.num_input_channels % 4 == 0):
+            raise RuntimeError("Conv2DCCLayer requires the number of input "
+                               "channels to be 1, 2, 3 or a multiple of 4, "
+                               "but it is %d" % self.num_input_channels)
+
+        if isinstance(self.pad, tuple):
+            if self.pad[0] != self.pad[1]:
+                raise RuntimeError("Conv2DCCLayer only supports square "
+                                   "padding, but pad=(%d, %d)" % pad)
+            pad = self.pad[0]
+        elif self.pad == 'same':
+            pad = self.filter_size[0] // 2
+        elif self.pad == 'full':
+            pad = self.filter_size[0] - 1
+
+        if not self.dimshuffle and self.untie_biases and self.b is not None:
+            del self.params[self.b]
+            biases_shape = (num_filters, self.output_shape[1],
+                            self.output_shape[2])
+            self.b = self.add_param(b, biases_shape, name="b",
+                                    regularizable=False)
+
+        self.filter_acts_op = FilterActs(stride=self.stride[0],
+                                         partial_sum=self.partial_sum,
+                                         pad=pad)
+
+    @property
+    def num_input_channels(self):
+        if self.dimshuffle:
+            return self.input_shape[1]
+        else:
+            return self.input_shape[0]
+
+    def get_W_shape(self):
+        if self.dimshuffle:
+            return super(Conv2DCCLayer, self).get_W_shape()
+        else:
+            return ((self.num_input_channels,) +
+                    self.filter_size +
+                    (self.num_filters,))
+
+    def get_output_shape_for(self, input_shape):
+        if not self.dimshuffle:
+            # c01b to bc01
+            input_shape = (input_shape[3], input_shape[0],
+                           input_shape[1], input_shape[2])
+        shape = super(Conv2DCCLayer, self).get_output_shape_for(input_shape)
+        if not self.dimshuffle:
+            # bc01 to c01b
+            shape = (shape[1], shape[2], shape[3], shape[0])
+        return shape
+
+    def get_output_for(self, input, **kwargs):
+        if self.dimshuffle:
+            filters = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
+            input = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
+        else:
+            filters = self.W
+
+        if self.flip_filters:
+            filters = filters[:, ::-1, ::-1, :]  # flip top-down, left-right
+
+        contiguous_filters = gpu_contiguous(filters)
+        contiguous_input = gpu_contiguous(input)
+        conved = self.filter_acts_op(contiguous_input, contiguous_filters)
+
+        if self.stride != (1, 1):
+            # cuda-convnet calculates a non-standard strided output shape,
+            # so we need to truncate the output in this case
+            pad = self.pad if isinstance(self.pad, tuple) else (self.pad,) * 2
+            true_rows = conv_output_length(input.shape[1],
+                                           self.filter_size[0],
+                                           self.stride[0],
+                                           pad[0])
+            true_columns = conv_output_length(input.shape[2],
+                                              self.filter_size[1],
+                                              self.stride[1],
+                                              pad[1])
+            conved = conved[:, :true_rows, :true_columns, :]
+
+        if self.b is not None:
+            if self.untie_biases:
+                biases = self.b.dimshuffle(0, 1, 2, 'x')  # c01 to c01b
+            else:
+                biases = self.b.dimshuffle(0, 'x', 'x', 'x')  # c to c01b
+            conved += biases
+
+        conved = self.nonlinearity(conved)
+
+        if self.dimshuffle:
+            return conved.dimshuffle(3, 0, 1, 2)  # c01b to bc01
+        else:
+            return conved
+
+
+class MaxPool2DCCLayer(Layer):
+    """
+    2D max-pooling layer
+
+    Performs 2D max-pooling over the two trailing axes of a 4D input tensor
+    (or over axes 1 and 2 if ``dimshuffle=False``, see notes). This is an
+    alternative implementation which uses the cuda-convnet wrappers from
+    pylearn2: ``pylearn2.sandbox.cuda_convnet.pool.MaxPool``.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region in each dimension.  If an integer, it
+        is promoted to a square pooling region. If an iterable, it should have
+        two elements. This layer does not support non-square pooling regions.
+
+    stride : integer, iterable or ``None``
+        The strides between successive pooling regions in each dimension.
+        If ``None`` then ``stride = pool_size``. This layer does not support
+        using different strides along the two axes.
+
+    pad : integer or iterable (default: 0)
+        This implementation does not support custom padding, so this argument
+        must always be set to ``0``. It exists only to make sure the
+        interface is compatible with :class:`lasagne.layers.MaxPool2DLayer`.
+
+    ignore_border : bool (default: False)
+        This implementation always includes partial pooling regions, so this
+        argument must always be set to False. It exists only to make sure the
+        interface is compatible with :class:`lasagne.layers.MaxPool2DLayer`.
+
+    dimshuffle : bool (default: True)
+        If ``True``, the layer will automatically apply the necessary
+        dimshuffle operations to deal with the fact that the cuda-convnet
+        implementation uses c01b (batch-size-last) axis order instead of bc01
+        (batch-size-first), which is the Lasagne/Theano default. This makes the
+        layer interoperable with other Lasagne layers.
+
+        If ``False``, this automatic dimshuffling is disabled and the layer
+        will expect its input and parameters to have c01b axis order. It is up
+        to the user to ensure this. :class:`ShuffleBC01ToC01BLayer` and
+        :class:`ShuffleC01BToBC01Layer` can be used to convert between bc01 and
+        c01b axis order.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    The cuda-convnet max-pooling implementation has several limitations:
+
+    * only square pooling regions are supported.
+    * only identical strides in the horizontal and vertical direction are
+      supported.
+    * only square inputs are supported. (This limitation does not exist for
+      the convolution implementation.)
+    * partial pooling regions are always included (``ignore_border`` is forced
+      to ``False``).
+    * custom padding is not supported (``pad`` is forced to ``0``).
+    * this layer only works on the GPU.
+
+    The cuda-convnet pooling implementation uses c01b (batch-size-last)
+    axis order by default. The Theano/Lasagne default is bc01
+    (batch-size-first). This layer automatically adds the necessary dimshuffle
+    operations for the input and the parameters so that it is interoperable
+    with other layers that assume bc01 axis order. However, these additional
+    dimshuffle operations may sometimes negatively affect performance. For this
+    reason, it is possible to disable them by setting ``dimshuffle=False``. In
+    this case, the user is expected to manually ensure that the input and
+    parameters have the correct axis order. :class:`ShuffleBC01ToC01BLayer` and
+    :class:`ShuffleC01BToBC01Layer` can be used to convert between bc01 and
+    c01b axis order.
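+
+    Examples
+    --------
+    A minimal usage sketch (the shapes are illustrative; a CUDA-capable GPU
+    with pylearn2's cuda-convnet wrappers is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.cuda_convnet import MaxPool2DCCLayer
+
+        # square bc01 input, as required by this implementation
+        l_in = InputLayer((None, 16, 32, 32))
+        # square 2x2 pooling region with stride 2 halves the spatial size
+        l_pool = MaxPool2DCCLayer(l_in, pool_size=2)
+        # l_pool.output_shape == (None, 16, 16, 16)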
+    """
+    def __init__(self, incoming, pool_size, stride=None, ignore_border=False,
+                 dimshuffle=True, **kwargs):
+        from pylearn2.sandbox.cuda_convnet.pool import MaxPool
+
+        if 'pad' in kwargs:
+            pad = kwargs.pop('pad')
+            if as_tuple(pad, 2) != (0, 0):
+                raise NotImplementedError("MaxPool2DCCLayer does not "
+                                          "support padding")
+
+        super(MaxPool2DCCLayer, self).__init__(incoming, **kwargs)
+
+        pool_size = as_tuple(pool_size, 2)
+
+        if pool_size[0] != pool_size[1]:
+            raise NotImplementedError("MaxPool2DCCLayer only supports square "
+                                      "pooling regions, but pool_size=(%d, %d)"
+                                      % pool_size)
+
+        self.pool_size = pool_size[0]
+
+        if stride is None:
+            self.stride = self.pool_size
+        else:
+            stride = as_tuple(stride, 2)
+            if stride[0] != stride[1]:
+                raise NotImplementedError("MaxPool2DCCLayer only supports "
+                                          "using the same stride in both "
+                                          "directions but stride=(%d, %d)"
+                                          % stride)
+            self.stride = stride[0]
+
+        if self.stride > self.pool_size:
+            raise NotImplementedError("MaxPool2DCCLayer only supports "
+                                      "stride <= pool_size.")
+
+        # The ignore_border argument is for compatibility with MaxPool2DLayer.
+        # ignore_border=True is not supported. Borders are never ignored.
+        if ignore_border:
+            raise NotImplementedError("MaxPool2DCCLayer does not support "
+                                      "ignore_border=True.")
+
+        self.dimshuffle = dimshuffle
+
+        self.pool_op = MaxPool(ds=self.pool_size, stride=self.stride)
+
+    def get_output_shape_for(self, input_shape):
+        if self.dimshuffle:
+            batch_size = input_shape[0]
+            num_input_channels = input_shape[1]
+            input_rows, input_columns = input_shape[2:4]
+        else:
+            batch_size = input_shape[3]
+            num_input_channels = input_shape[0]
+            input_rows, input_columns = input_shape[1:3]
+
+        output_rows = pool_output_length(input_rows,
+                                         pool_size=self.pool_size,
+                                         stride=self.stride,
+                                         pad=0,
+                                         ignore_border=False,
+                                         )
+        output_columns = pool_output_length(input_columns,
+                                            pool_size=self.pool_size,
+                                            stride=self.stride,
+                                            pad=0,
+                                            ignore_border=False,
+                                            )
+
+        if self.dimshuffle:
+            return (batch_size, num_input_channels, output_rows,
+                    output_columns)
+        else:
+            return (num_input_channels, output_rows, output_columns,
+                    batch_size)
+
+    def get_output_for(self, input, **kwargs):
+        if self.dimshuffle:
+            input = input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
+
+        contiguous_input = gpu_contiguous(input)
+        pooled = self.pool_op(contiguous_input)
+
+        if self.dimshuffle:
+            return pooled.dimshuffle(3, 0, 1, 2)  # c01b to bc01
+        else:
+            return pooled
+
+
+# Helper classes for switching between bc01 and c01b input formats
+
+class ShuffleBC01ToC01BLayer(Layer):
+    """
+    Shuffle 4D input from bc01 (batch-size-first) order to c01b
+    (batch-size-last) order.
+
+    This layer can be used for interoperability between c01b and bc01 layers.
+    For example, :class:`MaxPool2DCCLayer` and :class:`Conv2DCCLayer` operate
+    in c01b mode when they are created with ``dimshuffle=False``.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+    """
+    def get_output_shape_for(self, input_shape):
+        return (input_shape[1], input_shape[2], input_shape[3], input_shape[0])
+
+    def get_output_for(self, input, **kwargs):
+        return input.dimshuffle(1, 2, 3, 0)
+
+bc01_to_c01b = ShuffleBC01ToC01BLayer  # shortcut
+
+
+class ShuffleC01BToBC01Layer(Layer):
+    """
+    Shuffle 4D input from c01b (batch-size-last) order to bc01
+    (batch-size-first) order.
+
+    This layer can be used for interoperability between c01b and bc01 layers.
+    For example, :class:`MaxPool2DCCLayer` and :class:`Conv2DCCLayer` operate
+    in c01b mode when they are created with ``dimshuffle=False``.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+    """
+    def get_output_shape_for(self, input_shape):
+        return (input_shape[3], input_shape[0], input_shape[1], input_shape[2])
+
+    def get_output_for(self, input, **kwargs):
+        return input.dimshuffle(3, 0, 1, 2)
+
+c01b_to_bc01 = ShuffleC01BToBC01Layer  # shortcut
+
+
+# c01b versions of other Layer classes
+
+class NINLayer_c01b(Layer):
+    """
+    lasagne.layers.NINLayer_c01b(incoming, num_units, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, **kwargs)
+
+    Network-in-network layer with c01b axis ordering.
+
+    This is a c01b version of :class:`lasagne.layers.NINLayer`.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    num_units : int
+        The number of units of the layer
+
+    untie_biases : bool
+        If ``False``, the network has a single bias vector similar to a dense
+        layer. If ``True``, a separate bias vector is used for each spatial
+        position.
+
+    W : Theano shared variable, numpy array or callable
+        An initializer for the weights of the layer. If a shared variable or a
+        numpy array is provided, the shape should be
+        ``(num_units, num_input_channels)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, numpy array, callable or None
+        An initializer for the biases of the layer. If a shared variable or a
+        numpy array is provided, the correct shape is determined by the
+        `untie_biases` setting. If `untie_biases` is ``False``, then the shape
+        should be ``(num_units,)``. If `untie_biases` is ``True``, then the
+        shape should be ``(num_units, rows, columns)``. If ``None`` is
+        provided, the layer will have no biases.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
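+
+    Examples
+    --------
+    A minimal usage sketch (the shapes are illustrative; note the c01b input
+    layout with the batch size last)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.cuda_convnet import NINLayer_c01b
+
+        # c01b input: (channels, rows, columns, batch size)
+        l_in = InputLayer((32, 8, 8, 100))
+        l_nin = NINLayer_c01b(l_in, num_units=16)
+        # l_nin.output_shape == (16, 8, 8, 100)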
+    """
+    def __init__(self, incoming, num_units, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, **kwargs):
+        super(NINLayer_c01b, self).__init__(incoming, **kwargs)
+        if nonlinearity is None:
+            self.nonlinearity = nonlinearities.identity
+        else:
+            self.nonlinearity = nonlinearity
+
+        self.num_units = num_units
+        self.untie_biases = untie_biases
+
+        num_input_channels = self.input_shape[0]
+
+        self.W = self.add_param(W, (num_units, num_input_channels), name="W")
+        if b is None:
+            self.b = None
+        else:
+            if self.untie_biases:
+                biases_shape = (num_units,) + self.output_shape[1:-1]
+            else:
+                biases_shape = (num_units,)
+            self.b = self.add_param(b, biases_shape, name="b",
+                                    regularizable=False)
+
+    def get_output_shape_for(self, input_shape):
+        return (self.num_units,) + input_shape[1:]
+
+    def get_output_for(self, input, **kwargs):
+        # fc * c01b... = f01b...
+        out = T.tensordot(self.W, input, axes=[[1], [0]])
+
+        if self.b is None:
+            activation = out
+        else:
+            if self.untie_biases:
+                bias_axes = list(range(input.ndim - 1)) + ['x']
+            else:
+                bias_axes = [0] + (['x'] * (input.ndim - 1))
+            b_shuffled = self.b.dimshuffle(bias_axes)
+            activation = out + b_shuffled
+
+        return self.nonlinearity(activation)
diff --git a/lasagne/layers/dense.py b/lasagne/layers/dense.py
new file mode 100644
index 0000000..2aaf206
--- /dev/null
+++ b/lasagne/layers/dense.py
@@ -0,0 +1,192 @@
+import numpy as np
+import theano.tensor as T
+
+from .. import init
+from .. import nonlinearities
+
+from .base import Layer
+
+
+__all__ = [
+    "DenseLayer",
+    "NINLayer",
+]
+
+
+class DenseLayer(Layer):
+    """
+    lasagne.layers.DenseLayer(incoming, num_units,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, **kwargs)
+
+    A fully connected layer.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    num_units : int
+        The number of units of the layer
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a matrix with shape ``(num_inputs, num_units)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_units,)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+
+    Notes
+    -----
+    If the input to this layer has more than two axes, it will flatten the
+    trailing axes. This is useful when a dense layer follows a
+    convolutional layer, for example. It is not necessary to insert a
+    :class:`FlattenLayer` in this case.
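+
+    A quick sketch of this behaviour (the shapes are illustrative)::
+
+        from lasagne.layers import InputLayer, DenseLayer
+
+        # e.g. a 4D convolutional feature map as input
+        l_in = InputLayer((None, 16, 8, 8))
+        # the trailing axes are flattened, so W has shape (16 * 8 * 8, 10)
+        l_out = DenseLayer(l_in, num_units=10)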
+    """
+    def __init__(self, incoming, num_units, W=init.GlorotUniform(),
+                 b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
+                 **kwargs):
+        super(DenseLayer, self).__init__(incoming, **kwargs)
+        self.nonlinearity = (nonlinearities.identity if nonlinearity is None
+                             else nonlinearity)
+
+        self.num_units = num_units
+
+        num_inputs = int(np.prod(self.input_shape[1:]))
+
+        self.W = self.add_param(W, (num_inputs, num_units), name="W")
+        if b is None:
+            self.b = None
+        else:
+            self.b = self.add_param(b, (num_units,), name="b",
+                                    regularizable=False)
+
+    def get_output_shape_for(self, input_shape):
+        return (input_shape[0], self.num_units)
+
+    def get_output_for(self, input, **kwargs):
+        if input.ndim > 2:
+            # if the input has more than two dimensions, flatten it into a
+            # batch of feature vectors.
+            input = input.flatten(2)
+
+        activation = T.dot(input, self.W)
+        if self.b is not None:
+            activation = activation + self.b.dimshuffle('x', 0)
+        return self.nonlinearity(activation)
+
+
+class NINLayer(Layer):
+    """
+    lasagne.layers.NINLayer(incoming, num_units, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, **kwargs)
+
+    Network-in-network layer.
+    Like DenseLayer, but broadcasting across all trailing dimensions beyond the
+    2nd.  This results in a convolution operation with filter size 1 on all
+    trailing dimensions.  Any number of trailing dimensions is supported,
+    so NINLayer can be used to implement 1D, 2D, 3D, ... convolutions.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    num_units : int
+        The number of units of the layer
+
+    untie_biases : bool
+        If ``False``, the network has a single bias vector similar to a dense
+        layer. If ``True``, a separate bias vector is used for each position
+        along the trailing dimensions beyond the 2nd.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a matrix with shape ``(num_inputs, num_units)``,
+        where ``num_inputs`` is the size of the second dimension of the input.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_units,)`` for ``untie_biases=False``, and
+        a tensor of shape ``(num_units, input_shape[2], ..., input_shape[-1])``
+        for ``untie_biases=True``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, NINLayer
+    >>> l_in = InputLayer((100, 20, 10, 3))
+    >>> l1 = NINLayer(l_in, num_units=5)
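+
+    The trailing (spatial) dimensions are preserved; only the channel axis
+    changes size:
+
+    >>> l1.output_shape
+    (100, 5, 10, 3)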
+
+    References
+    ----------
+    .. [1] Lin, Min, Qiang Chen, and Shuicheng Yan (2013):
+           Network in network. arXiv preprint arXiv:1312.4400.
+    """
+    def __init__(self, incoming, num_units, untie_biases=False,
+                 W=init.GlorotUniform(), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify, **kwargs):
+        super(NINLayer, self).__init__(incoming, **kwargs)
+        self.nonlinearity = (nonlinearities.identity if nonlinearity is None
+                             else nonlinearity)
+
+        self.num_units = num_units
+        self.untie_biases = untie_biases
+
+        num_input_channels = self.input_shape[1]
+
+        self.W = self.add_param(W, (num_input_channels, num_units), name="W")
+        if b is None:
+            self.b = None
+        else:
+            if self.untie_biases:
+                biases_shape = (num_units,) + self.output_shape[2:]
+            else:
+                biases_shape = (num_units,)
+            self.b = self.add_param(b, biases_shape, name="b",
+                                    regularizable=False)
+
+    def get_output_shape_for(self, input_shape):
+        return (input_shape[0], self.num_units) + input_shape[2:]
+
+    def get_output_for(self, input, **kwargs):
+        # cf * bc01... = fb01...
+        out_r = T.tensordot(self.W, input, axes=[[0], [1]])
+        # input dims to broadcast over
+        remaining_dims = range(2, input.ndim)
+        # bf01...
+        out = out_r.dimshuffle(1, 0, *remaining_dims)
+
+        if self.b is None:
+            activation = out
+        else:
+            if self.untie_biases:
+                # no broadcast
+                remaining_dims_biases = range(1, input.ndim - 1)
+            else:
+                remaining_dims_biases = ['x'] * (input.ndim - 2)  # broadcast
+            b_shuffled = self.b.dimshuffle('x', 0, *remaining_dims_biases)
+            activation = out + b_shuffled
+
+        return self.nonlinearity(activation)
diff --git a/lasagne/layers/dnn.py b/lasagne/layers/dnn.py
new file mode 100644
index 0000000..0547a6b
--- /dev/null
+++ b/lasagne/layers/dnn.py
@@ -0,0 +1,593 @@
+import theano
+from theano.sandbox.cuda import dnn
+
+from .. import init
+from .. import nonlinearities
+from .base import Layer
+
+from .conv import conv_output_length, BaseConvLayer
+from .pool import pool_output_length
+from ..utils import as_tuple
+
+if not theano.sandbox.cuda.cuda_enabled:
+    raise ImportError(
+            "requires GPU support -- see http://lasagne.readthedocs.org/en/"
+            "latest/user/installation.html#gpu-support")  # pragma: no cover
+elif not dnn.dnn_available():
+    raise ImportError(
+            "cuDNN not available: %s\nSee http://lasagne.readthedocs.org/en/"
+            "latest/user/installation.html#cudnn" %
+            dnn.dnn_available.msg)  # pragma: no cover
+
+
+__all__ = [
+    "Pool2DDNNLayer",
+    "MaxPool2DDNNLayer",
+    "Pool3DDNNLayer",
+    "MaxPool3DDNNLayer",
+    "Conv2DDNNLayer",
+    "Conv3DDNNLayer",
+    "SpatialPyramidPoolingDNNLayer",
+]
+
+
+class Pool2DDNNLayer(Layer):
+    """
+    2D pooling layer
+
+    Performs 2D mean- or max-pooling over the two trailing axes of a 4D input
+    tensor. This is an alternative implementation which uses
+    ``theano.sandbox.cuda.dnn.dnn_pool`` directly.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region in each dimension. If an integer, it
+        is promoted to a square pooling region. If an iterable, it should have
+        two elements.
+
+    stride : integer, iterable or ``None``
+        The strides between successive pooling regions in each dimension.
+        If ``None`` then ``stride = pool_size``.
+
+    pad : integer or iterable
+        Number of elements to be added on each side of the input
+        in each dimension. Each value must be less than
+        the corresponding stride.
+
+    ignore_border : bool (default: True)
+        This implementation never includes partial pooling regions, so this
+        argument must always be set to True. It exists only to make sure the
+        interface is compatible with :class:`lasagne.layers.MaxPool2DLayer`.
+
+    mode : string
+        Pooling mode, one of 'max', 'average_inc_pad' or 'average_exc_pad'.
+        Defaults to 'max'.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
+    This is a drop-in replacement for :class:`lasagne.layers.MaxPool2DLayer`.
+    Its interface is the same, except that ``ignore_border`` only supports
+    its default setting of ``True``.
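+
+    Examples
+    --------
+    A minimal usage sketch (the shapes are illustrative; a CUDA-capable GPU
+    with cuDNN is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.dnn import Pool2DDNNLayer
+
+        l_in = InputLayer((None, 16, 32, 32))
+        # 2x2 mean-pooling with stride 2 halves the spatial dimensions
+        l_pool = Pool2DDNNLayer(l_in, pool_size=2, mode='average_exc_pad')
+        # l_pool.output_shape == (None, 16, 16, 16)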
+    """
+    def __init__(self, incoming, pool_size, stride=None, pad=(0, 0),
+                 ignore_border=True, mode='max', **kwargs):
+        super(Pool2DDNNLayer, self).__init__(incoming, **kwargs)
+        if len(self.input_shape) != 4:
+            raise ValueError("Tried to create a 2D pooling layer with "
+                             "input shape %r. Expected 4 input dimensions "
+                             "(batchsize, channels, 2 spatial dimensions)."
+                             % (self.input_shape,))
+        self.pool_size = as_tuple(pool_size, 2)
+        if stride is None:
+            self.stride = self.pool_size
+        else:
+            self.stride = as_tuple(stride, 2)
+        self.pad = as_tuple(pad, 2)
+        self.mode = mode
+        # The ignore_border argument is for compatibility with MaxPool2DLayer.
+        # ignore_border=False is not supported. Borders are always ignored.
+        if not ignore_border:
+            raise NotImplementedError("Pool2DDNNLayer does not support "
+                                      "ignore_border=False.")
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+
+        output_shape[2] = pool_output_length(input_shape[2],
+                                             pool_size=self.pool_size[0],
+                                             stride=self.stride[0],
+                                             pad=self.pad[0],
+                                             ignore_border=True,
+                                             )
+
+        output_shape[3] = pool_output_length(input_shape[3],
+                                             pool_size=self.pool_size[1],
+                                             stride=self.stride[1],
+                                             pad=self.pad[1],
+                                             ignore_border=True,
+                                             )
+
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        return dnn.dnn_pool(input, self.pool_size, self.stride,
+                            self.mode, self.pad)
+
+
+class MaxPool2DDNNLayer(Pool2DDNNLayer):
+    """
+    2D max-pooling layer
+
+    Subclass of :class:`Pool2DDNNLayer` fixing ``mode='max'``, provided for
+    compatibility with other ``MaxPool2DLayer`` classes.
+    """
+    def __init__(self, incoming, pool_size, stride=None,
+                 pad=(0, 0), ignore_border=True, **kwargs):
+        super(MaxPool2DDNNLayer, self).__init__(incoming, pool_size, stride,
+                                                pad, ignore_border, mode='max',
+                                                **kwargs)
+
+
+class Pool3DDNNLayer(Layer):
+    """
+    3D pooling layer
+
+    Performs 3D mean- or max-pooling over the 3 trailing axes of a 5D input
+    tensor. This is an alternative implementation which uses
+    ``theano.sandbox.cuda.dnn.dnn_pool`` directly.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region in each dimension. If an integer, it
+        is promoted to a cubic pooling region. If an iterable, it should have
+        three elements.
+
+    stride : integer, iterable or ``None``
+        The strides between successive pooling regions in each dimension.
+        If ``None`` then ``stride = pool_size``.
+
+    pad : integer or iterable
+        Number of elements to be added on each side of the input
+        in each dimension. Each value must be less than
+        the corresponding stride.
+
+    ignore_border : bool (default: True)
+        This implementation never includes partial pooling regions, so this
+        argument must always be set to True. It exists only to make sure the
+        interface is compatible with :class:`lasagne.layers.MaxPool2DLayer`.
+
+    mode : string
+        Pooling mode, one of 'max', 'average_inc_pad' or 'average_exc_pad'.
+        Defaults to 'max'.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
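+    Examples
+    --------
+    A minimal usage sketch (the shapes are illustrative; a CUDA-capable GPU
+    with cuDNN is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.dnn import Pool3DDNNLayer
+
+        # 5D input: (batch size, channels, depth, rows, columns)
+        l_in = InputLayer((None, 8, 16, 16, 16))
+        # 2x2x2 max-pooling with stride 2 halves each spatial dimension
+        l_pool = Pool3DDNNLayer(l_in, pool_size=2)
+        # l_pool.output_shape == (None, 8, 8, 8, 8)
+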
+    """
+    def __init__(self, incoming, pool_size, stride=None, pad=(0, 0, 0),
+                 ignore_border=True, mode='max', **kwargs):
+        super(Pool3DDNNLayer, self).__init__(incoming, **kwargs)
+        if len(self.input_shape) != 5:
+            raise ValueError("Tried to create a 3D pooling layer with "
+                             "input shape %r. Expected 5 input dimensions "
+                             "(batchsize, channels, 3 spatial dimensions)."
+                             % (self.input_shape,))
+        self.pool_size = as_tuple(pool_size, 3)
+        if stride is None:
+            self.stride = self.pool_size
+        else:
+            self.stride = as_tuple(stride, 3)
+        self.pad = as_tuple(pad, 3)
+        self.mode = mode
+        # The ignore_border argument is for compatibility with MaxPool2DLayer.
+        # ignore_border=False is not supported. Borders are always ignored.
+        if not ignore_border:
+            raise NotImplementedError("Pool3DDNNLayer does not support "
+                                      "ignore_border=False.")
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+
+        output_shape[2] = pool_output_length(input_shape[2],
+                                             pool_size=self.pool_size[0],
+                                             stride=self.stride[0],
+                                             pad=self.pad[0],
+                                             ignore_border=True,
+                                             )
+
+        output_shape[3] = pool_output_length(input_shape[3],
+                                             pool_size=self.pool_size[1],
+                                             stride=self.stride[1],
+                                             pad=self.pad[1],
+                                             ignore_border=True,
+                                             )
+
+        output_shape[4] = pool_output_length(input_shape[4],
+                                             pool_size=self.pool_size[2],
+                                             stride=self.stride[2],
+                                             pad=self.pad[2],
+                                             ignore_border=True,
+                                             )
+
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        return dnn.dnn_pool(input, self.pool_size, self.stride,
+                            self.mode, self.pad)
+
+
+class MaxPool3DDNNLayer(Pool3DDNNLayer):
+    """
+    3D max-pooling layer
+
+    Subclass of :class:`Pool3DDNNLayer` fixing ``mode='max'``, provided for
+    consistency with ``MaxPool2DLayer`` classes.
+    """
+    def __init__(self, incoming, pool_size, stride=None,
+                 pad=(0, 0, 0), ignore_border=True, **kwargs):
+        super(MaxPool3DDNNLayer, self).__init__(incoming, pool_size, stride,
+                                                pad, ignore_border, mode='max',
+                                                **kwargs)
+
+
+class Conv2DDNNLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv2DDNNLayer(incoming, num_filters, filter_size,
+    stride=(1, 1), pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False,
+    **kwargs)
+
+    2D convolutional layer
+
+    Performs a 2D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.  This is an alternative implementation
+    which uses ``theano.sandbox.cuda.dnn.dnn_conv`` directly.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 2-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 2-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders, a tuple of two integers allows different symmetric padding
+        per dimension.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        3D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 4D tensor with shape
+        ``(num_filters, num_input_channels, filter_rows, filter_columns)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters and perform a convolution, or not to flip
+        them and perform a correlation. Flipping adds a bit of overhead, so it
+        is disabled by default. In most cases this does not make a difference
+        anyway because the filters are learnt. However, ``flip_filters`` should
+        be set to ``True`` if, for example, the layer is initialized with
+        weights that were learnt using a regular
+        :class:`lasagne.layers.Conv2DLayer`.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
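+
+    Examples
+    --------
+    A minimal usage sketch (the shapes and hyperparameters are illustrative;
+    a CUDA-capable GPU with cuDNN is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.dnn import Conv2DDNNLayer
+
+        l_in = InputLayer((None, 3, 64, 64))
+        # 'same' padding keeps the spatial size for odd filters and stride 1
+        l_conv = Conv2DDNNLayer(l_in, num_filters=32, filter_size=(3, 3),
+                                pad='same')
+        # l_conv.output_shape == (None, 32, 64, 64)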
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1),
+                 pad=0, untie_biases=False, W=init.GlorotUniform(),
+                 b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
+                 flip_filters=False, **kwargs):
+        super(Conv2DDNNLayer, self).__init__(incoming, num_filters,
+                                             filter_size, stride, pad,
+                                             untie_biases, W, b, nonlinearity,
+                                             flip_filters, n=2, **kwargs)
+
+    def convolve(self, input, **kwargs):
+        # by default we assume 'cross', consistent with corrmm.
+        conv_mode = 'conv' if self.flip_filters else 'cross'
+        border_mode = self.pad
+        if border_mode == 'same':
+            border_mode = tuple(s // 2 for s in self.filter_size)
+
+        conved = dnn.dnn_conv(img=input,
+                              kerns=self.W,
+                              subsample=self.stride,
+                              border_mode=border_mode,
+                              conv_mode=conv_mode
+                              )
+        return conved
+
+
+class Conv3DDNNLayer(BaseConvLayer):
+    """
+    lasagne.layers.Conv3DDNNLayer(incoming, num_filters, filter_size,
+    stride=(1, 1, 1), pad=0, untie_biases=False,
+    W=lasagne.init.GlorotUniform(), b=lasagne.init.Constant(0.),
+    nonlinearity=lasagne.nonlinearities.rectify, flip_filters=False,
+    **kwargs)
+
+    3D convolutional layer
+
+    Performs a 3D convolution on its input and optionally adds a bias and
+    applies an elementwise nonlinearity.  This implementation uses
+    ``theano.sandbox.cuda.dnn.dnn_conv3d`` directly.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 5D tensor, with shape ``(batch_size,
+        num_input_channels, input_depth, input_rows, input_columns)``.
+
+    num_filters : int
+        The number of learnable convolutional filters this layer has.
+
+    filter_size : int or iterable of int
+        An integer or a 3-element tuple specifying the size of the filters.
+
+    stride : int or iterable of int
+        An integer or a 3-element tuple specifying the stride of the
+        convolution operation.
+
+    pad : int, iterable of int, 'full', 'same' or 'valid' (default: 0)
+        By default, the convolution is only computed where the input and the
+        filter fully overlap (a valid convolution). When ``stride=1``, this
+        yields an output that is smaller than the input by ``filter_size - 1``.
+        The `pad` argument allows you to implicitly pad the input with zeros,
+        extending the output size.
+
+        A single integer results in symmetric zero-padding of the given size on
+        all borders, a tuple of three integers allows different symmetric
+        padding per dimension.
+
+        ``'full'`` pads with one less than the filter size on both sides. This
+        is equivalent to computing the convolution wherever the input and the
+        filter overlap by at least one position.
+
+        ``'same'`` pads with half the filter size (rounded down) on both sides.
+        When ``stride=1`` this results in an output size equal to the input
+        size. Even filter size is not supported.
+
+        ``'valid'`` is an alias for ``0`` (no padding / a valid convolution).
+
+        Note that ``'full'`` and ``'same'`` can be faster than equivalent
+        integer values due to optimizations by Theano.
+
+    untie_biases : bool (default: False)
+        If ``False``, the layer will have a bias parameter for each channel,
+        which is shared across all positions in this channel. As a result, the
+        `b` attribute will be a vector (1D).
+
+        If ``True``, the layer will have separate bias parameters for each
+        position in each channel. As a result, the `b` attribute will be a
+        4D tensor.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the weights.
+        These should be a 5D tensor with shape ``(num_filters,
+        num_input_channels, filter_depth, filter_rows, filter_columns)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases. Otherwise, biases should be
+        a 1D array with shape ``(num_filters,)`` if `untie_biases` is set to
+        ``False``. If it is set to ``True``, its shape should be
+        ``(num_filters, output_depth, output_rows, output_columns)`` instead.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
+
+    flip_filters : bool (default: False)
+        Whether to flip the filters and perform a convolution, or not to flip
+        them and perform a correlation. Flipping adds a bit of overhead, so it
+        is disabled by default. In most cases this does not make a difference
+        anyway because the filters are learned, but if you want to compute
+        predictions with pre-trained weights, take care if they need flipping.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    Attributes
+    ----------
+    W : Theano shared variable or expression
+        Variable or expression representing the filter weights.
+
+    b : Theano shared variable or expression
+        Variable or expression representing the biases.
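+
+    Examples
+    --------
+    A minimal usage sketch (the shapes and hyperparameters are illustrative;
+    a CUDA-capable GPU with cuDNN is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.dnn import Conv3DDNNLayer
+
+        # 5D input: (batch size, channels, depth, rows, columns)
+        l_in = InputLayer((None, 1, 16, 64, 64))
+        l_conv = Conv3DDNNLayer(l_in, num_filters=8, filter_size=(3, 3, 3),
+                                pad='same')
+        # l_conv.output_shape == (None, 8, 16, 64, 64)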
+    """
+    def __init__(self, incoming, num_filters, filter_size, stride=(1, 1, 1),
+                 pad=0, untie_biases=False, W=init.GlorotUniform(),
+                 b=init.Constant(0.), nonlinearity=nonlinearities.rectify,
+                 flip_filters=False, **kwargs):
+        super(Conv3DDNNLayer, self).__init__(incoming, num_filters,
+                                             filter_size, stride, pad,
+                                             untie_biases, W, b, nonlinearity,
+                                             flip_filters, n=3, **kwargs)
+
+    def convolve(self, input, **kwargs):
+        # by default we assume 'cross', consistent with corrmm.
+        conv_mode = 'conv' if self.flip_filters else 'cross'
+        border_mode = self.pad
+        if border_mode == 'same':
+            border_mode = tuple(s // 2 for s in self.filter_size)
+
+        conved = dnn.dnn_conv3d(img=input,
+                                kerns=self.W,
+                                subsample=self.stride,
+                                border_mode=border_mode,
+                                conv_mode=conv_mode
+                                )
+        return conved
+
+
+class SpatialPyramidPoolingDNNLayer(Layer):
+    """
+    Spatial Pyramid Pooling Layer
+
+    Performs spatial pyramid pooling (SPP) over the input.
+    It will turn input feature maps of arbitrary spatial size into an output
+    of fixed dimension.
+    Hence, the convolutional part of a DNN can be connected to a dense part
+    with a fixed number of nodes even if the dimensions of the
+    input image are unknown.
+
+    The pooling is performed over :math:`l` pooling levels.
+    Each pooling level :math:`i` creates :math:`M_i = n_i^2` output features,
+    where :math:`n_i` is the number of pooling operations per dimension in
+    level :math:`i`. The list of the :math:`n_i` values is passed to the
+    layer as the ``pool_dims`` parameter; its length is the number of levels
+    of the spatial pyramid.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_dims : list of integers
+        The list of :math:`n_i`'s that define the output dimension of each
+        pooling level :math:`i`. The length of ``pool_dims`` is the number of
+        levels of the spatial pyramid.
+
+    mode : string
+        Pooling mode, one of 'max', 'average_inc_pad' or 'average_exc_pad'.
+        Defaults to 'max'.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    This layer should be inserted between the convolutional part of a
+    DNN and its dense part. Convolutions can be used for
+    arbitrary input dimensions, but the size of their output will
+    depend on their input dimensions. Connecting the output of the
+    convolutional part to the dense part then usually requires fixing
+    the dimensions of the network's :class:`InputLayer`.
+    The spatial pyramid pooling layer, however, allows us to leave the
+    network input dimensions arbitrary. The advantage over a global
+    pooling layer is the added robustness against object deformations
+    due to the pooling on different scales.
+
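+    Examples
+    --------
+    With the default ``pool_dims=[4, 2, 1]``, the layer produces
+    :math:`4^2 + 2^2 + 1^2 = 21` features per input channel, regardless of
+    the spatial input size. A minimal usage sketch (the shapes are
+    illustrative; a CUDA-capable GPU with cuDNN is assumed)::
+
+        from lasagne.layers import InputLayer
+        from lasagne.layers.dnn import SpatialPyramidPoolingDNNLayer
+
+        # arbitrary (even unknown) spatial input size
+        l_in = InputLayer((None, 16, None, None))
+        l_spp = SpatialPyramidPoolingDNNLayer(l_in)
+        # l_spp.output_shape == (None, 16, 21)
+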
+    References
+    ----------
+    .. [1] He, Kaiming et al (2015):
+           Spatial Pyramid Pooling in Deep Convolutional Networks
+           for Visual Recognition.
+           http://arxiv.org/pdf/1406.4729.pdf.
+    """
+    def __init__(self, incoming, pool_dims=[4, 2, 1], mode='max', **kwargs):
+        super(SpatialPyramidPoolingDNNLayer, self).__init__(incoming, **kwargs)
+        if len(self.input_shape) != 4:
+            raise ValueError("Tried to create a SPP layer with "
+                             "input shape %r. Expected 4 input dimensions "
+                             "(batchsize, channels, 2 spatial dimensions)."
+                             % (self.input_shape,))
+        self.mode = mode
+        self.pool_dims = pool_dims
+
+    def get_output_for(self, input, **kwargs):
+        input_size = tuple(symb if fixed is None else fixed
+                           for fixed, symb
+                           in zip(self.input_shape[2:], input.shape[2:]))
+        pool_list = []
+        for pool_dim in self.pool_dims:
+            win_size = tuple((i + pool_dim - 1) // pool_dim
+                             for i in input_size)
+            str_size = tuple(i // pool_dim for i in input_size)
+
+            pool = dnn.dnn_pool(input, win_size, str_size, self.mode, (0, 0))
+            pool = pool.flatten(3)
+            pool_list.append(pool)
+
+        return theano.tensor.concatenate(pool_list, axis=2)
+
+    def get_output_shape_for(self, input_shape):
+        num_features = sum(p*p for p in self.pool_dims)
+        return (input_shape[0], input_shape[1], num_features)
diff --git a/lasagne/layers/embedding.py b/lasagne/layers/embedding.py
new file mode 100644
index 0000000..b05cb19
--- /dev/null
+++ b/lasagne/layers/embedding.py
@@ -0,0 +1,69 @@
+import numpy as np
+import theano.tensor as T
+
+from .. import init
+from .base import Layer
+
+
+__all__ = [
+    "EmbeddingLayer"
+]
+
+
+class EmbeddingLayer(Layer):
+    """
+    lasagne.layers.EmbeddingLayer(incoming, input_size, output_size,
+    W=lasagne.init.Normal(), **kwargs)
+
+    A layer for word embeddings. The input should be an integer type
+    Tensor variable.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    input_size : int
+        The number of different embeddings. The last embedding will have index
+        ``input_size - 1``.
+
+    output_size : int
+        The size of each embedding.
+
+    W : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the embedding matrix.
+        This should be a matrix with shape ``(input_size, output_size)``.
+        See :func:`lasagne.utils.create_param` for more information.
+
+    Examples
+    --------
+    >>> from lasagne.layers import EmbeddingLayer, InputLayer, get_output
+    >>> import theano
+    >>> x = T.imatrix()
+    >>> l_in = InputLayer((3, ))
+    >>> W = np.arange(3*5).reshape((3, 5)).astype('float32')
+    >>> l1 = EmbeddingLayer(l_in, input_size=3, output_size=5, W=W)
+    >>> output = get_output(l1, x)
+    >>> f = theano.function([x], output)
+    >>> x_test = np.array([[0, 2], [1, 2]]).astype('int32')
+    >>> f(x_test)
+    array([[[  0.,   1.,   2.,   3.,   4.],
+            [ 10.,  11.,  12.,  13.,  14.]],
+    <BLANKLINE>
+           [[  5.,   6.,   7.,   8.,   9.],
+            [ 10.,  11.,  12.,  13.,  14.]]], dtype=float32)
+    """
+    def __init__(self, incoming, input_size, output_size,
+                 W=init.Normal(), **kwargs):
+        super(EmbeddingLayer, self).__init__(incoming, **kwargs)
+
+        self.input_size = input_size
+        self.output_size = output_size
+
+        self.W = self.add_param(W, (input_size, output_size), name="W")
+
+    def get_output_shape_for(self, input_shape):
+        return input_shape + (self.output_size, )
+
+    def get_output_for(self, input, **kwargs):
+        return self.W[input]
diff --git a/lasagne/layers/helper.py b/lasagne/layers/helper.py
new file mode 100644
index 0000000..02a2039
--- /dev/null
+++ b/lasagne/layers/helper.py
@@ -0,0 +1,520 @@
+from collections import deque
+from difflib import get_close_matches
+from inspect import getargspec
+from itertools import chain
+from warnings import warn
+
+import theano
+import numpy as np
+
+from .. import utils
+
+
+__all__ = [
+    "get_all_layers",
+    "get_output",
+    "get_output_shape",
+    "get_all_params",
+    "count_params",
+    "get_all_param_values",
+    "set_all_param_values",
+]
+
+
+def get_all_layers(layer, treat_as_input=None):
+    """
+    This function gathers all layers below one or more given :class:`Layer`
+    instances, including the given layer(s). Its main use is to collect all
+    layers of a network just given the output layer(s). The layers are
+    guaranteed to be returned in a topological order: a layer in the result
+    list is always preceded by all layers its input depends on.
+
+    Parameters
+    ----------
+    layer : Layer or list
+        the :class:`Layer` instance for which to gather all layers feeding
+        into it, or a list of :class:`Layer` instances.
+
+    treat_as_input : None or iterable
+        an iterable of :class:`Layer` instances to treat as input layers
+        with no layers feeding into them. They will show up in the result
+        list, but their incoming layers will not be collected (unless they
+        are required for other layers as well).
+
+    Returns
+    -------
+    list
+        a list of :class:`Layer` instances feeding into the given
+        instance(s) either directly or indirectly, and the given
+        instance(s) themselves, in topological order.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+    >>> get_all_layers(l1) == [l_in, l1]
+    True
+    >>> l2 = DenseLayer(l_in, num_units=10)
+    >>> get_all_layers([l2, l1]) == [l_in, l2, l1]
+    True
+    >>> get_all_layers([l1, l2]) == [l_in, l1, l2]
+    True
+    >>> l3 = DenseLayer(l2, num_units=20)
+    >>> get_all_layers(l3) == [l_in, l2, l3]
+    True
+    >>> get_all_layers(l3, treat_as_input=[l2]) == [l2, l3]
+    True
+    """
+    # We perform a depth-first search. We add a layer to the result list only
+    # after adding all its incoming layers (if any) or when detecting a cycle.
+    # We use a LIFO stack to avoid ever running into recursion depth limits.
+    try:
+        queue = deque(layer)
+    except TypeError:
+        queue = deque([layer])
+    seen = set()
+    done = set()
+    result = []
+
+    # If treat_as_input is given, we pretend we've already collected all their
+    # incoming layers.
+    if treat_as_input is not None:
+        seen.update(treat_as_input)
+
+    while queue:
+        # Peek at the leftmost node in the queue.
+        layer = queue[0]
+        if layer is None:
+            # Some node had an input_layer set to `None`. Just ignore it.
+            queue.popleft()
+        elif layer not in seen:
+            # We haven't seen this node yet: Mark it and queue all incomings
+            # to be processed first. If there are no incomings, the node will
+            # be appended to the result list in the next iteration.
+            seen.add(layer)
+            if hasattr(layer, 'input_layers'):
+                queue.extendleft(reversed(layer.input_layers))
+            elif hasattr(layer, 'input_layer'):
+                queue.appendleft(layer.input_layer)
+        else:
+            # We've been here before: Either we've finished all its incomings,
+            # or we've detected a cycle. In both cases, we remove the layer
+            # from the queue and append it to the result list.
+            queue.popleft()
+            if layer not in done:
+                result.append(layer)
+                done.add(layer)
+
+    return result
+
+
+def get_output(layer_or_layers, inputs=None, **kwargs):
+    """
+    Computes the output of the network at one or more given layers.
+    Optionally, you can define the input(s) to propagate through the network
+    instead of using the input variable(s) associated with the network's
+    input layer(s).
+
+    Parameters
+    ----------
+    layer_or_layers : Layer or list
+        the :class:`Layer` instance for which to compute the output
+        expressions, or a list of :class:`Layer` instances.
+
+    inputs : None, Theano expression, numpy array, or dict
+        If None, uses the input variables associated with the
+        :class:`InputLayer` instances.
+        If a Theano expression, this defines the input for a single
+        :class:`InputLayer` instance. Will throw a ValueError if there
+        are multiple :class:`InputLayer` instances.
+        If a numpy array, this will be wrapped as a Theano constant
+        and used just like a Theano expression.
+        If a dictionary, any :class:`Layer` instance (including the
+        input layers) can be mapped to a Theano expression or numpy
+        array to use instead of its regular output.
+
+    Returns
+    -------
+    output : Theano expression or list
+        the output of the given layer(s) for the given network input
+
+    Notes
+    -----
+    Depending on your network architecture, `get_output([l1, l2])` may
+    be crucially different from `[get_output(l1), get_output(l2)]`. Only
+    the former ensures that the output expressions depend on the same
+    intermediate expressions. For example, when `l1` and `l2` depend on
+    a common dropout layer, the former will use the same dropout mask for
+    both, while the latter will use two different dropout masks.
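+
+    Examples
+    --------
+    A minimal sketch of the usual pattern; the network and variable names
+    here are only illustrative:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.layers import InputLayer, DenseLayer, get_output
+    >>> l_in = InputLayer((None, 20))
+    >>> l_out = DenseLayer(l_in, num_units=10)
+    >>> x = T.matrix('x')
+    >>> y_train = get_output(l_out, x, deterministic=False)
+    >>> y_test = get_output(l_out, x, deterministic=True)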
+    """
+    from .input import InputLayer
+    from .base import MergeLayer
+    # track accepted kwargs used by get_output_for
+    accepted_kwargs = {'deterministic'}
+    # obtain topological ordering of all layers the output layer(s) depend on
+    treat_as_input = inputs.keys() if isinstance(inputs, dict) else []
+    all_layers = get_all_layers(layer_or_layers, treat_as_input)
+    # initialize layer-to-expression mapping from all input layers
+    all_outputs = dict((layer, layer.input_var)
+                       for layer in all_layers
+                       if isinstance(layer, InputLayer) and
+                       layer not in treat_as_input)
+    # update layer-to-expression mapping from given input(s), if any
+    if isinstance(inputs, dict):
+        all_outputs.update((layer, utils.as_theano_expression(expr))
+                           for layer, expr in inputs.items())
+    elif inputs is not None:
+        if len(all_outputs) > 1:
+            raise ValueError("get_output() was called with a single input "
+                             "expression on a network with multiple input "
+                             "layers. Please call it with a dictionary of "
+                             "input expressions instead.")
+        for input_layer in all_outputs:
+            all_outputs[input_layer] = utils.as_theano_expression(inputs)
+    # update layer-to-expression mapping by propagating the inputs
+    for layer in all_layers:
+        if layer not in all_outputs:
+            try:
+                if isinstance(layer, MergeLayer):
+                    layer_inputs = [all_outputs[input_layer]
+                                    for input_layer in layer.input_layers]
+                else:
+                    layer_inputs = all_outputs[layer.input_layer]
+            except KeyError:
+                # one of the input_layer attributes must have been `None`
+                raise ValueError("get_output() was called without giving an "
+                                 "input expression for the free-floating "
+                                 "layer %r. Please call it with a dictionary "
+                                 "mapping this layer to an input expression."
+                                 % layer)
+            all_outputs[layer] = layer.get_output_for(layer_inputs, **kwargs)
+            try:
+                names, _, _, defaults = getargspec(layer.get_output_for)
+            except TypeError:
+                # If introspection is not possible, skip it
+                pass
+            else:
+                if defaults is not None:
+                    accepted_kwargs |= set(names[-len(defaults):])
+            accepted_kwargs |= set(layer.get_output_kwargs)
+    unused_kwargs = set(kwargs.keys()) - accepted_kwargs
+    if unused_kwargs:
+        suggestions = []
+        for kwarg in unused_kwargs:
+            suggestion = get_close_matches(kwarg, accepted_kwargs)
+            if suggestion:
+                suggestions.append('%s (perhaps you meant %s)'
+                                   % (kwarg, suggestion[0]))
+            else:
+                suggestions.append(kwarg)
+        warn("get_output() was called with unused kwargs:\n\t%s"
+             % "\n\t".join(suggestions))
+    # return the output(s) of the requested layer(s) only
+    try:
+        return [all_outputs[layer] for layer in layer_or_layers]
+    except TypeError:
+        return all_outputs[layer_or_layers]
+
+
+def get_output_shape(layer_or_layers, input_shapes=None):
+    """
+    Computes the output shape of the network at one or more given layers.
+
+    Parameters
+    ----------
+    layer_or_layers : Layer or list
+        the :class:`Layer` instance for which to compute the output
+        shapes, or a list of :class:`Layer` instances.
+
+    input_shapes : None, tuple, or dict
+        If None, uses the input shapes associated with the
+        :class:`InputLayer` instances.
+        If a tuple, this defines the input shape for a single
+        :class:`InputLayer` instance. Will throw a ValueError if there
+        are multiple :class:`InputLayer` instances.
+        If a dictionary, any :class:`Layer` instance (including the
+        input layers) can be mapped to a shape tuple to use instead of
+        its regular output shape.
+
+    Returns
+    -------
+    tuple or list
+        the output shape of the given layer(s) for the given network input
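+
+    Examples
+    --------
+    A brief sketch of both call styles; the layers and shapes are only
+    illustrative:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, get_output_shape
+    >>> l_in = InputLayer((None, 20))
+    >>> l1 = DenseLayer(l_in, num_units=30)
+    >>> get_output_shape(l1)
+    (None, 30)
+    >>> get_output_shape(l1, {l_in: (64, 20)})
+    (64, 30)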
+    """
+    # shortcut: return precomputed shapes if we do not need to propagate any
+    if input_shapes is None or input_shapes == {}:
+        try:
+            return [layer.output_shape for layer in layer_or_layers]
+        except TypeError:
+            return layer_or_layers.output_shape
+
+    from .input import InputLayer
+    from .base import MergeLayer
+    # obtain topological ordering of all layers the output layer(s) depend on
+    if isinstance(input_shapes, dict):
+        treat_as_input = input_shapes.keys()
+    else:
+        treat_as_input = []
+
+    all_layers = get_all_layers(layer_or_layers, treat_as_input)
+    # initialize layer-to-shape mapping from all input layers
+    all_shapes = dict((layer, layer.shape)
+                      for layer in all_layers
+                      if isinstance(layer, InputLayer) and
+                      layer not in treat_as_input)
+    # update layer-to-shape mapping from given input(s), if any
+    if isinstance(input_shapes, dict):
+        all_shapes.update(input_shapes)
+    elif input_shapes is not None:
+        if len(all_shapes) > 1:
+            raise ValueError("get_output_shape() was called with a single "
+                             "input shape on a network with multiple input "
+                             "layers. Please call it with a dictionary of "
+                             "input shapes instead.")
+        for input_layer in all_shapes:
+            all_shapes[input_layer] = input_shapes
+    # update layer-to-shape mapping by propagating the input shapes
+    for layer in all_layers:
+        if layer not in all_shapes:
+            if isinstance(layer, MergeLayer):
+                input_shapes = [all_shapes[input_layer]
+                                for input_layer in layer.input_layers]
+            else:
+                input_shapes = all_shapes[layer.input_layer]
+            all_shapes[layer] = layer.get_output_shape_for(input_shapes)
+    # return the output shape(s) of the requested layer(s) only
+    try:
+        return [all_shapes[layer] for layer in layer_or_layers]
+    except TypeError:
+        return all_shapes[layer_or_layers]
+
+
+def get_all_params(layer, unwrap_shared=True, **tags):
+    """
+    Returns a list of Theano shared variables or expressions that
+    parameterize the layer.
+
+    This function gathers all parameters of all layers below one or more given
+    :class:`Layer` instances, including the layer(s) themselves. Its main use
+    is to collect all parameters of a network just given the output layer(s).
+
+    By default, all shared variables that participate in the forward pass will
+    be returned. The list can optionally be filtered by specifying tags as
+    keyword arguments. For example, ``trainable=True`` will only return
+    trainable parameters, and ``regularizable=True`` will only return
+    parameters that can be regularized (e.g., by L2 decay).
+
+    Parameters
+    ----------
+    layer : Layer or list
+        The :class:`Layer` instance for which to gather all parameters, or a
+        list of :class:`Layer` instances.
+
+    unwrap_shared : bool (default: True)
+        Affects only parameters that were set to a Theano expression. If
+        ``True`` the function returns the shared variables contained in
+        the expression, otherwise the Theano expression itself.
+
+    **tags (optional)
+        tags can be specified to filter the list. Specifying ``tag1=True``
+        will limit the list to parameters that are tagged with ``tag1``.
+        Specifying ``tag1=False`` will limit the list to parameters that
+        are not tagged with ``tag1``. Commonly used tags are
+        ``regularizable`` and ``trainable``.
+
+    Returns
+    -------
+    params : list
+        A list of Theano shared variables or expressions representing
+        the parameters.
+
+    Notes
+    -----
+    If any of the layers' parameters was set to a Theano expression instead
+    of a shared variable, `unwrap_shared` controls whether to return the
+    shared variables involved in that expression (``unwrap_shared=True``,
+    the default), or the expression itself (``unwrap_shared=False``). In
+    either case, tag filtering applies to the expressions, considering all
+    variables within an expression to be tagged the same.
+
+    Examples
+    --------
+    Collecting all parameters from a two-layer network:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+    >>> l2 = DenseLayer(l1, num_units=30)
+    >>> all_params = get_all_params(l2)
+    >>> all_params == [l1.W, l1.b, l2.W, l2.b]
+    True
+
+    Parameters can be filtered by tags, and parameter expressions are
+    unwrapped to return involved shared variables by default:
+
+    >>> from lasagne.utils import floatX
+    >>> w1 = theano.shared(floatX(.01 * np.random.randn(50, 30)))
+    >>> w2 = theano.shared(floatX(1))
+    >>> l2 = DenseLayer(l1, num_units=30, W=theano.tensor.exp(w1) - w2, b=None)
+    >>> all_params = get_all_params(l2, regularizable=True)
+    >>> all_params == [l1.W, w1, w2]
+    True
+
+    When disabling unwrapping, the expression for ``l2.W`` is returned instead:
+
+    >>> all_params = get_all_params(l2, regularizable=True,
+    ...                             unwrap_shared=False)
+    >>> all_params == [l1.W, l2.W]
+    True
+    """
+    layers = get_all_layers(layer)
+    params = chain.from_iterable(l.get_params(
+            unwrap_shared=unwrap_shared, **tags) for l in layers)
+    return utils.unique(params)
+
+
+def count_params(layer, **tags):
+    """
+    This function counts all parameters (i.e., the number of scalar
+    values) of all layers below one or more given :class:`Layer` instances,
+    including the layer(s) themselves.
+
+    This is useful to compare the capacity of various network architectures.
+    All parameters returned by the layers' `get_params` methods are counted.
+
+    Parameters
+    ----------
+    layer : Layer or list
+        The :class:`Layer` instance for which to count the parameters, or a
+        list of :class:`Layer` instances.
+
+    **tags (optional)
+        tags can be specified to filter the list of parameter variables that
+        will be included in the count. Specifying ``tag1=True``
+        will limit the list to parameters that are tagged with ``tag1``.
+        Specifying ``tag1=False`` will limit the list to parameters that
+        are not tagged with ``tag1``. Commonly used tags are
+        ``regularizable`` and ``trainable``.
+
+    Returns
+    -------
+    int
+        The total number of learnable parameters.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+    >>> param_count = count_params(l1)
+    >>> param_count
+    1050
+    >>> param_count == 20 * 50 + 50  # 20 inputs * 50 units + 50 biases
+    True
+    """
+    params = get_all_params(layer, **tags)
+    shapes = [p.get_value().shape for p in params]
+    counts = [np.prod(shape) for shape in shapes]
+    return sum(counts)
+
+
+def get_all_param_values(layer, **tags):
+    """
+    This function returns the values of the parameters of all layers below one
+    or more given :class:`Layer` instances, including the layer(s) themselves.
+
+    This function can be used in conjunction with set_all_param_values to save
+    and restore model parameters.
+
+    Parameters
+    ----------
+    layer : Layer or list
+        The :class:`Layer` instance for which to gather all parameter values,
+        or a list of :class:`Layer` instances.
+
+    **tags (optional)
+        tags can be specified to filter the list. Specifying ``tag1=True``
+        will limit the list to parameters that are tagged with ``tag1``.
+        Specifying ``tag1=False`` will limit the list to parameters that
+        are not tagged with ``tag1``. Commonly used tags are
+        ``regularizable`` and ``trainable``.
+
+    Returns
+    -------
+    list of numpy.array
+        A list of numpy arrays representing the parameter values.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+    >>> all_param_values = get_all_param_values(l1)
+    >>> (all_param_values[0] == l1.W.get_value()).all()
+    True
+    >>> (all_param_values[1] == l1.b.get_value()).all()
+    True
+    """
+    params = get_all_params(layer, **tags)
+    return [p.get_value() for p in params]
+
+
+def set_all_param_values(layer, values, **tags):
+    """
+    Given a list of numpy arrays, this function sets the parameters of all
+    layers below one or more given :class:`Layer` instances (including the
+    layer(s) themselves) to the given values.
+
+    This function can be used in conjunction with get_all_param_values to save
+    and restore model parameters.
+
+    Parameters
+    ----------
+    layer : Layer or list
+        The :class:`Layer` instance for which to set all parameter values, or a
+        list of :class:`Layer` instances.
+
+    values : list of numpy.array
+        A list of numpy arrays representing the parameter values, must match
+        the number of parameters.
+        Every parameter's shape must match the shape of its new value.
+
+    **tags (optional)
+        tags can be specified to filter the list of parameters to be set.
+        Specifying ``tag1=True`` will limit the list to parameters that are
+        tagged with ``tag1``.
+        Specifying ``tag1=False`` will limit the list to parameters that
+        are not tagged with ``tag1``. Commonly used tags are
+        ``regularizable`` and ``trainable``.
+
+    Raises
+    ------
+    ValueError
+        If the number of values is not equal to the number of params, or
+        if a parameter's shape does not match the shape of its new value.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((100, 20))
+    >>> l1 = DenseLayer(l_in, num_units=50)
+    >>> all_param_values = get_all_param_values(l1)
+    >>> # all_param_values is now [l1.W.get_value(), l1.b.get_value()]
+    >>> # ...
+    >>> set_all_param_values(l1, all_param_values)
+    >>> # the parameter values are restored.
+    """
+    params = get_all_params(layer, **tags)
+    if len(params) != len(values):
+        raise ValueError("mismatch: got %d values to set %d parameters" %
+                         (len(values), len(params)))
+
+    for p, v in zip(params, values):
+        if p.get_value().shape != v.shape:
+            raise ValueError("mismatch: parameter has shape %r but value to "
+                             "set has shape %r" %
+                             (p.get_value().shape, v.shape))
+        else:
+            p.set_value(v)
diff --git a/lasagne/layers/input.py b/lasagne/layers/input.py
new file mode 100644
index 0000000..a48cfad
--- /dev/null
+++ b/lasagne/layers/input.py
@@ -0,0 +1,75 @@
+from collections import OrderedDict
+
+import theano
+import theano.tensor as T
+
+from .. import utils
+
+from .base import Layer
+
+
+__all__ = [
+    "InputLayer",
+]
+
+
+class InputLayer(Layer):
+    """
+    This layer holds a symbolic variable that represents a network input. A
+    variable can be specified when the layer is instantiated; otherwise, one
+    is created automatically.
+
+    Parameters
+    ----------
+    shape : tuple of `int` or `None` elements
+        The shape of the input. Any element can be `None` to indicate that the
+        size of that dimension is not fixed at compile time.
+
+    input_var : Theano symbolic variable or `None` (default: `None`)
+        A variable representing a network input. If it is not provided, a
+        variable will be created.
+
+    Raises
+    ------
+    ValueError
+        If the dimension of `input_var` is not equal to `len(shape)`
+
+    Notes
+    -----
+    The first dimension usually indicates the batch size. If you specify it,
+    Theano may apply more optimizations while compiling the training or
+    prediction function, but the compiled function will not accept data of a
+    different batch size at runtime. To compile for a variable batch size, set
+    the first shape element to `None` instead.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer
+    >>> l_in = InputLayer((100, 20))
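+
+    A sketch with a variable batch size and an explicitly provided input
+    variable; the variable name is only illustrative:
+
+    >>> import theano.tensor as T
+    >>> x = T.matrix('x')
+    >>> l_in = InputLayer((None, 20), input_var=x)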
+    """
+    def __init__(self, shape, input_var=None, name=None, **kwargs):
+        self.shape = shape
+        if any(d is not None and d <= 0 for d in self.shape):
+            raise ValueError((
+                "Cannot create InputLayer with a non-positive shape "
+                "dimension. shape=%r, self.name=%r") % (
+                    self.shape, name))
+
+        ndim = len(shape)
+        if input_var is None:
+            # create the right TensorType for the given number of dimensions
+            input_var_type = T.TensorType(theano.config.floatX, [False] * ndim)
+            var_name = ("%s.input" % name) if name is not None else "input"
+            input_var = input_var_type(var_name)
+        else:
+            # ensure the given variable has the correct dimensionality
+            if input_var.ndim != ndim:
+                raise ValueError("shape has %d dimensions, but variable has "
+                                 "%d" % (ndim, input_var.ndim))
+        self.input_var = input_var
+        self.name = name
+        self.params = OrderedDict()
+
+    @Layer.output_shape.getter
+    def output_shape(self):
+        return self.shape
diff --git a/lasagne/layers/merge.py b/lasagne/layers/merge.py
new file mode 100644
index 0000000..737f7be
--- /dev/null
+++ b/lasagne/layers/merge.py
@@ -0,0 +1,403 @@
+import theano.tensor as T
+
+from .base import MergeLayer
+
+
+__all__ = [
+    "autocrop",
+    "autocrop_array_shapes",
+    "ConcatLayer",
+    "concat",
+    "ElemwiseMergeLayer",
+    "ElemwiseSumLayer",
+]
+
+
+def autocrop(inputs, cropping):
+    """
+    Crops the given input arrays.
+
+    Cropping takes a sequence of inputs and crops them per-axis in order to
+    ensure that their sizes are consistent so that they can be combined
+    in an element-wise fashion. If cropping is enabled for a specific axis,
+    the minimum size in that axis of all inputs is computed, and all
+    inputs are cropped to that size.
+
+    The per-axis cropping modes are:
+
+    `None`: this axis is not cropped, inputs are unchanged in this axis
+
+    `'lower'`: inputs are cropped choosing the lower portion in this axis
+    (`a[:crop_size, ...]`)
+
+    `'upper'`: inputs are cropped choosing the upper portion in this axis
+    (`a[-crop_size:, ...]`)
+
+    `'center'`: inputs are cropped choosing the central portion in this axis
+    (``a[offset:offset+crop_size, ...]`` where
+    ``offset = (a.shape[0]-crop_size)//2)``
+
+    Parameters
+    ----------
+    inputs : list of Theano expressions
+        The input arrays in the form of a list of Theano expressions
+
+    cropping : list of cropping modes
+        Cropping modes, one for each axis. If the length of `cropping` is
+        less than the number of axes in the inputs, it is padded with `None`.
+        If `cropping` is None, `inputs` is returned as is.
+
+    Returns
+    -------
+    list of Theano expressions
+        each expression is the cropped version of the corresponding input
+
+    Example
+    -------
+    For example, given three inputs:
+
+    >>> import numpy
+    >>> import theano
+
+    >>> a = numpy.random.random((1, 2, 3, 4))
+    >>> b = numpy.random.random((5, 4, 4, 2))
+    >>> c = numpy.random.random((7, 1, 8, 9))
+
+    Cropping mode for each axis:
+
+    >>> cropping = [None, 'lower', 'center', 'upper']
+
+    Crop (note that the input arrays are converted to Theano vars first,
+    and that the results are converted back from Theano expressions to
+    numpy arrays by calling `eval()`)
+    >>> xa, xb, xc = autocrop([theano.shared(a), \
+                               theano.shared(b), \
+                               theano.shared(c)], cropping)
+    >>> xa, xb, xc = xa.eval(), xb.eval(), xc.eval()
+
+    They will be left as is in axis 0 and cropped in the other three,
+    choosing the lower, center and upper portions:
+
+    Axis 0: choose all, axis 1: lower 1 element,
+    axis 2: central 3 (all) and axis 3: upper 2
+    >>> (xa == a[:, :1, :3, -2:]).all()
+    True
+
+    Axis 0: choose all, axis 1: lower 1 element,
+    axis 2: central 3 starting at 0 and axis 3: upper 2 (all)
+    >>> (xb == b[:, :1, :3, -2:]).all()
+    True
+
+    Axis 0: all, axis 1: lower 1 element (all),
+    axis 2: central 3 starting at 2 and axis 3: upper 2
+    >>> (xc == c[:, :1, 2:5:, -2:]).all()
+    True
+    """
+    if cropping is None:
+        # No cropping in any dimension
+        return inputs
+    else:
+        # Get the number of dimensions
+        ndim = inputs[0].ndim
+        # Check for consistent number of dimensions
+        if not all(input.ndim == ndim for input in inputs):
+            raise ValueError("Not all inputs are of the same "
+                             "dimensionality. Got {0} inputs of "
+                             "dimensionalities {1}.".format(
+                                len(inputs),
+                                [input.ndim for input in inputs]))
+        # Get the shape of each input, where each shape will be a Theano
+        # expression
+        shapes = [input.shape for input in inputs]
+        # Convert the shapes to a matrix expression
+        shapes_tensor = T.as_tensor_variable(shapes)
+        # Min along axis 0 to get the minimum size in each dimension
+        min_shape = T.min(shapes_tensor, axis=0)
+
+        # Nested list of slices; each list in `slices` corresponds to
+        # an input and contains a slice for each dimension
+        slices_by_input = [[] for i in range(len(inputs))]
+
+        # If there are more dimensions than cropping entries, pad
+        # the cropping
+        cropping = list(cropping)
+        if ndim > len(cropping):
+            cropping += [None] * (ndim - len(cropping))
+
+        # For each dimension
+        for dim, cr in enumerate(cropping):
+            if cr is None:
+                # Don't crop this dimension
+                slice_all = slice(None)
+                for slices in slices_by_input:
+                    slices.append(slice_all)
+            else:
+                # We crop all inputs in the dimension `dim` so that they
+                # are the minimum found in this dimension from all inputs
+                sz = min_shape[dim]
+                if cr == 'lower':
+                    # Choose the first `sz` elements
+                    slc_lower = slice(None, sz)
+                    for slices in slices_by_input:
+                        slices.append(slc_lower)
+                elif cr == 'upper':
+                    # Choose the last `sz` elements
+                    slc_upper = slice(-sz, None)
+                    for slices in slices_by_input:
+                        slices.append(slc_upper)
+                elif cr == 'center':
+                    # Choose `sz` elements from the center
+                    for sh, slices in zip(shapes, slices_by_input):
+                        offset = (sh[dim] - sz) // 2
+                        slices.append(slice(offset, offset+sz))
+                else:
+                    raise ValueError(
+                        'Unknown crop mode \'{0}\''.format(cr))
+
+        return [input[slices] for input, slices in
+                zip(inputs, slices_by_input)]
+
+
+def autocrop_array_shapes(input_shapes, cropping):
+    """
+    Computes the shapes of the given arrays after auto-cropping is applied.
+
+    For more information on cropping, see the :func:`autocrop` function
+    documentation.
+
+    Parameters
+    ----------
+    input_shapes : list of tuples
+        The shapes of the input arrays prior to cropping.
+
+    cropping : list of cropping modes
+        Cropping modes, one for each axis. If the length of `cropping` is
+        less than the number of axes in the inputs, it is padded with `None`.
+        If `cropping` is None, `input_shapes` is returned as is. For more
+        information on the values and their operation, see the
+        :func:`autocrop` documentation.
+
+    Returns
+    -------
+    list of tuples
+        each tuple is a cropped version of the corresponding input
+        shape tuple in `input_shapes`
+
+    Examples
+    --------
+    Given three input shapes with 4 axes each:
+
+    >>> a = (1, 2, 3, 4)
+    >>> b = (5, 4, 4, 2)
+    >>> c = (7, 1, 8, 9)
+
+    Cropping mode for each axis:
+
+    >>> cropping = [None, 'lower', 'center', 'upper']
+
+    Apply:
+
+    >>> cropped_shapes = autocrop_array_shapes([a, b, c], cropping)
+    >>> cropped_shapes[0]
+    (1, 1, 3, 2)
+
+    >>> cropped_shapes[1]
+    (5, 1, 3, 2)
+
+    >>> cropped_shapes[2]
+    (7, 1, 3, 2)
+
+    Note that axis 0 remains unchanged, where all the others are cropped
+    to the minimum size in that axis.
+    """
+    if cropping is None:
+        return input_shapes
+    else:
+        # Check for consistent number of dimensions
+        ndim = len(input_shapes[0])
+        if not all(len(sh) == ndim for sh in input_shapes):
+            raise ValueError("Not all inputs are of the same "
+                             "dimensionality. Got {0} inputs of "
+                             "dimensionalities {1}.".format(
+                                len(input_shapes),
+                                [len(sh) for sh in input_shapes]))
+
+        result = []
+
+        # If there are more dimensions than cropping entries, pad
+        # the cropping
+        cropping = list(cropping)
+        if ndim > len(cropping):
+            cropping += [None] * (ndim - len(cropping))
+
+        for sh, cr in zip(zip(*input_shapes), cropping):
+            if cr is None:
+                result.append(sh)
+            elif cr in {'lower', 'center', 'upper'}:
+                result.append([min(sh)] * len(sh))
+            else:
+                raise ValueError('Unknown crop mode \'{0}\''.format(cr))
+        return [tuple(sh) for sh in zip(*result)]
+
+
+class ConcatLayer(MergeLayer):
+    """
+    Concatenates multiple inputs along the specified axis. Inputs should have
+    the same shape except for the dimension specified in axis, which can have
+    different sizes.
+
+    Parameters
+    ----------
+    incomings : a list of :class:`Layer` instances or tuples
+        The layers feeding into this layer, or expected input shapes
+
+    axis : int
+        Axis which inputs are joined over
+
+    cropping : None or [crop]
+        Cropping for each input axis. Cropping is described in the docstring
+        for :func:`autocrop`. Cropping is always disabled for `axis`.
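+
+    Examples
+    --------
+    A small sketch; the shapes are only illustrative:
+
+    >>> from lasagne.layers import InputLayer, ConcatLayer
+    >>> l1 = InputLayer((None, 10))
+    >>> l2 = InputLayer((None, 20))
+    >>> l_concat = ConcatLayer([l1, l2], axis=1)
+    >>> l_concat.output_shape
+    (None, 30)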
+    """
+    def __init__(self, incomings, axis=1, cropping=None, **kwargs):
+        super(ConcatLayer, self).__init__(incomings, **kwargs)
+        self.axis = axis
+        if cropping is not None:
+            # If cropping is enabled, don't crop on the selected axis
+            cropping = list(cropping)
+            cropping[axis] = None
+        self.cropping = cropping
+
+    def get_output_shape_for(self, input_shapes):
+        input_shapes = autocrop_array_shapes(input_shapes, self.cropping)
+        # Infer the output shape by grabbing, for each axis, the first
+        # input size that is not `None` (if there is any)
+        output_shape = [next((s for s in sizes if s is not None), None)
+                        for sizes in zip(*input_shapes)]
+
+        def match(shape1, shape2):
+            axis = self.axis if self.axis >= 0 else len(shape1) + self.axis
+            return (len(shape1) == len(shape2) and
+                    all(i == axis or s1 is None or s2 is None or s1 == s2
+                        for i, (s1, s2) in enumerate(zip(shape1, shape2))))
+
+        # Check for compatibility with inferred output shape
+        if not all(match(shape, output_shape) for shape in input_shapes):
+            raise ValueError("Mismatch: input shapes must be the same except "
+                             "in the concatenation axis")
+        # Infer output shape on concatenation axis and return
+        sizes = [input_shape[self.axis] for input_shape in input_shapes]
+        concat_size = None if any(s is None for s in sizes) else sum(sizes)
+        output_shape[self.axis] = concat_size
+        return tuple(output_shape)
+
+    def get_output_for(self, inputs, **kwargs):
+        inputs = autocrop(inputs, self.cropping)
+        return T.concatenate(inputs, axis=self.axis)
+
+concat = ConcatLayer  # shortcut
+
+
+class ElemwiseMergeLayer(MergeLayer):
+    """
+    This layer performs an elementwise merge of its input layers.
+    It requires all input layers to have the same output shape.
+
+    Parameters
+    ----------
+    incomings : a list of :class:`Layer` instances or tuples
+        the layers feeding into this layer, or expected input shapes,
+        with all incoming shapes being equal
+
+    merge_function : callable
+        the merge function to use. Should take two arguments and return the
+        updated value. Some possible merge functions from ``theano.tensor``
+        are ``mul``, ``add``, ``maximum`` and ``minimum``.
+
+    cropping : None or [crop]
+        Cropping for each input axis. Cropping is described in the docstring
+        for :func:`autocrop`
+
+    See Also
+    --------
+    ElemwiseSumLayer : Shortcut for sum layer.
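+
+    Examples
+    --------
+    A small sketch using an element-wise maximum; the shapes are only
+    illustrative:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.layers import InputLayer, ElemwiseMergeLayer
+    >>> l1 = InputLayer((None, 10))
+    >>> l2 = InputLayer((None, 10))
+    >>> l_max = ElemwiseMergeLayer([l1, l2], merge_function=T.maximum)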
+    """
+
+    def __init__(self, incomings, merge_function, cropping=None, **kwargs):
+        super(ElemwiseMergeLayer, self).__init__(incomings, **kwargs)
+        self.merge_function = merge_function
+        self.cropping = cropping
+
+    def get_output_shape_for(self, input_shapes):
+        input_shapes = autocrop_array_shapes(input_shapes, self.cropping)
+        # Infer the output shape by grabbing, for each axis, the first
+        # input size that is not `None` (if there is any)
+        output_shape = tuple(next((s for s in sizes if s is not None), None)
+                             for sizes in zip(*input_shapes))
+
+        def match(shape1, shape2):
+            return (len(shape1) == len(shape2) and
+                    all(s1 is None or s2 is None or s1 == s2
+                        for s1, s2 in zip(shape1, shape2)))
+
+        # Check for compatibility with inferred output shape
+        if not all(match(shape, output_shape) for shape in input_shapes):
+            raise ValueError("Mismatch: not all input shapes are the same")
+        return output_shape
+
+    def get_output_for(self, inputs, **kwargs):
+        inputs = autocrop(inputs, self.cropping)
+        output = None
+        for input in inputs:
+            if output is not None:
+                output = self.merge_function(output, input)
+            else:
+                output = input
+        return output
+
+
+class ElemwiseSumLayer(ElemwiseMergeLayer):
+    """
+    This layer performs an elementwise sum of its input layers.
+    It requires all input layers to have the same output shape.
+
+    Parameters
+    ----------
+    incomings : a list of :class:`Layer` instances or tuples
+        the layers feeding into this layer, or expected input shapes,
+        with all incoming shapes being equal
+
+    coeffs : list or scalar
+        A list of coefficients of the same length as `incomings`, or a single
+        coefficient that is applied to all inputs. By default, these will not
+        be included in the learnable parameters of this layer.
+
+    cropping : None or [crop]
+        Cropping for each input axis. Cropping is described in the docstring
+        for :func:`autocrop`
+
+    Notes
+    -----
+    Depending on your architecture, this can be used to avoid the more
+    costly :class:`ConcatLayer`. For example, instead of concatenating layers
+    before a :class:`DenseLayer`, insert separate :class:`DenseLayer` instances
+    of the same number of output units and add them up afterwards. (This avoids
+    the copy operations in concatenation, but splits up the dot product.)
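+
+    Examples
+    --------
+    A sketch of the pattern described in the notes above; all sizes are only
+    illustrative:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, ElemwiseSumLayer
+    >>> l_in1 = InputLayer((None, 10))
+    >>> l_in2 = InputLayer((None, 20))
+    >>> l1 = DenseLayer(l_in1, num_units=30, nonlinearity=None)
+    >>> l2 = DenseLayer(l_in2, num_units=30, nonlinearity=None)
+    >>> l_sum = ElemwiseSumLayer([l1, l2])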
+    """
+    def __init__(self, incomings, coeffs=1, cropping=None, **kwargs):
+        super(ElemwiseSumLayer, self).__init__(incomings, T.add,
+                                               cropping=cropping, **kwargs)
+        if isinstance(coeffs, list):
+            if len(coeffs) != len(incomings):
+                raise ValueError("Mismatch: got %d coeffs for %d incomings" %
+                                 (len(coeffs), len(incomings)))
+        else:
+            coeffs = [coeffs] * len(incomings)
+
+        self.coeffs = coeffs
+
+    def get_output_for(self, inputs, **kwargs):
+        # if needed multiply each input by its coefficient
+        inputs = [input * coeff if coeff != 1 else input
+                  for coeff, input in zip(self.coeffs, inputs)]
+
+        # pass scaled inputs to the super class for summing
+        return super(ElemwiseSumLayer, self).get_output_for(inputs, **kwargs)
diff --git a/lasagne/layers/noise.py b/lasagne/layers/noise.py
new file mode 100644
index 0000000..7cbf81e
--- /dev/null
+++ b/lasagne/layers/noise.py
@@ -0,0 +1,136 @@
+import theano
+import theano.tensor as T
+
+from .base import Layer
+from ..random import get_rng
+
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+
+
+__all__ = [
+    "DropoutLayer",
+    "dropout",
+    "GaussianNoiseLayer",
+]
+
+
+class DropoutLayer(Layer):
+    """Dropout layer
+
+    Sets values to zero with probability p. See notes for disabling dropout
+    during testing.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        the layer feeding into this layer, or the expected input shape
+    p : float or scalar tensor
+        The probability of setting a value to zero
+    rescale : bool
+        If true, the input is rescaled with ``input / (1 - p)`` when
+        `deterministic` is false.
+
+    Notes
+    -----
+    The dropout layer is a regularizer that randomly sets input values to
+    zero; see [1]_, [2]_ for why this might improve generalization.
+    During training you should set `deterministic` to false; during testing
+    you should set it to true.
+
+    If `rescale` is true, the input is scaled with ``input / (1 - p)`` when
+    `deterministic` is false; see the references for further discussion. Note
+    that this implementation scales the input at training time rather than
+    scaling the output at test time.
+
+    References
+    ----------
+    .. [1] Hinton, G., Srivastava, N., Krizhevsky, A., Sutskever, I.,
+           Salakhutdinov, R. R. (2012):
+           Improving neural networks by preventing co-adaptation of feature
+           detectors. arXiv preprint arXiv:1207.0580.
+
+    .. [2] Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever,
+           I., & Salakhutdinov, R. R. (2014):
+           Dropout: A Simple Way to Prevent Neural Networks from Overfitting.
+           Journal of Machine Learning Research, 15(1), 1929-1958.
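+
+    Examples
+    --------
+    A minimal sketch; the network is only illustrative:
+
+    >>> from lasagne.layers import InputLayer, DropoutLayer, get_output
+    >>> l_in = InputLayer((None, 20))
+    >>> l_drop = DropoutLayer(l_in, p=0.5)
+    >>> y_train = get_output(l_drop, deterministic=False)
+    >>> y_test = get_output(l_drop, deterministic=True)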
+    """
+    def __init__(self, incoming, p=0.5, rescale=True, **kwargs):
+        super(DropoutLayer, self).__init__(incoming, **kwargs)
+        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
+        self.p = p
+        self.rescale = rescale
+
+    def get_output_for(self, input, deterministic=False, **kwargs):
+        """
+        Parameters
+        ----------
+        input : tensor
+            output from the previous layer
+        deterministic : bool
+            If true, dropout and rescaling are disabled; see notes
+        """
+        if deterministic or self.p == 0:
+            return input
+        else:
+            # Using theano constant to prevent upcasting
+            one = T.constant(1)
+
+            retain_prob = one - self.p
+            if self.rescale:
+                input /= retain_prob
+
+            # use nonsymbolic shape for dropout mask if possible
+            input_shape = self.input_shape
+            if any(s is None for s in input_shape):
+                input_shape = input.shape
+
+            return input * self._srng.binomial(input_shape, p=retain_prob,
+                                               dtype=input.dtype)
+
+dropout = DropoutLayer  # shortcut
+
+
+class GaussianNoiseLayer(Layer):
+    """Gaussian noise layer.
+
+    Add zero-mean Gaussian noise of given standard deviation to the input [1]_.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        the layer feeding into this layer, or the expected input shape
+    sigma : float or tensor scalar
+        Standard deviation of added Gaussian noise
+
+    Notes
+    -----
+    The Gaussian noise layer is a regularizer. During training you should set
+    `deterministic` to false; during testing you should set it to true.
+
+    References
+    ----------
+    .. [1] K.-C. Jim, C. Giles, and B. Horne (1996):
+           An analysis of noise in recurrent neural networks: convergence and
+           generalization.
+           IEEE Transactions on Neural Networks, 7(6):1424-1438.
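+
+    Examples
+    --------
+    A minimal sketch; the shape and sigma are only illustrative:
+
+    >>> from lasagne.layers import InputLayer, GaussianNoiseLayer
+    >>> l_in = InputLayer((None, 20))
+    >>> l_noise = GaussianNoiseLayer(l_in, sigma=0.1)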
+    """
+    def __init__(self, incoming, sigma=0.1, **kwargs):
+        super(GaussianNoiseLayer, self).__init__(incoming, **kwargs)
+        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
+        self.sigma = sigma
+
+    def get_output_for(self, input, deterministic=False, **kwargs):
+        """
+        Parameters
+        ----------
+        input : tensor
+            output from the previous layer
+        deterministic : bool
+            If true, noise is disabled; see notes
+        """
+        if deterministic or self.sigma == 0:
+            return input
+        else:
+            return input + self._srng.normal(input.shape,
+                                             avg=0.0,
+                                             std=self.sigma)
diff --git a/lasagne/layers/normalization.py b/lasagne/layers/normalization.py
new file mode 100644
index 0000000..c16d6a9
--- /dev/null
+++ b/lasagne/layers/normalization.py
@@ -0,0 +1,375 @@
+# -*- coding: utf-8 -*-
+
+"""
+The :class:`LocalResponseNormalization2DLayer
+<lasagne.layers.LocalResponseNormalization2DLayer>` implementation contains
+code from `pylearn2 <http://github.com/lisa-lab/pylearn2>`_, which is covered
+by the following license:
+
+
+Copyright (c) 2011--2014, Université de Montréal
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+import theano
+import theano.tensor as T
+
+from .. import init
+from .. import nonlinearities
+
+from .base import Layer
+
+
+__all__ = [
+    "LocalResponseNormalization2DLayer",
+    "BatchNormLayer",
+    "batch_norm",
+]
+
+
+class LocalResponseNormalization2DLayer(Layer):
+    """
+    Cross-channel Local Response Normalization for 2D feature maps.
+
+    Aggregation is purely across channels, not within channels,
+    and performed "pixelwise".
+
+    If the value of the :math:`i` th channel is :math:`x_i`, the output is
+
+    .. math::
+        x_i = \\frac{x_i}{ (k + ( \\alpha \\sum_j x_j^2 ))^\\beta }
+
+    where the summation is performed over the same spatial position across
+    the :math:`n` neighboring channels.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. Must
+        follow *BC01* layout, i.e., ``(batchsize, channels, rows, columns)``.
+    alpha : float scalar
+        coefficient, see equation above
+    k : float scalar
+        offset, see equation above
+    beta : float scalar
+        exponent, see equation above
+    n : int
+        number of adjacent channels to normalize over, must be odd
+
+    Notes
+    -----
+    This code is adapted from pylearn2. See the module docstring for license
+    information.
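+
+    Examples
+    --------
+    A minimal sketch on a BC01 input; the shape is only illustrative:
+
+    >>> from lasagne.layers import InputLayer
+    >>> from lasagne.layers import LocalResponseNormalization2DLayer
+    >>> l_in = InputLayer((None, 16, 32, 32))
+    >>> l_norm = LocalResponseNormalization2DLayer(l_in, n=5)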
+    """
+
+    def __init__(self, incoming, alpha=1e-4, k=2, beta=0.75, n=5, **kwargs):
+        super(LocalResponseNormalization2DLayer, self).__init__(incoming,
+                                                                **kwargs)
+        self.alpha = alpha
+        self.k = k
+        self.beta = beta
+        self.n = n
+        if n % 2 == 0:
+            raise NotImplementedError("Only works with odd n")
+
+    def get_output_shape_for(self, input_shape):
+        return input_shape
+
+    def get_output_for(self, input, **kwargs):
+        input_shape = self.input_shape
+        if any(s is None for s in input_shape):
+            input_shape = input.shape
+        half_n = self.n // 2
+        input_sqr = T.sqr(input)
+        b, ch, r, c = input_shape
+        extra_channels = T.alloc(0., b, ch + 2*half_n, r, c)
+        input_sqr = T.set_subtensor(extra_channels[:, half_n:half_n+ch, :, :],
+                                    input_sqr)
+        scale = self.k
+        for i in range(self.n):
+            scale += self.alpha * input_sqr[:, i:i+ch, :, :]
+        scale = scale ** self.beta
+        return input / scale
+
+
+class BatchNormLayer(Layer):
+    """
+    lasagne.layers.BatchNormLayer(incoming, axes='auto', epsilon=1e-4,
+    alpha=0.1, beta=lasagne.init.Constant(0), gamma=lasagne.init.Constant(1),
+    mean=lasagne.init.Constant(0), inv_std=lasagne.init.Constant(1), **kwargs)
+
+    Batch Normalization
+
+    This layer implements batch normalization of its inputs, following [1]_:
+
+    .. math::
+        y = \\frac{x - \\mu}{\\sqrt{\\sigma^2 + \\epsilon}} \\gamma + \\beta
+
+    That is, the input is normalized to zero mean and unit variance, and then
+    linearly transformed. The crucial part is that the mean and variance are
+    computed across the batch dimension, i.e., over examples, not per example.
+
+    During training, :math:`\\mu` and :math:`\\sigma^2` are defined to be the
+    mean and variance of the current input mini-batch :math:`x`, and during
+    testing, they are replaced with average statistics over the training
+    data. Consequently, this layer has four stored parameters: :math:`\\beta`,
+    :math:`\\gamma`, and the averages :math:`\\mu` and :math:`\\sigma^2`
+    (nota bene: instead of :math:`\\sigma^2`, the layer actually stores
+    :math:`1 / \\sqrt{\\sigma^2 + \\epsilon}`, for compatibility to cuDNN).
+    By default, this layer learns the average statistics as exponential moving
+    averages computed during training, so it can be plugged into an existing
+    network without any changes of the training procedure (see Notes).
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+    axes : 'auto', int or tuple of int
+        The axis or axes to normalize over. If ``'auto'`` (the default),
+        normalize over all axes except for the second: this will normalize over
+        the minibatch dimension for dense layers, and additionally over all
+        spatial dimensions for convolutional layers.
+    epsilon : scalar
+        Small constant :math:`\\epsilon` added to the variance before taking
+        the square root and dividing by it, to avoid numerical problems
+    alpha : scalar
+        Coefficient for the exponential moving average of batch-wise means and
+        standard deviations computed during training; the closer to one, the
+        more it will depend on the last batches seen
+    beta : Theano shared variable, expression, numpy array, callable or None
+        Initial value, expression or initializer for :math:`\\beta`. Must match
+        the incoming shape, skipping all axes in `axes`. Set to ``None`` to fix
+        it to 0.0 instead of learning it.
+        See :func:`lasagne.utils.create_param` for more information.
+    gamma : Theano shared variable, expression, numpy array, callable or None
+        Initial value, expression or initializer for :math:`\\gamma`. Must
+        match the incoming shape, skipping all axes in `axes`. Set to ``None``
+        to fix it to 1.0 instead of learning it.
+        See :func:`lasagne.utils.create_param` for more information.
+    mean : Theano shared variable, expression, numpy array, or callable
+        Initial value, expression or initializer for :math:`\\mu`. Must match
+        the incoming shape, skipping all axes in `axes`.
+        See :func:`lasagne.utils.create_param` for more information.
+    inv_std : Theano shared variable, expression, numpy array, or callable
+        Initial value, expression or initializer for :math:`1 / \\sqrt{
+        \\sigma^2 + \\epsilon}`. Must match the incoming shape, skipping all
+        axes in `axes`.
+        See :func:`lasagne.utils.create_param` for more information.
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    This layer should be inserted between a linear transformation (such as a
+    :class:`DenseLayer`, or :class:`Conv2DLayer`) and its nonlinearity. The
+    convenience function :func:`batch_norm` modifies an existing layer to
+    insert batch normalization in front of its nonlinearity.
+
+    The behavior can be controlled by passing keyword arguments to
+    :func:`lasagne.layers.get_output()` when building the output expression
+    of any network containing this layer.
+
+    During training, [1]_ normalize each input mini-batch by its statistics
+    and update an exponential moving average of the statistics to be used for
+    validation. This can be achieved by passing ``deterministic=False``.
+    For validation, [1]_ normalize each input mini-batch by the stored
+    statistics. This can be achieved by passing ``deterministic=True``.
+
+    For more fine-grained control, ``batch_norm_update_averages`` can be passed
+    to update the exponential moving averages (``True``) or not (``False``),
+    and ``batch_norm_use_averages`` can be passed to use the exponential moving
+    averages for normalization (``True``) or normalize each mini-batch by its
+    own statistics (``False``). These settings override ``deterministic``.
+
+    Note that for testing a model after training, [1]_ replace the stored
+    exponential moving average statistics by fixing all network weights and
+    re-computing average statistics over the training data in a layerwise
+    fashion. This is not part of the layer implementation.
+
+    In case you set `axes` to not include the batch dimension (the first axis,
+    usually), normalization is done per example, not across examples. This does
+    not require any averages, so you can pass ``batch_norm_update_averages``
+    and ``batch_norm_use_averages`` as ``False`` in this case.
+
+    See also
+    --------
+    batch_norm : Convenience function to apply batch normalization to a layer
+
+    References
+    ----------
+    .. [1] Ioffe, Sergey and Szegedy, Christian (2015):
+           Batch Normalization: Accelerating Deep Network Training by Reducing
+           Internal Covariate Shift. http://arxiv.org/abs/1502.03167.
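+
+    Examples
+    --------
+    A sketch of the manual wiring described in the notes; the convenience
+    function :func:`batch_norm` below wraps this up, and all sizes are only
+    illustrative:
+
+    >>> from lasagne.layers import (InputLayer, DenseLayer, BatchNormLayer,
+    ...                             NonlinearityLayer)
+    >>> from lasagne.nonlinearities import rectify
+    >>> l_in = InputLayer((None, 20))
+    >>> l_dense = DenseLayer(l_in, num_units=50, nonlinearity=None, b=None)
+    >>> l_bn = BatchNormLayer(l_dense)
+    >>> l_out = NonlinearityLayer(l_bn, nonlinearity=rectify)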
+    """
+    def __init__(self, incoming, axes='auto', epsilon=1e-4, alpha=0.1,
+                 beta=init.Constant(0), gamma=init.Constant(1),
+                 mean=init.Constant(0), inv_std=init.Constant(1), **kwargs):
+        super(BatchNormLayer, self).__init__(incoming, **kwargs)
+
+        if axes == 'auto':
+            # default: normalize over all but the second axis
+            axes = (0,) + tuple(range(2, len(self.input_shape)))
+        elif isinstance(axes, int):
+            axes = (axes,)
+        self.axes = axes
+
+        self.epsilon = epsilon
+        self.alpha = alpha
+
+        # create parameters, ignoring all dimensions in axes
+        shape = [size for axis, size in enumerate(self.input_shape)
+                 if axis not in self.axes]
+        if any(size is None for size in shape):
+            raise ValueError("BatchNormLayer needs specified input sizes for "
+                             "all axes not normalized over.")
+        if beta is None:
+            self.beta = None
+        else:
+            self.beta = self.add_param(beta, shape, 'beta',
+                                       trainable=True, regularizable=False)
+        if gamma is None:
+            self.gamma = None
+        else:
+            self.gamma = self.add_param(gamma, shape, 'gamma',
+                                        trainable=True, regularizable=True)
+        self.mean = self.add_param(mean, shape, 'mean',
+                                   trainable=False, regularizable=False)
+        self.inv_std = self.add_param(inv_std, shape, 'inv_std',
+                                      trainable=False, regularizable=False)
+
+    def get_output_for(self, input, deterministic=False,
+                       batch_norm_use_averages=None,
+                       batch_norm_update_averages=None, **kwargs):
+        input_mean = input.mean(self.axes)
+        input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
+
+        # Decide whether to use the stored averages or mini-batch statistics
+        if batch_norm_use_averages is None:
+            batch_norm_use_averages = deterministic
+        use_averages = batch_norm_use_averages
+
+        if use_averages:
+            mean = self.mean
+            inv_std = self.inv_std
+        else:
+            mean = input_mean
+            inv_std = input_inv_std
+
+        # Decide whether to update the stored averages
+        if batch_norm_update_averages is None:
+            batch_norm_update_averages = not deterministic
+        update_averages = batch_norm_update_averages
+
+        if update_averages:
+            # Trick: To update the stored statistics, we create memory-aliased
+            # clones of the stored statistics:
+            running_mean = theano.clone(self.mean, share_inputs=False)
+            running_inv_std = theano.clone(self.inv_std, share_inputs=False)
+            # set a default update for them:
+            running_mean.default_update = ((1 - self.alpha) * running_mean +
+                                           self.alpha * input_mean)
+            running_inv_std.default_update = ((1 - self.alpha) *
+                                              running_inv_std +
+                                              self.alpha * input_inv_std)
+            # and make sure they end up in the graph without participating in
+            # the computation (this way their default_update will be collected
+            # and applied, but the computation will be optimized away):
+            mean += 0 * running_mean
+            inv_std += 0 * running_inv_std
+
+        # prepare dimshuffle pattern inserting broadcastable axes as needed
+        param_axes = iter(range(input.ndim - len(self.axes)))
+        pattern = ['x' if input_axis in self.axes
+                   else next(param_axes)
+                   for input_axis in range(input.ndim)]
+
+        # apply dimshuffle pattern to all parameters
+        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
+        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
+        mean = mean.dimshuffle(pattern)
+        inv_std = inv_std.dimshuffle(pattern)
+
+        # normalize
+        normalized = (input - mean) * (gamma * inv_std) + beta
+        return normalized
+
+
+def batch_norm(layer, **kwargs):
+    """
+    Apply batch normalization to an existing layer. This is a convenience
+    function modifying an existing layer to include batch normalization: It
+    will steal the layer's nonlinearity if there is one (effectively
+    introducing the normalization right before the nonlinearity), remove
+    the layer's bias if there is one (because it would be redundant), and add
+    a :class:`BatchNormLayer` and :class:`NonlinearityLayer` on top.
+
+    Parameters
+    ----------
+    layer : A :class:`Layer` instance
+        The layer to apply the normalization to; note that it will be
+        irreversibly modified as specified above
+    **kwargs
+        Any additional keyword arguments are passed on to the
+        :class:`BatchNormLayer` constructor.
+
+    Returns
+    -------
+    BatchNormLayer or NonlinearityLayer instance
+        A batch normalization layer stacked on the given modified `layer`, or
+        a nonlinearity layer stacked on top of both if `layer` was nonlinear.
+
+    Examples
+    --------
+    Just wrap any layer into a :func:`batch_norm` call on creating it:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, batch_norm
+    >>> from lasagne.nonlinearities import tanh
+    >>> l1 = InputLayer((64, 768))
+    >>> l2 = batch_norm(DenseLayer(l1, num_units=500, nonlinearity=tanh))
+
+    This introduces batch normalization right before its nonlinearity:
+
+    >>> from lasagne.layers import get_all_layers
+    >>> [l.__class__.__name__ for l in get_all_layers(l2)]
+    ['InputLayer', 'DenseLayer', 'BatchNormLayer', 'NonlinearityLayer']
+    """
+    nonlinearity = getattr(layer, 'nonlinearity', None)
+    if nonlinearity is not None:
+        layer.nonlinearity = nonlinearities.identity
+    if hasattr(layer, 'b') and layer.b is not None:
+        del layer.params[layer.b]
+        layer.b = None
+    bn_name = (kwargs.pop('name', None) or
+               (getattr(layer, 'name', None) and layer.name + '_bn'))
+    layer = BatchNormLayer(layer, name=bn_name, **kwargs)
+    if nonlinearity is not None:
+        from .special import NonlinearityLayer
+        nonlin_name = bn_name and bn_name + '_nonlin'
+        layer = NonlinearityLayer(layer, nonlinearity, name=nonlin_name)
+    return layer
diff --git a/lasagne/layers/pool.py b/lasagne/layers/pool.py
new file mode 100644
index 0000000..86379ca
--- /dev/null
+++ b/lasagne/layers/pool.py
@@ -0,0 +1,639 @@
+import theano.tensor as T
+
+from .base import Layer
+from ..utils import as_tuple
+
+from theano.tensor.signal.pool import pool_2d
+
+
+__all__ = [
+    "MaxPool1DLayer",
+    "MaxPool2DLayer",
+    "Pool1DLayer",
+    "Pool2DLayer",
+    "Upscale1DLayer",
+    "Upscale2DLayer",
+    "FeaturePoolLayer",
+    "FeatureWTALayer",
+    "GlobalPoolLayer",
+]
+
+
+def pool_output_length(input_length, pool_size, stride, pad, ignore_border):
+    """
+    Compute the output length of a pooling operator
+    along a single dimension.
+
+    Parameters
+    ----------
+    input_length : integer
+        The length of the input in the pooling dimension
+    pool_size : integer
+        The length of the pooling region
+    stride : integer
+        The stride between successive pooling regions
+    pad : integer
+        The number of elements to be added to the input on each side.
+    ignore_border : bool
+        If ``True``, partial pooling regions will be ignored.
+        Must be ``True`` if ``pad != 0``.
+
+    Returns
+    -------
+    output_length
+        * None if either input is None.
+        * Computed length of the pooling operator otherwise.
+
+    Notes
+    -----
+    When ``ignore_border == True``, this is given by the number of full
+    pooling regions that fit in the padded input length,
+    divided by the stride (rounding down).
+
+    If ``ignore_border == False``, a single partial pooling region is
+    appended if at least one input element would be left uncovered otherwise.
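+
+    Examples
+    --------
+    A small worked illustration of the formulas above (the numbers are
+    arbitrary and follow directly from the definitions):
+
+    >>> from lasagne.layers.pool import pool_output_length
+    >>> pool_output_length(10, pool_size=3, stride=2, pad=0,
+    ...                    ignore_border=True)
+    4
+    >>> pool_output_length(10, pool_size=3, stride=2, pad=0,
+    ...                    ignore_border=False)
+    5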
+    """
+    if input_length is None or pool_size is None:
+        return None
+
+    if ignore_border:
+        output_length = input_length + 2 * pad - pool_size + 1
+        output_length = (output_length + stride - 1) // stride
+
+    else:
+        # output length calculation taken from:
+        # https://github.com/Theano/Theano/blob/master/theano/tensor/signal/downsample.py
+        assert pad == 0
+
+        if stride >= pool_size:
+            output_length = (input_length + stride - 1) // stride
+        else:
+            output_length = max(
+                0, (input_length - pool_size + stride - 1) // stride) + 1
+
+    return output_length
+
+
+class Pool1DLayer(Layer):
+    """
+    1D pooling layer
+
+    Performs 1D mean or max-pooling over the trailing axis
+    of a 3D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region. If an iterable, it should have a
+        single element.
+
+    stride : integer, iterable or ``None``
+        The stride between successive pooling regions.
+        If ``None`` then ``stride == pool_size``.
+
+    pad : integer or iterable
+        The number of elements to be added to the input on each side.
+        Must be less than stride.
+
+    ignore_border : bool
+        If ``True``, partial pooling regions will be ignored.
+        Must be ``True`` if ``pad != 0``.
+
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        Pooling mode: max-pooling or mean-pooling including/excluding zeros
+        from partially padded pooling regions. Default is 'max'.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    See Also
+    --------
+    MaxPool1DLayer : Shortcut for max pooling layer.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
+    Using ``ignore_border=False`` prevents Theano from using cuDNN for the
+    operation, so it will fall back to a slower implementation.
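+
+    Examples
+    --------
+    A minimal shape-only sketch; the input shape is arbitrary and chosen for
+    illustration only:
+
+    >>> from lasagne.layers import InputLayer, Pool1DLayer
+    >>> l_in = InputLayer((16, 3, 40))
+    >>> l_pool = Pool1DLayer(l_in, pool_size=2)
+    >>> l_pool.output_shape
+    (16, 3, 20)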
+    """
+    def __init__(self, incoming, pool_size, stride=None, pad=0,
+                 ignore_border=True, mode='max', **kwargs):
+        super(Pool1DLayer, self).__init__(incoming, **kwargs)
+
+        if len(self.input_shape) != 3:
+            raise ValueError("Tried to create a 1D pooling layer with "
+                             "input shape %r. Expected 3 input dimensions "
+                             "(batchsize, channels, 1 spatial dimension)."
+                             % (self.input_shape,))
+
+        self.pool_size = as_tuple(pool_size, 1)
+        self.stride = self.pool_size if stride is None else as_tuple(stride, 1)
+        self.pad = as_tuple(pad, 1)
+        self.ignore_border = ignore_border
+        self.mode = mode
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+
+        output_shape[-1] = pool_output_length(input_shape[-1],
+                                              pool_size=self.pool_size[0],
+                                              stride=self.stride[0],
+                                              pad=self.pad[0],
+                                              ignore_border=self.ignore_border,
+                                              )
+
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        input_4d = T.shape_padright(input, 1)
+
+        pooled = pool_2d(input_4d,
+                         ds=(self.pool_size[0], 1),
+                         st=(self.stride[0], 1),
+                         ignore_border=self.ignore_border,
+                         padding=(self.pad[0], 0),
+                         mode=self.mode,
+                         )
+        return pooled[:, :, :, 0]
+
+
+class Pool2DLayer(Layer):
+    """
+    2D pooling layer
+
+    Performs 2D mean or max-pooling over the two trailing axes
+    of a 4D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region in each dimension.  If an integer, it
+        is promoted to a square pooling region. If an iterable, it should have
+        two elements.
+
+    stride : integer, iterable or ``None``
+        The strides between successive pooling regions in each dimension.
+        If ``None`` then ``stride = pool_size``.
+
+    pad : integer or iterable
+        Number of elements to be added on each side of the input
+        in each dimension. Each value must be less than
+        the corresponding stride.
+
+    ignore_border : bool
+        If ``True``, partial pooling regions will be ignored.
+        Must be ``True`` if ``pad != (0, 0)``.
+
+    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
+        Pooling mode: max-pooling or mean-pooling including/excluding zeros
+        from partially padded pooling regions. Default is 'max'.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    See Also
+    --------
+    MaxPool2DLayer : Shortcut for max pooling layer.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
+    Using ``ignore_border=False`` prevents Theano from using cuDNN for the
+    operation, so it will fall back to a slower implementation.
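+
+    Examples
+    --------
+    A minimal shape-only sketch; the input shape is arbitrary and chosen for
+    illustration only:
+
+    >>> from lasagne.layers import InputLayer, Pool2DLayer
+    >>> l_in = InputLayer((None, 3, 32, 32))
+    >>> l_pool = Pool2DLayer(l_in, pool_size=(2, 2))
+    >>> l_pool.output_shape
+    (None, 3, 16, 16)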
+    """
+
+    def __init__(self, incoming, pool_size, stride=None, pad=(0, 0),
+                 ignore_border=True, mode='max', **kwargs):
+        super(Pool2DLayer, self).__init__(incoming, **kwargs)
+
+        self.pool_size = as_tuple(pool_size, 2)
+
+        if len(self.input_shape) != 4:
+            raise ValueError("Tried to create a 2D pooling layer with "
+                             "input shape %r. Expected 4 input dimensions "
+                             "(batchsize, channels, 2 spatial dimensions)."
+                             % (self.input_shape,))
+
+        if stride is None:
+            self.stride = self.pool_size
+        else:
+            self.stride = as_tuple(stride, 2)
+
+        self.pad = as_tuple(pad, 2)
+
+        self.ignore_border = ignore_border
+        self.mode = mode
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+
+        output_shape[2] = pool_output_length(input_shape[2],
+                                             pool_size=self.pool_size[0],
+                                             stride=self.stride[0],
+                                             pad=self.pad[0],
+                                             ignore_border=self.ignore_border,
+                                             )
+
+        output_shape[3] = pool_output_length(input_shape[3],
+                                             pool_size=self.pool_size[1],
+                                             stride=self.stride[1],
+                                             pad=self.pad[1],
+                                             ignore_border=self.ignore_border,
+                                             )
+
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        pooled = pool_2d(input,
+                         ds=self.pool_size,
+                         st=self.stride,
+                         ignore_border=self.ignore_border,
+                         padding=self.pad,
+                         mode=self.mode,
+                         )
+        return pooled
+
+
+class MaxPool1DLayer(Pool1DLayer):
+    """
+    1D max-pooling layer
+
+    Performs 1D max-pooling over the trailing axis of a 3D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region. If an iterable, it should have a
+        single element.
+
+    stride : integer, iterable or ``None``
+        The stride between successive pooling regions.
+        If ``None`` then ``stride == pool_size``.
+
+    pad : integer or iterable
+        The number of elements to be added to the input on each side.
+        Must be less than stride.
+
+    ignore_border : bool
+        If ``True``, partial pooling regions will be ignored.
+        Must be ``True`` if ``pad != 0``.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
+    Using ``ignore_border=False`` prevents Theano from using cuDNN for the
+    operation, so it will fall back to a slower implementation.
+    """
+
+    def __init__(self, incoming, pool_size, stride=None, pad=0,
+                 ignore_border=True, **kwargs):
+        super(MaxPool1DLayer, self).__init__(incoming,
+                                             pool_size,
+                                             stride,
+                                             pad,
+                                             ignore_border,
+                                             mode='max',
+                                             **kwargs)
+
+
+class MaxPool2DLayer(Pool2DLayer):
+    """
+    2D max-pooling layer
+
+    Performs 2D max-pooling over the two trailing axes of a 4D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer or iterable
+        The length of the pooling region in each dimension.  If an integer, it
+        is promoted to a square pooling region. If an iterable, it should have
+        two elements.
+
+    stride : integer, iterable or ``None``
+        The strides between successive pooling regions in each dimension.
+        If ``None`` then ``stride = pool_size``.
+
+    pad : integer or iterable
+        Number of elements to be added on each side of the input
+        in each dimension. Each value must be less than
+        the corresponding stride.
+
+    ignore_border : bool
+        If ``True``, partial pooling regions will be ignored.
+        Must be ``True`` if ``pad != (0, 0)``.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    The value used to pad the input is chosen to be less than
+    the minimum of the input, so that the output of each pooling region
+    always corresponds to some element in the unpadded input region.
+
+    Using ``ignore_border=False`` prevents Theano from using cuDNN for the
+    operation, so it will fall back to a slower implementation.
+    """
+
+    def __init__(self, incoming, pool_size, stride=None, pad=(0, 0),
+                 ignore_border=True, **kwargs):
+        super(MaxPool2DLayer, self).__init__(incoming,
+                                             pool_size,
+                                             stride,
+                                             pad,
+                                             ignore_border,
+                                             mode='max',
+                                             **kwargs)
+
+# TODO: add reshape-based implementation to MaxPool*DLayer
+# TODO: add MaxPool3DLayer
+
+
+class Upscale1DLayer(Layer):
+    """
+    1D upscaling layer
+
+    Performs 1D upscaling over the trailing axis of a 3D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    scale_factor : integer or iterable
+        The scale factor. If an iterable, it should have one element.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+    """
+
+    def __init__(self, incoming, scale_factor, **kwargs):
+        super(Upscale1DLayer, self).__init__(incoming, **kwargs)
+
+        self.scale_factor = as_tuple(scale_factor, 1)
+
+        if self.scale_factor[0] < 1:
+            raise ValueError('Scale factor must be >= 1, not {0}'.format(
+                self.scale_factor))
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+        if output_shape[2] is not None:
+            output_shape[2] *= self.scale_factor[0]
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        a, = self.scale_factor
+        upscaled = input
+        if a > 1:
+            upscaled = T.extra_ops.repeat(upscaled, a, 2)
+        return upscaled
+
+
+class Upscale2DLayer(Layer):
+    """
+    2D upscaling layer
+
+    Performs 2D upscaling over the two trailing axes of a 4D input tensor.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    scale_factor : integer or iterable
+        The scale factor in each dimension. If an integer, it is promoted to
+        a square scale factor region. If an iterable, it should have two
+        elements.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
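+
+    Examples
+    --------
+    A minimal shape-only sketch; the input shape is arbitrary and chosen for
+    illustration only:
+
+    >>> from lasagne.layers import InputLayer, Upscale2DLayer
+    >>> l_in = InputLayer((None, 3, 16, 16))
+    >>> l_up = Upscale2DLayer(l_in, scale_factor=2)
+    >>> l_up.output_shape
+    (None, 3, 32, 32)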
+    """
+
+    def __init__(self, incoming, scale_factor, **kwargs):
+        super(Upscale2DLayer, self).__init__(incoming, **kwargs)
+
+        self.scale_factor = as_tuple(scale_factor, 2)
+
+        if self.scale_factor[0] < 1 or self.scale_factor[1] < 1:
+            raise ValueError('Scale factor must be >= 1, not {0}'.format(
+                self.scale_factor))
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # copy / convert to mutable list
+        if output_shape[2] is not None:
+            output_shape[2] *= self.scale_factor[0]
+        if output_shape[3] is not None:
+            output_shape[3] *= self.scale_factor[1]
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        a, b = self.scale_factor
+        upscaled = input
+        if b > 1:
+            upscaled = T.extra_ops.repeat(upscaled, b, 3)
+        if a > 1:
+            upscaled = T.extra_ops.repeat(upscaled, a, 2)
+        return upscaled
+
+
+class FeaturePoolLayer(Layer):
+    """
+    lasagne.layers.FeaturePoolLayer(incoming, pool_size, axis=1,
+    pool_function=theano.tensor.max, **kwargs)
+
+    Feature pooling layer
+
+    This layer pools across a given axis of the input. By default this is axis
+    1, which corresponds to the feature axis for :class:`DenseLayer`,
+    :class:`Conv1DLayer` and :class:`Conv2DLayer`. The layer can be used to
+    implement maxout.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer
+        the size of the pooling regions, i.e. the number of features / feature
+        maps to be pooled together.
+
+    axis : integer
+        the axis along which to pool. The default value of ``1`` works
+        for :class:`DenseLayer`, :class:`Conv1DLayer` and :class:`Conv2DLayer`.
+
+    pool_function : callable
+        the pooling function to use. This defaults to `theano.tensor.max`
+        (i.e. max-pooling) and can be replaced by any other aggregation
+        function.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    This layer requires that the size of the axis along which it pools is a
+    multiple of the pool size.
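+
+    Examples
+    --------
+    A minimal maxout-style sketch (the layer sizes are arbitrary and chosen
+    for illustration only): pooling groups of 4 features reduces 256
+    features to 64.
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, FeaturePoolLayer
+    >>> l_in = InputLayer((None, 100))
+    >>> l_dense = DenseLayer(l_in, num_units=256, nonlinearity=None)
+    >>> l_maxout = FeaturePoolLayer(l_dense, pool_size=4)
+    >>> l_maxout.output_shape
+    (None, 64)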
+    """
+
+    def __init__(self, incoming, pool_size, axis=1, pool_function=T.max,
+                 **kwargs):
+        super(FeaturePoolLayer, self).__init__(incoming, **kwargs)
+        self.pool_size = pool_size
+        self.axis = axis
+        self.pool_function = pool_function
+
+        num_feature_maps = self.input_shape[self.axis]
+        if num_feature_maps % self.pool_size != 0:
+            raise ValueError("Number of input feature maps (%d) is not a "
+                             "multiple of the pool size (pool_size=%d)" %
+                             (num_feature_maps, self.pool_size))
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)  # make a mutable copy
+        output_shape[self.axis] = input_shape[self.axis] // self.pool_size
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        input_shape = tuple(input.shape)
+        num_feature_maps = input_shape[self.axis]
+        num_feature_maps_out = num_feature_maps // self.pool_size
+
+        pool_shape = (input_shape[:self.axis] +
+                      (num_feature_maps_out, self.pool_size) +
+                      input_shape[self.axis+1:])
+
+        input_reshaped = input.reshape(pool_shape)
+        return self.pool_function(input_reshaped, axis=self.axis + 1)
+
+
+class FeatureWTALayer(Layer):
+    """
+    'Winner Take All' layer
+
+    This layer performs 'Winner Take All' (WTA) across feature maps: it
+    zeroes out all but the maximal activation value within each region.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_size : integer
+        the number of feature maps per region.
+
+    axis : integer
+        the axis along which the regions are formed.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
+
+    Notes
+    -----
+    This layer requires that the size of the axis along which it groups units
+    is a multiple of the pool size.
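+
+    Examples
+    --------
+    A minimal sketch (the shapes are arbitrary and chosen for illustration
+    only): within each group of 4 feature maps, all but the maximal
+    activation are zeroed out; the output shape is unchanged.
+
+    >>> from lasagne.layers import InputLayer, FeatureWTALayer
+    >>> l_in = InputLayer((None, 16, 32, 32))
+    >>> l_wta = FeatureWTALayer(l_in, pool_size=4)
+    >>> l_wta.output_shape
+    (None, 16, 32, 32)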
+    """
+
+    def __init__(self, incoming, pool_size, axis=1, **kwargs):
+        super(FeatureWTALayer, self).__init__(incoming, **kwargs)
+        self.pool_size = pool_size
+        self.axis = axis
+
+        num_feature_maps = self.input_shape[self.axis]
+        if num_feature_maps % self.pool_size != 0:
+            raise ValueError("Number of input feature maps (%d) is not a "
+                             "multiple of the region size (pool_size=%d)" %
+                             (num_feature_maps, self.pool_size))
+
+    def get_output_for(self, input, **kwargs):
+        num_feature_maps = input.shape[self.axis]
+        num_pools = num_feature_maps // self.pool_size
+
+        pool_shape = ()
+        arange_shuffle_pattern = ()
+        for k in range(self.axis):
+            pool_shape += (input.shape[k],)
+            arange_shuffle_pattern += ('x',)
+
+        pool_shape += (num_pools, self.pool_size)
+        arange_shuffle_pattern += ('x', 0)
+
+        for k in range(self.axis + 1, input.ndim):
+            pool_shape += (input.shape[k],)
+            arange_shuffle_pattern += ('x',)
+
+        input_reshaped = input.reshape(pool_shape)
+        max_indices = T.argmax(input_reshaped, axis=self.axis + 1,
+                               keepdims=True)
+
+        arange = T.arange(self.pool_size).dimshuffle(*arange_shuffle_pattern)
+        mask = T.eq(max_indices, arange).reshape(input.shape)
+
+        return input * mask
+
+
+class GlobalPoolLayer(Layer):
+    """
+    lasagne.layers.GlobalPoolLayer(incoming,
+    pool_function=theano.tensor.mean, **kwargs)
+
+    Global pooling layer
+
+    This layer pools globally across all trailing dimensions beyond the 2nd.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    pool_function : callable
+        the pooling function to use. This defaults to `theano.tensor.mean`
+        (i.e. mean-pooling) and can be replaced by any other aggregation
+        function.
+
+    **kwargs
+        Any additional keyword arguments are passed to the :class:`Layer`
+        superclass.
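+
+    Examples
+    --------
+    A minimal shape-only sketch (the input shape is arbitrary and chosen for
+    illustration only): global mean-pooling collapses all trailing spatial
+    dimensions.
+
+    >>> from lasagne.layers import InputLayer, GlobalPoolLayer
+    >>> l_in = InputLayer((None, 16, 32, 32))
+    >>> l_pool = GlobalPoolLayer(l_in)
+    >>> l_pool.output_shape
+    (None, 16)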
+    """
+
+    def __init__(self, incoming, pool_function=T.mean, **kwargs):
+        super(GlobalPoolLayer, self).__init__(incoming, **kwargs)
+        self.pool_function = pool_function
+
+    def get_output_shape_for(self, input_shape):
+        return input_shape[:2]
+
+    def get_output_for(self, input, **kwargs):
+        return self.pool_function(input.flatten(3), axis=2)
diff --git a/lasagne/layers/recurrent.py b/lasagne/layers/recurrent.py
new file mode 100644
index 0000000..0d6aed5
--- /dev/null
+++ b/lasagne/layers/recurrent.py
@@ -0,0 +1,1480 @@
+# -*- coding: utf-8 -*-
+"""
+Layers to construct recurrent networks. Recurrent layers can be used similarly
+to feed-forward layers except that the input shape is expected to be
+``(batch_size, sequence_length, num_inputs)``.   The CustomRecurrentLayer can
+also support more than one "feature" dimension (e.g. using convolutional
+connections), but for all other layers, dimensions trailing the third
+dimension are flattened.
+
+The following recurrent layers are implemented:
+
+.. currentmodule:: lasagne.layers
+
+.. autosummary::
+    :nosignatures:
+
+    CustomRecurrentLayer
+    RecurrentLayer
+    LSTMLayer
+    GRULayer
+
+For recurrent layers with gates we use a helper class to set up the parameters
+in each gate:
+
+.. autosummary::
+    :nosignatures:
+
+    Gate
+
+Please refer to that class if you need to modify initial conditions of gates.
+
+Recurrent layers and feed-forward layers can be combined in the same network
+by using a few reshape operations; please refer to the example below.
+
+Examples
+--------
+The following example demonstrates how recurrent layers can be easily mixed
+with feed-forward layers using :class:`ReshapeLayer` and how to build a
+network with variable batch size and number of time steps.
+
+>>> from lasagne.layers import *
+>>> num_inputs, num_units, num_classes = 10, 12, 5
+>>> # By setting the first two dimensions as None, we are allowing them to vary
+>>> # They correspond to batch size and sequence length, so we will be able to
+>>> # feed in batches of varying size with sequences of varying length.
+>>> l_inp = InputLayer((None, None, num_inputs))
+>>> # We can retrieve symbolic references to the input variable's shape, which
+>>> # we will later use in reshape layers.
+>>> batchsize, seqlen, _ = l_inp.input_var.shape
+>>> l_lstm = LSTMLayer(l_inp, num_units=num_units)
+>>> # In order to connect a recurrent layer to a dense layer, we need to
+>>> # flatten the first two dimensions (our "sample dimensions"); this will
+>>> # cause each time step of each sequence to be processed independently
+>>> l_shp = ReshapeLayer(l_lstm, (-1, num_units))
+>>> l_dense = DenseLayer(l_shp, num_units=num_classes)
+>>> # To reshape back to our original shape, we can use the symbolic shape
+>>> # variables we retrieved above.
+>>> l_out = ReshapeLayer(l_dense, (batchsize, seqlen, num_classes))
+"""
+import numpy as np
+import theano
+import theano.tensor as T
+from .. import nonlinearities
+from .. import init
+from ..utils import unroll_scan
+
+from .base import MergeLayer, Layer
+from .input import InputLayer
+from .dense import DenseLayer
+from . import helper
+
+__all__ = [
+    "CustomRecurrentLayer",
+    "RecurrentLayer",
+    "Gate",
+    "LSTMLayer",
+    "GRULayer"
+]
+
+
+class CustomRecurrentLayer(MergeLayer):
+    """
+    lasagne.layers.recurrent.CustomRecurrentLayer(incoming, input_to_hidden,
+    hidden_to_hidden, nonlinearity=lasagne.nonlinearities.rectify,
+    hid_init=lasagne.init.Constant(0.), backwards=False,
+    learn_init=False, gradient_steps=-1, grad_clipping=0,
+    unroll_scan=False, precompute_input=True, mask_input=None,
+    only_return_final=False, **kwargs)
+
+    A layer which implements a recurrent connection.
+
+    This layer allows you to specify custom input-to-hidden and
+    hidden-to-hidden connections by instantiating :class:`lasagne.layers.Layer`
+    instances and passing them on initialization.  Note that these connections
+    can consist of multiple layers chained together.  The output shape for the
+    provided input-to-hidden and hidden-to-hidden connections must be the same.
+    If you are looking for a standard, densely-connected recurrent layer,
+    please see :class:`RecurrentLayer`.  The output is computed by
+
+    .. math ::
+        h_t = \sigma(f_i(x_t) + f_h(h_{t-1}))
+
+    Parameters
+    ----------
+    incoming : a :class:`lasagne.layers.Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    input_to_hidden : :class:`lasagne.layers.Layer`
+        :class:`lasagne.layers.Layer` instance which connects input to the
+        hidden state (:math:`f_i`).  This layer may be connected to a chain of
+        layers, which must end in a :class:`lasagne.layers.InputLayer` with the
+        same input shape as `incoming`, except for the first dimension: When
+        ``precompute_input == True`` (the default), it must be
+        ``incoming.output_shape[0]*incoming.output_shape[1]`` or ``None``; when
+        ``precompute_input == False``, it must be ``incoming.output_shape[0]``
+        or ``None``.
+    hidden_to_hidden : :class:`lasagne.layers.Layer`
+        Layer which connects the previous hidden state to the new state
+        (:math:`f_h`).  This layer may be connected to a chain of layers, which
+        must end in a :class:`lasagne.layers.InputLayer` with the same input
+        shape as `hidden_to_hidden`'s output shape.
+    nonlinearity : callable or None
+        Nonlinearity to apply when computing new state (:math:`\sigma`). If
+        None is provided, no nonlinearity will be applied.
+    hid_init : callable, np.ndarray, theano.shared or :class:`Layer`
+        Initializer for initial hidden state (:math:`h_0`).
+    backwards : bool
+        If True, process the sequence backwards and then reverse the
+        output again such that the output from the layer is always
+        from :math:`x_1` to :math:`x_n`.
+    learn_init : bool
+        If True, initial hidden values are learned.
+    gradient_steps : int
+        Number of timesteps to include in the backpropagated gradient.
+        If -1, backpropagate through the entire sequence.
+    grad_clipping : float
+        If nonzero, the gradient messages are clipped to the given value during
+        the backward pass.  See [1]_ (p. 6) for further explanation.
+    unroll_scan : bool
+        If True the recursion is unrolled instead of using scan. For some
+        graphs this gives a significant speed up but it might also consume
+        more memory. When `unroll_scan` is True, backpropagation always
+        includes the full sequence, so `gradient_steps` must be set to -1 and
+        the input sequence length must be known at compile time (i.e., cannot
+        be given as None).
+    precompute_input : bool
+        If True, precompute input_to_hid before iterating through
+        the sequence. This can result in a speedup at the expense of
+        an increase in memory usage.
+    mask_input : :class:`lasagne.layers.Layer`
+        Layer which allows for a sequence mask to be input, for when sequences
+        are of variable length.  Default `None`, which means no mask will be
+        supplied (i.e. all sequences are of the same length).
+    only_return_final : bool
+        If True, only return the final sequential output (e.g. for tasks where
+        a single target value for the entire sequence is desired).  In this
+        case, Theano makes an optimization which saves memory.
+
+    Examples
+    --------
+
+    The following example constructs a simple `CustomRecurrentLayer` which
+    has dense input-to-hidden and hidden-to-hidden connections.
+
+    >>> import lasagne
+    >>> n_batch, n_steps, n_in = (2, 3, 4)
+    >>> n_hid = 5
+    >>> l_in = lasagne.layers.InputLayer((n_batch, n_steps, n_in))
+    >>> l_in_hid = lasagne.layers.DenseLayer(
+    ...     lasagne.layers.InputLayer((None, n_in)), n_hid)
+    >>> l_hid_hid = lasagne.layers.DenseLayer(
+    ...     lasagne.layers.InputLayer((None, n_hid)), n_hid)
+    >>> l_rec = lasagne.layers.CustomRecurrentLayer(l_in, l_in_hid, l_hid_hid)
+
+    The CustomRecurrentLayer can also support "convolutional recurrence", as is
+    demonstrated below.
+
+    >>> n_batch, n_steps, n_channels, width, height = (2, 3, 4, 5, 6)
+    >>> n_out_filters = 7
+    >>> filter_shape = (3, 3)
+    >>> l_in = lasagne.layers.InputLayer(
+    ...     (n_batch, n_steps, n_channels, width, height))
+    >>> l_in_to_hid = lasagne.layers.Conv2DLayer(
+    ...     lasagne.layers.InputLayer((None, n_channels, width, height)),
+    ...     n_out_filters, filter_shape, pad='same')
+    >>> l_hid_to_hid = lasagne.layers.Conv2DLayer(
+    ...     lasagne.layers.InputLayer(l_in_to_hid.output_shape),
+    ...     n_out_filters, filter_shape, pad='same')
+    >>> l_rec = lasagne.layers.CustomRecurrentLayer(
+    ...     l_in, l_in_to_hid, l_hid_to_hid)
+
+    References
+    ----------
+    .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
+           arXiv preprint arXiv:1308.0850 (2013).
+    """
+    def __init__(self, incoming, input_to_hidden, hidden_to_hidden,
+                 nonlinearity=nonlinearities.rectify,
+                 hid_init=init.Constant(0.),
+                 backwards=False,
+                 learn_init=False,
+                 gradient_steps=-1,
+                 grad_clipping=0,
+                 unroll_scan=False,
+                 precompute_input=True,
+                 mask_input=None,
+                 only_return_final=False,
+                 **kwargs):
+
+        # This layer inherits from a MergeLayer, because it can have three
+        # inputs - the layer input, the mask and the initial hidden state.  We
+        # will just provide the layer input as incomings, unless a mask input
+        # or initial hidden state was provided.
+        incomings = [incoming]
+        self.mask_incoming_index = -1
+        self.hid_init_incoming_index = -1
+        if mask_input is not None:
+            incomings.append(mask_input)
+            self.mask_incoming_index = len(incomings)-1
+        if isinstance(hid_init, Layer):
+            incomings.append(hid_init)
+            self.hid_init_incoming_index = len(incomings)-1
+
+        super(CustomRecurrentLayer, self).__init__(incomings, **kwargs)
+
+        self.input_to_hidden = input_to_hidden
+        self.hidden_to_hidden = hidden_to_hidden
+        self.learn_init = learn_init
+        self.backwards = backwards
+        self.gradient_steps = gradient_steps
+        self.grad_clipping = grad_clipping
+        self.unroll_scan = unroll_scan
+        self.precompute_input = precompute_input
+        self.only_return_final = only_return_final
+
+        if unroll_scan and gradient_steps != -1:
+            raise ValueError(
+                "Gradient steps must be -1 when unroll_scan is true.")
+
+        # Retrieve the dimensionality of the incoming layer
+        input_shape = self.input_shapes[0]
+
+        if unroll_scan and input_shape[1] is None:
+            raise ValueError("Input sequence length cannot be specified as "
+                             "None when unroll_scan is True")
+
+        # Check that the input_to_hidden connection can appropriately handle
+        # a first dimension of input_shape[0]*input_shape[1] when we will
+        # precompute the input dot product
+        if (self.precompute_input and
+                input_to_hidden.output_shape[0] is not None and
+                input_shape[0] is not None and
+                input_shape[1] is not None and
+                (input_to_hidden.output_shape[0] !=
+                 input_shape[0]*input_shape[1])):
+            raise ValueError(
+                'When precompute_input == True, '
+                'input_to_hidden.output_shape[0] must equal '
+                'incoming.output_shape[0]*incoming.output_shape[1] '
+                '(i.e. batch_size*sequence_length) or be None but '
+                'input_to_hidden.output_shape[0] = {} and '
+                'incoming.output_shape[0]*incoming.output_shape[1] = '
+                '{}'.format(input_to_hidden.output_shape[0],
+                            input_shape[0]*input_shape[1]))
+
+        # Check that the first dimension of input_to_hidden and
+        # hidden_to_hidden's outputs match when we won't precompute the input
+        # dot product
+        if (not self.precompute_input and
+                input_to_hidden.output_shape[0] is not None and
+                hidden_to_hidden.output_shape[0] is not None and
+                (input_to_hidden.output_shape[0] !=
+                 hidden_to_hidden.output_shape[0])):
+            raise ValueError(
+                'When precompute_input == False, '
+                'input_to_hidden.output_shape[0] must equal '
+                'hidden_to_hidden.output_shape[0] but '
+                'input_to_hidden.output_shape[0] = {} and '
+                'hidden_to_hidden.output_shape[0] = {}'.format(
+                    input_to_hidden.output_shape[0],
+                    hidden_to_hidden.output_shape[0]))
+
+        # Check that input_to_hidden and hidden_to_hidden output shapes match,
+        # but don't check a dimension if it's None for either shape
+        if not all(s1 is None or s2 is None or s1 == s2
+                   for s1, s2 in zip(input_to_hidden.output_shape[1:],
+                                     hidden_to_hidden.output_shape[1:])):
+            raise ValueError("The output shape for input_to_hidden and "
+                             "hidden_to_hidden must be equal after the first "
+                             "dimension, but input_to_hidden.output_shape={} "
+                             "and hidden_to_hidden.output_shape={}".format(
+                                 input_to_hidden.output_shape,
+                                 hidden_to_hidden.output_shape))
+
+        # Check that input_to_hidden's output shape is the same as
+        # hidden_to_hidden's input shape but don't check a dimension if it's
+        # None for either shape
+        if not all(s1 is None or s2 is None or s1 == s2
+                   for s1, s2 in zip(input_to_hidden.output_shape[1:],
+                                     hidden_to_hidden.input_shape[1:])):
+            raise ValueError("The output shape for input_to_hidden "
+                             "must be equal to the input shape of "
+                             "hidden_to_hidden after the first dimension, but "
+                             "input_to_hidden.output_shape={} and "
+                             "hidden_to_hidden.input_shape={}".format(
+                                 input_to_hidden.output_shape,
+                                 hidden_to_hidden.input_shape))
+
+        if nonlinearity is None:
+            self.nonlinearity = nonlinearities.identity
+        else:
+            self.nonlinearity = nonlinearity
+
+        # Initialize hidden state
+        if isinstance(hid_init, Layer):
+            self.hid_init = hid_init
+        else:
+            self.hid_init = self.add_param(
+                hid_init, (1,) + hidden_to_hidden.output_shape[1:],
+                name="hid_init", trainable=learn_init, regularizable=False)
+
+    def get_params(self, **tags):
+        # Get all parameters from this layer, the master layer
+        params = super(CustomRecurrentLayer, self).get_params(**tags)
+        # Combine with all parameters from the child layers
+        params += helper.get_all_params(self.input_to_hidden, **tags)
+        params += helper.get_all_params(self.hidden_to_hidden, **tags)
+        return params
+
+    def get_output_shape_for(self, input_shapes):
+        # The shape of the input to this layer will be the first element
+        # of input_shapes, whether or not a mask input is being used.
+        input_shape = input_shapes[0]
+        # When only_return_final is true, the second (sequence step) dimension
+        # will be flattened
+        if self.only_return_final:
+            return (input_shape[0],) + self.hidden_to_hidden.output_shape[1:]
+        # Otherwise, the shape will be (n_batch, n_steps, trailing_dims...)
+        else:
+            return ((input_shape[0], input_shape[1]) +
+                    self.hidden_to_hidden.output_shape[1:])
+
+    def get_output_for(self, inputs, **kwargs):
+        """
+        Compute this layer's output function given a symbolic input variable.
+
+        Parameters
+        ----------
+        inputs : list of theano.TensorType
+            `inputs[0]` should always be the symbolic input variable.  When
+            this layer has a mask input (i.e. was instantiated with
+            `mask_input != None`, indicating that the lengths of sequences in
+            each batch vary), `inputs` should have length 2, where `inputs[1]`
+            is the `mask`.  The `mask` should be supplied as a Theano variable
+            denoting whether each time step in each sequence in the batch is
+            part of the sequence or not.  `mask` should be a matrix of shape
+            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
+            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
+            of sequence i)``. When the hidden state of this layer is to be
+            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
+            should have length at least 2, and `inputs[-1]` is the hidden state
+            to prefill with.
+
+        Returns
+        -------
+        layer_output : theano.TensorType
+            Symbolic output variable.
+        """
+        # Retrieve the layer input
+        input = inputs[0]
+        # Retrieve the mask when it is supplied
+        mask = None
+        hid_init = None
+        if self.mask_incoming_index > 0:
+            mask = inputs[self.mask_incoming_index]
+        if self.hid_init_incoming_index > 0:
+            hid_init = inputs[self.hid_init_incoming_index]
+
+        # Input should be provided as (n_batch, n_time_steps, n_features)
+        # but scan requires the iterable dimension to be first
+        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
+        input = input.dimshuffle(1, 0, *range(2, input.ndim))
+        seq_len, num_batch = input.shape[0], input.shape[1]
+
+        if self.precompute_input:
+            # Because the input is given for all time steps, we can precompute
+            # the inputs to hidden before scanning. First we need to reshape
+            # from (seq_len, batch_size, trailing dimensions...) to
+            # (seq_len*batch_size, trailing dimensions...)
+            # This strange use of a generator in a tuple was because
+            # input.shape[2:] was raising a Theano error
+            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
+            input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
+            input = helper.get_output(
+                self.input_to_hidden, input, **kwargs)
+
+            # Reshape back to (seq_len, batch_size, trailing dimensions...)
+            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
+            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)
+
+        # We will always pass the hidden-to-hidden layer params to step
+        non_seqs = helper.get_all_params(self.hidden_to_hidden)
+        # When we are not precomputing the input, we also need to pass the
+        # input-to-hidden parameters to step
+        if not self.precompute_input:
+            non_seqs += helper.get_all_params(self.input_to_hidden)
+
+        # Create single recurrent computation step function
+        def step(input_n, hid_previous, *args):
+            # Compute the hidden-to-hidden activation
+            hid_pre = helper.get_output(
+                self.hidden_to_hidden, hid_previous, **kwargs)
+
+            # If the dot product is precomputed then add it, otherwise
+            # calculate the input_to_hidden values and add them
+            if self.precompute_input:
+                hid_pre += input_n
+            else:
+                hid_pre += helper.get_output(
+                    self.input_to_hidden, input_n, **kwargs)
+
+            # Clip gradients
+            if self.grad_clipping:
+                hid_pre = theano.gradient.grad_clip(
+                    hid_pre, -self.grad_clipping, self.grad_clipping)
+
+            return self.nonlinearity(hid_pre)
+
+        def step_masked(input_n, mask_n, hid_previous, *args):
+            # Skip over any input with mask 0 by copying the previous
+            # hidden state; proceed normally for any input with mask 1.
+            hid = step(input_n, hid_previous, *args)
+            hid_out = T.switch(mask_n, hid, hid_previous)
+            return [hid_out]
+
+        if mask is not None:
+            mask = mask.dimshuffle(1, 0, 'x')
+            sequences = [input, mask]
+            step_fun = step_masked
+        else:
+            sequences = input
+            step_fun = step
+
+        if not isinstance(self.hid_init, Layer):
+            # The code below simply repeats self.hid_init num_batch times in
+            # its first dimension.  Turns out using a dot product and a
+            # dimshuffle is faster than T.repeat.
+            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
+                        [0, self.hid_init.ndim - 1])
+            hid_init = T.dot(T.ones((num_batch, 1)),
+                             self.hid_init.dimshuffle(dot_dims))
+
+        if self.unroll_scan:
+            # Retrieve the dimensionality of the incoming layer
+            input_shape = self.input_shapes[0]
+            # Explicitly unroll the recurrence instead of using scan
+            hid_out = unroll_scan(
+                fn=step_fun,
+                sequences=sequences,
+                outputs_info=[hid_init],
+                go_backwards=self.backwards,
+                non_sequences=non_seqs,
+                n_steps=input_shape[1])[0]
+        else:
+            # Scan op iterates over first dimension of input and repeatedly
+            # applies the step function
+            hid_out = theano.scan(
+                fn=step_fun,
+                sequences=sequences,
+                go_backwards=self.backwards,
+                outputs_info=[hid_init],
+                non_sequences=non_seqs,
+                truncate_gradient=self.gradient_steps,
+                strict=True)[0]
+
+        # When it is requested that we only return the final sequence step,
+        # we need to slice it out immediately after scan is applied
+        if self.only_return_final:
+            hid_out = hid_out[-1]
+        else:
+            # dimshuffle back to (n_batch, n_time_steps, n_features)
+            hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))
+
+            # if scan is backward reverse the output
+            if self.backwards:
+                hid_out = hid_out[:, ::-1]
+
+        return hid_out
+
+
+class RecurrentLayer(CustomRecurrentLayer):
+    """
+    lasagne.layers.recurrent.RecurrentLayer(incoming, num_units,
+    W_in_to_hid=lasagne.init.Uniform(), W_hid_to_hid=lasagne.init.Uniform(),
+    b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.rectify,
+    hid_init=lasagne.init.Constant(0.), backwards=False, learn_init=False,
+    gradient_steps=-1, grad_clipping=0, unroll_scan=False,
+    precompute_input=True, mask_input=None, only_return_final=False, **kwargs)
+
+    Dense recurrent neural network (RNN) layer
+
+    A "vanilla" RNN layer, which has dense input-to-hidden and
+    hidden-to-hidden connections.  The output is computed as
+
+    .. math ::
+        h_t = \sigma(x_t W_x + h_{t-1} W_h + b)
+
+    Parameters
+    ----------
+    incoming : a :class:`lasagne.layers.Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    num_units : int
+        Number of hidden units in the layer.
+    W_in_to_hid : Theano shared variable, numpy array or callable
+        Initializer for input-to-hidden weight matrix (:math:`W_x`).
+    W_hid_to_hid : Theano shared variable, numpy array or callable
+        Initializer for hidden-to-hidden weight matrix (:math:`W_h`).
+    b : Theano shared variable, numpy array, callable or None
+        Initializer for bias vector (:math:`b`). If None is provided there will
+        be no bias.
+    nonlinearity : callable or None
+        Nonlinearity to apply when computing new state (:math:`\sigma`). If
+        None is provided, no nonlinearity will be applied.
+    hid_init : callable, np.ndarray, theano.shared or :class:`Layer`
+        Initializer for initial hidden state (:math:`h_0`).
+    backwards : bool
+        If True, process the sequence backwards and then reverse the
+        output again such that the output from the layer is always
+        from :math:`x_1` to :math:`x_n`.
+    learn_init : bool
+        If True, initial hidden values are learned.
+    gradient_steps : int
+        Number of timesteps to include in the backpropagated gradient.
+        If -1, backpropagate through the entire sequence.
+    grad_clipping : float
+        If nonzero, the gradient messages are clipped to the given value during
+        the backward pass.  See [1]_ (p. 6) for further explanation.
+    unroll_scan : bool
+        If True the recursion is unrolled instead of using scan. For some
+        graphs this gives a significant speed up but it might also consume
+        more memory. When `unroll_scan` is True, backpropagation always
+        includes the full sequence, so `gradient_steps` must be set to -1 and
+        the input sequence length must be known at compile time (i.e., cannot
+        be given as None).
+    precompute_input : bool
+        If True, precompute input_to_hid before iterating through
+        the sequence. This can result in a speedup at the expense of
+        an increase in memory usage.
+    mask_input : :class:`lasagne.layers.Layer`
+        Layer which allows for a sequence mask to be input, for when sequences
+        are of variable length.  Default `None`, which means no mask will be
+        supplied (i.e. all sequences are of the same length).
+    only_return_final : bool
+        If True, only return the final sequential output (e.g. for tasks where
+        a single target value for the entire sequence is desired).  In this
+        case, Theano makes an optimization which saves memory.
+
+    References
+    ----------
+    .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
+           arXiv preprint arXiv:1308.0850 (2013).
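+
+    Examples
+    --------
+    A minimal sketch (the sizes are arbitrary and chosen for illustration
+    only) of a vanilla RNN over sequences of 4-dimensional inputs:
+
+    >>> import lasagne
+    >>> l_in = lasagne.layers.InputLayer((None, 20, 4))
+    >>> l_rnn = lasagne.layers.RecurrentLayer(l_in, num_units=8)
+    >>> l_rnn.output_shape
+    (None, 20, 8)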
+    """
+    def __init__(self, incoming, num_units,
+                 W_in_to_hid=init.Uniform(),
+                 W_hid_to_hid=init.Uniform(),
+                 b=init.Constant(0.),
+                 nonlinearity=nonlinearities.rectify,
+                 hid_init=init.Constant(0.),
+                 backwards=False,
+                 learn_init=False,
+                 gradient_steps=-1,
+                 grad_clipping=0,
+                 unroll_scan=False,
+                 precompute_input=True,
+                 mask_input=None,
+                 only_return_final=False,
+                 **kwargs):
+
+        if isinstance(incoming, tuple):
+            input_shape = incoming
+        else:
+            input_shape = incoming.output_shape
+        # Retrieve the supplied name, if it exists; otherwise use ''
+        if 'name' in kwargs:
+            basename = kwargs['name'] + '.'
+            # Create a separate version of kwargs for the contained layers
+            # which does not include 'name'
+            layer_kwargs = dict((key, arg) for key, arg in kwargs.items()
+                                if key != 'name')
+        else:
+            basename = ''
+            layer_kwargs = kwargs
+        # We will be passing the input at each time step to the dense layer,
+        # so we need to remove the second dimension (the time dimension)
+        in_to_hid = DenseLayer(InputLayer((None,) + input_shape[2:]),
+                               num_units, W=W_in_to_hid, b=b,
+                               nonlinearity=None,
+                               name=basename + 'input_to_hidden',
+                               **layer_kwargs)
+        # The hidden-to-hidden layer expects its inputs to have num_units
+        # features because it recycles the previous hidden state
+        hid_to_hid = DenseLayer(InputLayer((None, num_units)),
+                                num_units, W=W_hid_to_hid, b=None,
+                                nonlinearity=None,
+                                name=basename + 'hidden_to_hidden',
+                                **layer_kwargs)
+
+        # Make child layer parameters intuitively accessible
+        self.W_in_to_hid = in_to_hid.W
+        self.W_hid_to_hid = hid_to_hid.W
+        self.b = in_to_hid.b
+
+        # Just use the CustomRecurrentLayer with the DenseLayers we created
+        super(RecurrentLayer, self).__init__(
+            incoming, in_to_hid, hid_to_hid, nonlinearity=nonlinearity,
+            hid_init=hid_init, backwards=backwards, learn_init=learn_init,
+            gradient_steps=gradient_steps,
+            grad_clipping=grad_clipping, unroll_scan=unroll_scan,
+            precompute_input=precompute_input, mask_input=mask_input,
+            only_return_final=only_return_final, **kwargs)
+
+
+class Gate(object):
+    """
+    lasagne.layers.recurrent.Gate(W_in=lasagne.init.Normal(0.1),
+    W_hid=lasagne.init.Normal(0.1), W_cell=lasagne.init.Normal(0.1),
+    b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.sigmoid)
+
+    Simple class to hold the parameters for a gate connection.  We define
+    a gate loosely as something which computes the linear mix of two inputs,
+    optionally computes an element-wise product with a third, adds a bias, and
+    applies a nonlinearity.
+
+    Parameters
+    ----------
+    W_in : Theano shared variable, numpy array or callable
+        Initializer for input-to-gate weight matrix.
+    W_hid : Theano shared variable, numpy array or callable
+        Initializer for hidden-to-gate weight matrix.
+    W_cell : Theano shared variable, numpy array, callable, or None
+        Initializer for cell-to-gate weight vector.  If None, no cell-to-gate
+        weight vector will be stored.
+    b : Theano shared variable, numpy array or callable
+        Initializer for the gate's bias vector.
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the gate activation. If None
+        is provided, no nonlinearity will be applied.
+
+    Examples
+    --------
+    For :class:`LSTMLayer` the bias of the forget gate is often initialized to
+    a large positive value to encourage the layer to initially remember the
+    cell value; see e.g. [1]_ page 15.
+
+    >>> import lasagne
+    >>> forget_gate = Gate(b=lasagne.init.Constant(5.0))
+    >>> l_lstm = LSTMLayer((10, 20, 30), num_units=10,
+    ...                    forgetgate=forget_gate)
+
+    References
+    ----------
+    .. [1] Gers, Felix A., Jürgen Schmidhuber, and Fred Cummins. "Learning to
+           forget: Continual prediction with LSTM." Neural computation 12.10
+           (2000): 2451-2471.
+
+    """
+    def __init__(self, W_in=init.Normal(0.1), W_hid=init.Normal(0.1),
+                 W_cell=init.Normal(0.1), b=init.Constant(0.),
+                 nonlinearity=nonlinearities.sigmoid):
+        self.W_in = W_in
+        self.W_hid = W_hid
+        # Don't store a cell weight vector when W_cell is None
+        if W_cell is not None:
+            self.W_cell = W_cell
+        self.b = b
+        # For the nonlinearity, if None is supplied, use identity
+        if nonlinearity is None:
+            self.nonlinearity = nonlinearities.identity
+        else:
+            self.nonlinearity = nonlinearity
+
+
+class LSTMLayer(MergeLayer):
+    r"""
+    lasagne.layers.recurrent.LSTMLayer(incoming, num_units,
+    ingate=lasagne.layers.Gate(), forgetgate=lasagne.layers.Gate(),
+    cell=lasagne.layers.Gate(
+    W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
+    outgate=lasagne.layers.Gate(),
+    nonlinearity=lasagne.nonlinearities.tanh,
+    cell_init=lasagne.init.Constant(0.),
+    hid_init=lasagne.init.Constant(0.), backwards=False, learn_init=False,
+    peepholes=True, gradient_steps=-1, grad_clipping=0, unroll_scan=False,
+    precompute_input=True, mask_input=None, only_return_final=False, **kwargs)
+
+    A long short-term memory (LSTM) layer.
+
+    Includes optional "peephole connections" and a forget gate.  Based on the
+    definition in [1]_, which is the current common definition.  The output is
+    computed by
+
+    .. math ::
+
+        i_t &= \sigma_i(x_t W_{xi} + h_{t-1} W_{hi}
+               + w_{ci} \odot c_{t-1} + b_i)\\
+        f_t &= \sigma_f(x_t W_{xf} + h_{t-1} W_{hf}
+               + w_{cf} \odot c_{t-1} + b_f)\\
+        c_t &= f_t \odot c_{t - 1}
+               + i_t \odot \sigma_c(x_t W_{xc} + h_{t-1} W_{hc} + b_c)\\
+        o_t &= \sigma_o(x_t W_{xo} + h_{t-1} W_{ho} + w_{co} \odot c_t + b_o)\\
+        h_t &= o_t \odot \sigma_h(c_t)
+
+    Parameters
+    ----------
+    incoming : a :class:`lasagne.layers.Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    num_units : int
+        Number of hidden/cell units in the layer.
+    ingate : Gate
+        Parameters for the input gate (:math:`i_t`): :math:`W_{xi}`,
+        :math:`W_{hi}`, :math:`w_{ci}`, :math:`b_i`, and :math:`\sigma_i`.
+    forgetgate : Gate
+        Parameters for the forget gate (:math:`f_t`): :math:`W_{xf}`,
+        :math:`W_{hf}`, :math:`w_{cf}`, :math:`b_f`, and :math:`\sigma_f`.
+    cell : Gate
+        Parameters for the cell computation (:math:`c_t`): :math:`W_{xc}`,
+        :math:`W_{hc}`, :math:`b_c`, and :math:`\sigma_c`.
+    outgate : Gate
+        Parameters for the output gate (:math:`o_t`): :math:`W_{xo}`,
+        :math:`W_{ho}`, :math:`w_{co}`, :math:`b_o`, and :math:`\sigma_o`.
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the output (:math:`\sigma_h`). If
+        None is provided, no nonlinearity will be applied.
+    cell_init : callable, np.ndarray, theano.shared or :class:`Layer`
+        Initializer for initial cell state (:math:`c_0`).
+    hid_init : callable, np.ndarray, theano.shared or :class:`Layer`
+        Initializer for initial hidden state (:math:`h_0`).
+    backwards : bool
+        If True, process the sequence backwards and then reverse the
+        output again such that the output from the layer is always
+        from :math:`x_1` to :math:`x_n`.
+    learn_init : bool
+        If True, initial hidden values are learned.
+    peepholes : bool
+        If True, the LSTM uses peephole connections.
+        When False, `ingate.W_cell`, `forgetgate.W_cell` and
+        `outgate.W_cell` are ignored.
+    gradient_steps : int
+        Number of timesteps to include in the backpropagated gradient.
+        If -1, backpropagate through the entire sequence.
+    grad_clipping : float
+        If nonzero, the gradient messages are clipped to the given value during
+        the backward pass.  See [1]_ (p. 6) for further explanation.
+    unroll_scan : bool
+        If True the recursion is unrolled instead of using scan. For some
+        graphs this gives a significant speed up but it might also consume
+        more memory. When `unroll_scan` is True, backpropagation always
+        includes the full sequence, so `gradient_steps` must be set to -1 and
+        the input sequence length must be known at compile time (i.e., cannot
+        be given as None).
+    precompute_input : bool
+        If True, precompute input_to_hid before iterating through
+        the sequence. This can result in a speedup at the expense of
+        an increase in memory usage.
+    mask_input : :class:`lasagne.layers.Layer`
+        Layer which allows for a sequence mask to be input, for when sequences
+        are of variable length.  Default `None`, which means no mask will be
+        supplied (i.e. all sequences are of the same length).
+    only_return_final : bool
+        If True, only return the final sequential output (e.g. for tasks where
+        a single target value for the entire sequence is desired).  In this
+        case, Theano makes an optimization which saves memory.
+
+    References
+    ----------
+    .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
+           arXiv preprint arXiv:1308.0850 (2013).
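+
+    Examples
+    --------
+    A minimal usage sketch; the shapes and unit count below are arbitrary
+    and for illustration only.  The input is expected to have shape
+    ``(batch size, sequence length, number of features)``; an optional mask
+    layer of shape ``(batch size, sequence length)`` handles variable-length
+    sequences:
+
+    >>> from lasagne.layers import InputLayer, LSTMLayer
+    >>> l_in = InputLayer((None, 20, 30))
+    >>> l_mask = InputLayer((None, 20))
+    >>> l_lstm = LSTMLayer(l_in, num_units=10, mask_input=l_mask)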
+    """
+    def __init__(self, incoming, num_units,
+                 ingate=Gate(),
+                 forgetgate=Gate(),
+                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
+                 outgate=Gate(),
+                 nonlinearity=nonlinearities.tanh,
+                 cell_init=init.Constant(0.),
+                 hid_init=init.Constant(0.),
+                 backwards=False,
+                 learn_init=False,
+                 peepholes=True,
+                 gradient_steps=-1,
+                 grad_clipping=0,
+                 unroll_scan=False,
+                 precompute_input=True,
+                 mask_input=None,
+                 only_return_final=False,
+                 **kwargs):
+
+        # This layer inherits from a MergeLayer, because it can have four
+        # inputs - the layer input, the mask, the initial hidden state and the
+        # initial cell state. We will just provide the layer input as
+        # incomings, unless a mask input, initial hidden state or initial
+        # cell state was provided.
+        incomings = [incoming]
+        self.mask_incoming_index = -1
+        self.hid_init_incoming_index = -1
+        self.cell_init_incoming_index = -1
+        if mask_input is not None:
+            incomings.append(mask_input)
+            self.mask_incoming_index = len(incomings)-1
+        if isinstance(hid_init, Layer):
+            incomings.append(hid_init)
+            self.hid_init_incoming_index = len(incomings)-1
+        if isinstance(cell_init, Layer):
+            incomings.append(cell_init)
+            self.cell_init_incoming_index = len(incomings)-1
+
+        # Initialize parent layer
+        super(LSTMLayer, self).__init__(incomings, **kwargs)
+
+        # If the provided nonlinearity is None, make it linear
+        if nonlinearity is None:
+            self.nonlinearity = nonlinearities.identity
+        else:
+            self.nonlinearity = nonlinearity
+
+        self.learn_init = learn_init
+        self.num_units = num_units
+        self.backwards = backwards
+        self.peepholes = peepholes
+        self.gradient_steps = gradient_steps
+        self.grad_clipping = grad_clipping
+        self.unroll_scan = unroll_scan
+        self.precompute_input = precompute_input
+        self.only_return_final = only_return_final
+
+        if unroll_scan and gradient_steps != -1:
+            raise ValueError(
+                "Gradient steps must be -1 when unroll_scan is true.")
+
+        # Retrieve the dimensionality of the incoming layer
+        input_shape = self.input_shapes[0]
+
+        if unroll_scan and input_shape[1] is None:
+            raise ValueError("Input sequence length cannot be specified as "
+                             "None when unroll_scan is True")
+
+        num_inputs = np.prod(input_shape[2:])
+
+        def add_gate_params(gate, gate_name):
+            """ Convenience function for adding layer parameters from a Gate
+            instance. """
+            return (self.add_param(gate.W_in, (num_inputs, num_units),
+                                   name="W_in_to_{}".format(gate_name)),
+                    self.add_param(gate.W_hid, (num_units, num_units),
+                                   name="W_hid_to_{}".format(gate_name)),
+                    self.add_param(gate.b, (num_units,),
+                                   name="b_{}".format(gate_name),
+                                   regularizable=False),
+                    gate.nonlinearity)
+
+        # Add in parameters from the supplied Gate instances
+        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
+         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')
+
+        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
+         self.nonlinearity_forgetgate) = add_gate_params(forgetgate,
+                                                         'forgetgate')
+
+        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
+         self.nonlinearity_cell) = add_gate_params(cell, 'cell')
+
+        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
+         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')
+
+        # If peephole (cell to gate) connections were enabled, initialize
+        # peephole connections.  These are elementwise products with the cell
+        # state, so they are represented as vectors.
+        if self.peepholes:
+            self.W_cell_to_ingate = self.add_param(
+                ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
+
+            self.W_cell_to_forgetgate = self.add_param(
+                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
+
+            self.W_cell_to_outgate = self.add_param(
+                outgate.W_cell, (num_units, ), name="W_cell_to_outgate")
+
+        # Setup initial values for the cell and the hidden units
+        if isinstance(cell_init, Layer):
+            self.cell_init = cell_init
+        else:
+            self.cell_init = self.add_param(
+                cell_init, (1, num_units), name="cell_init",
+                trainable=learn_init, regularizable=False)
+
+        if isinstance(hid_init, Layer):
+            self.hid_init = hid_init
+        else:
+            self.hid_init = self.add_param(
+                hid_init, (1, self.num_units), name="hid_init",
+                trainable=learn_init, regularizable=False)
+
+    def get_output_shape_for(self, input_shapes):
+        # The shape of the input to this layer will be the first element
+        # of input_shapes, whether or not a mask input is being used.
+        input_shape = input_shapes[0]
+        # When only_return_final is true, the second (sequence step) dimension
+        # will be flattened
+        if self.only_return_final:
+            return input_shape[0], self.num_units
+        # Otherwise, the shape will be (n_batch, n_steps, num_units)
+        else:
+            return input_shape[0], input_shape[1], self.num_units
+
+    def get_output_for(self, inputs, **kwargs):
+        """
+        Compute this layer's output function given a symbolic input variable
+
+        Parameters
+        ----------
+        inputs : list of theano.TensorType
+            `inputs[0]` should always be the symbolic input variable.  When
+            this layer has a mask input (i.e. was instantiated with
+            `mask_input != None`, indicating that the lengths of sequences in
+            each batch vary), `inputs` should have length 2, where `inputs[1]`
+            is the `mask`.  The `mask` should be supplied as a Theano variable
+            denoting whether each time step in each sequence in the batch is
+            part of the sequence or not.  `mask` should be a matrix of shape
+            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
+            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
+            of sequence i)``. When the hidden state of this layer is to be
+            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
+            should have length at least 2, and `inputs[-1]` is the hidden state
+            to prefill with. When the cell state of this layer is to be
+            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
+            should have length at least 2, and `inputs[-1]` is the cell state
+            to prefill with. When both the cell state and the hidden state are
+            being pre-filled `inputs[-2]` is the hidden state, while
+            `inputs[-1]` is the cell state.
+
+        Returns
+        -------
+        layer_output : theano.TensorType
+            Symbolic output variable.
+        """
+        # Retrieve the layer input
+        input = inputs[0]
+        # Retrieve the mask when it is supplied
+        mask = None
+        hid_init = None
+        cell_init = None
+        if self.mask_incoming_index > 0:
+            mask = inputs[self.mask_incoming_index]
+        if self.hid_init_incoming_index > 0:
+            hid_init = inputs[self.hid_init_incoming_index]
+        if self.cell_init_incoming_index > 0:
+            cell_init = inputs[self.cell_init_incoming_index]
+
+        # Treat all dimensions after the second as flattened feature dimensions
+        if input.ndim > 3:
+            input = T.flatten(input, 3)
+
+        # Because scan iterates over the first dimension we dimshuffle to
+        # (n_time_steps, n_batch, n_features)
+        input = input.dimshuffle(1, 0, 2)
+        seq_len, num_batch, _ = input.shape
+
+        # Stack input weight matrices into a (num_inputs, 4*num_units)
+        # matrix, which speeds up computation
+        W_in_stacked = T.concatenate(
+            [self.W_in_to_ingate, self.W_in_to_forgetgate,
+             self.W_in_to_cell, self.W_in_to_outgate], axis=1)
+
+        # Same for hidden weight matrices
+        W_hid_stacked = T.concatenate(
+            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
+             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)
+
+        # Stack biases into a (4*num_units) vector
+        b_stacked = T.concatenate(
+            [self.b_ingate, self.b_forgetgate,
+             self.b_cell, self.b_outgate], axis=0)
+
+        if self.precompute_input:
+            # Because the input is given for all time steps, we can
+            # precompute the dot product of the inputs and the weight
+            # matrices before scanning.
+            # W_in_stacked is (n_features, 4*num_units). input is then
+            # (n_time_steps, n_batch, 4*num_units).
+            input = T.dot(input, W_in_stacked) + b_stacked
+
+        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
+        # We define a slicing function that extracts the input to each LSTM gate
+        def slice_w(x, n):
+            return x[:, n*self.num_units:(n+1)*self.num_units]
+
+        # Create single recurrent computation step function
+        # input_n is the n'th vector of the input
+        def step(input_n, cell_previous, hid_previous, *args):
+            if not self.precompute_input:
+                input_n = T.dot(input_n, W_in_stacked) + b_stacked
+
+            # Calculate gates pre-activations and slice
+            gates = input_n + T.dot(hid_previous, W_hid_stacked)
+
+            # Clip gradients
+            if self.grad_clipping:
+                gates = theano.gradient.grad_clip(
+                    gates, -self.grad_clipping, self.grad_clipping)
+
+            # Extract the pre-activation gate values
+            ingate = slice_w(gates, 0)
+            forgetgate = slice_w(gates, 1)
+            cell_input = slice_w(gates, 2)
+            outgate = slice_w(gates, 3)
+
+            if self.peepholes:
+                # Compute peephole connections
+                ingate += cell_previous*self.W_cell_to_ingate
+                forgetgate += cell_previous*self.W_cell_to_forgetgate
+
+            # Apply nonlinearities
+            ingate = self.nonlinearity_ingate(ingate)
+            forgetgate = self.nonlinearity_forgetgate(forgetgate)
+            cell_input = self.nonlinearity_cell(cell_input)
+
+            # Compute new cell value
+            cell = forgetgate*cell_previous + ingate*cell_input
+
+            if self.peepholes:
+                outgate += cell*self.W_cell_to_outgate
+            outgate = self.nonlinearity_outgate(outgate)
+
+            # Compute new hidden unit activation
+            hid = outgate*self.nonlinearity(cell)
+            return [cell, hid]
+
+        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
+            cell, hid = step(input_n, cell_previous, hid_previous, *args)
+
+            # Skip over any input with mask 0 by copying the previous
+            # hidden state; proceed normally for any input with mask 1.
+            cell = T.switch(mask_n, cell, cell_previous)
+            hid = T.switch(mask_n, hid, hid_previous)
+
+            return [cell, hid]
+
+        if mask is not None:
+            # mask is given as (batch_size, seq_len). Because scan iterates
+            # over first dimension, we dimshuffle to (seq_len, batch_size) and
+            # add a broadcastable dimension
+            mask = mask.dimshuffle(1, 0, 'x')
+            sequences = [input, mask]
+            step_fun = step_masked
+        else:
+            sequences = input
+            step_fun = step
+
+        ones = T.ones((num_batch, 1))
+        if not isinstance(self.cell_init, Layer):
+            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
+            cell_init = T.dot(ones, self.cell_init)
+
+        if not isinstance(self.hid_init, Layer):
+            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
+            hid_init = T.dot(ones, self.hid_init)
+
+        # The hidden-to-hidden weight matrix is always used in step
+        non_seqs = [W_hid_stacked]
+        # The "peephole" weight matrices are only used when self.peepholes=True
+        if self.peepholes:
+            non_seqs += [self.W_cell_to_ingate,
+                         self.W_cell_to_forgetgate,
+                         self.W_cell_to_outgate]
+
+        # When we aren't precomputing the input outside of scan, we need to
+        # provide the input weights and biases to the step function
+        if not self.precompute_input:
+            non_seqs += [W_in_stacked, b_stacked]
+
+        if self.unroll_scan:
+            # Retrieve the dimensionality of the incoming layer
+            input_shape = self.input_shapes[0]
+            # Explicitly unroll the recurrence instead of using scan
+            cell_out, hid_out = unroll_scan(
+                fn=step_fun,
+                sequences=sequences,
+                outputs_info=[cell_init, hid_init],
+                go_backwards=self.backwards,
+                non_sequences=non_seqs,
+                n_steps=input_shape[1])
+        else:
+            # Scan op iterates over first dimension of input and repeatedly
+            # applies the step function
+            cell_out, hid_out = theano.scan(
+                fn=step_fun,
+                sequences=sequences,
+                outputs_info=[cell_init, hid_init],
+                go_backwards=self.backwards,
+                truncate_gradient=self.gradient_steps,
+                non_sequences=non_seqs,
+                strict=True)[0]
+
+        # When it is requested that we only return the final sequence step,
+        # we need to slice it out immediately after scan is applied
+        if self.only_return_final:
+            hid_out = hid_out[-1]
+        else:
+            # dimshuffle back to (n_batch, n_time_steps, n_features)
+            hid_out = hid_out.dimshuffle(1, 0, 2)
+
+            # if scan is backward, reverse the output
+            if self.backwards:
+                hid_out = hid_out[:, ::-1]
+
+        return hid_out
+
+
+class GRULayer(MergeLayer):
+    r"""
+    lasagne.layers.recurrent.GRULayer(incoming, num_units,
+    resetgate=lasagne.layers.Gate(W_cell=None),
+    updategate=lasagne.layers.Gate(W_cell=None),
+    hidden_update=lasagne.layers.Gate(
+    W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
+    hid_init=lasagne.init.Constant(0.), backwards=False, learn_init=False,
+    gradient_steps=-1, grad_clipping=0, unroll_scan=False,
+    precompute_input=True, mask_input=None, only_return_final=False, **kwargs)
+
+    Gated Recurrent Unit (GRU) Layer
+
+    Implements the recurrent step proposed in [1]_, which computes the output
+    by
+
+    .. math ::
+        r_t &= \sigma_r(x_t W_{xr} + h_{t - 1} W_{hr} + b_r)\\
+        u_t &= \sigma_u(x_t W_{xu} + h_{t - 1} W_{hu} + b_u)\\
+        c_t &= \sigma_c(x_t W_{xc} + r_t \odot (h_{t - 1} W_{hc}) + b_c)\\
+        h_t &= (1 - u_t) \odot h_{t - 1} + u_t \odot c_t
+
+    Parameters
+    ----------
+    incoming : a :class:`lasagne.layers.Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    num_units : int
+        Number of hidden units in the layer.
+    resetgate : Gate
+        Parameters for the reset gate (:math:`r_t`): :math:`W_{xr}`,
+        :math:`W_{hr}`, :math:`b_r`, and :math:`\sigma_r`.
+    updategate : Gate
+        Parameters for the update gate (:math:`u_t`): :math:`W_{xu}`,
+        :math:`W_{hu}`, :math:`b_u`, and :math:`\sigma_u`.
+    hidden_update : Gate
+        Parameters for the hidden update (:math:`c_t`): :math:`W_{xc}`,
+        :math:`W_{hc}`, :math:`b_c`, and :math:`\sigma_c`.
+    hid_init : callable, np.ndarray, theano.shared or :class:`Layer`
+        Initializer for initial hidden state (:math:`h_0`).
+    backwards : bool
+        If True, process the sequence backwards and then reverse the
+        output again such that the output from the layer is always
+        from :math:`x_1` to :math:`x_n`.
+    learn_init : bool
+        If True, initial hidden values are learned.
+    gradient_steps : int
+        Number of timesteps to include in the backpropagated gradient.
+        If -1, backpropagate through the entire sequence.
+    grad_clipping : float
+        If nonzero, the gradient messages are clipped to the given value during
+        the backward pass.  See [1]_ (p. 6) for further explanation.
+    unroll_scan : bool
+        If True the recursion is unrolled instead of using scan. For some
+        graphs this gives a significant speed up but it might also consume
+        more memory. When `unroll_scan` is True, backpropagation always
+        includes the full sequence, so `gradient_steps` must be set to -1 and
+        the input sequence length must be known at compile time (i.e., cannot
+        be given as None).
+    precompute_input : bool
+        If True, precompute input_to_hid before iterating through
+        the sequence. This can result in a speedup at the expense of
+        an increase in memory usage.
+    mask_input : :class:`lasagne.layers.Layer`
+        Layer which allows for a sequence mask to be input, for when sequences
+        are of variable length.  Default `None`, which means no mask will be
+        supplied (i.e. all sequences are of the same length).
+    only_return_final : bool
+        If True, only return the final sequential output (e.g. for tasks where
+        a single target value for the entire sequence is desired).  In this
+        case, Theano makes an optimization which saves memory.
+
+    References
+    ----------
+    .. [1] Cho, Kyunghyun, et al: On the properties of neural
+       machine translation: Encoder-decoder approaches.
+       arXiv preprint arXiv:1409.1259 (2014).
+    .. [2] Chung, Junyoung, et al.: Empirical Evaluation of Gated
+       Recurrent Neural Networks on Sequence Modeling.
+       arXiv preprint arXiv:1412.3555 (2014).
+    .. [3] Graves, Alex: "Generating sequences with recurrent neural networks."
+           arXiv preprint arXiv:1308.0850 (2013).
+
+    Notes
+    -----
+    An alternate update for the candidate hidden state is proposed in [2]_:
+
+    .. math::
+        c_t &= \sigma_c(x_t W_{ic} + (r_t \odot h_{t - 1})W_{hc} + b_c)\\
+
+    We use the formulation from [1]_ because it allows us to do all matrix
+    operations in a single dot product.
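+
+    Examples
+    --------
+    A minimal usage sketch; the shapes and unit count below are arbitrary
+    and for illustration only.  Setting ``only_return_final=True`` yields
+    just the last hidden state, e.g. for sequence classification:
+
+    >>> from lasagne.layers import InputLayer, GRULayer
+    >>> l_in = InputLayer((None, 20, 30))
+    >>> l_gru = GRULayer(l_in, num_units=10, only_return_final=True)
+    >>> l_gru.output_shape
+    (None, 10)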
+    """
+    def __init__(self, incoming, num_units,
+                 resetgate=Gate(W_cell=None),
+                 updategate=Gate(W_cell=None),
+                 hidden_update=Gate(W_cell=None,
+                                    nonlinearity=nonlinearities.tanh),
+                 hid_init=init.Constant(0.),
+                 backwards=False,
+                 learn_init=False,
+                 gradient_steps=-1,
+                 grad_clipping=0,
+                 unroll_scan=False,
+                 precompute_input=True,
+                 mask_input=None,
+                 only_return_final=False,
+                 **kwargs):
+
+        # This layer inherits from a MergeLayer, because it can have three
+        # inputs - the layer input, the mask and the initial hidden state.  We
+        # will just provide the layer input as incomings, unless a mask input
+        # or initial hidden state was provided.
+        incomings = [incoming]
+        self.mask_incoming_index = -1
+        self.hid_init_incoming_index = -1
+        if mask_input is not None:
+            incomings.append(mask_input)
+            self.mask_incoming_index = len(incomings)-1
+        if isinstance(hid_init, Layer):
+            incomings.append(hid_init)
+            self.hid_init_incoming_index = len(incomings)-1
+
+        # Initialize parent layer
+        super(GRULayer, self).__init__(incomings, **kwargs)
+
+        self.learn_init = learn_init
+        self.num_units = num_units
+        self.grad_clipping = grad_clipping
+        self.backwards = backwards
+        self.gradient_steps = gradient_steps
+        self.unroll_scan = unroll_scan
+        self.precompute_input = precompute_input
+        self.only_return_final = only_return_final
+
+        if unroll_scan and gradient_steps != -1:
+            raise ValueError(
+                "Gradient steps must be -1 when unroll_scan is true.")
+
+        # Retrieve the dimensionality of the incoming layer
+        input_shape = self.input_shapes[0]
+
+        if unroll_scan and input_shape[1] is None:
+            raise ValueError("Input sequence length cannot be specified as "
+                             "None when unroll_scan is True")
+
+        # Input dimensionality is the output dimensionality of the input layer
+        num_inputs = np.prod(input_shape[2:])
+
+        def add_gate_params(gate, gate_name):
+            """ Convenience function for adding layer parameters from a Gate
+            instance. """
+            return (self.add_param(gate.W_in, (num_inputs, num_units),
+                                   name="W_in_to_{}".format(gate_name)),
+                    self.add_param(gate.W_hid, (num_units, num_units),
+                                   name="W_hid_to_{}".format(gate_name)),
+                    self.add_param(gate.b, (num_units,),
+                                   name="b_{}".format(gate_name),
+                                   regularizable=False),
+                    gate.nonlinearity)
+
+        # Add in all parameters from gates
+        (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate,
+         self.nonlinearity_updategate) = add_gate_params(updategate,
+                                                         'updategate')
+        (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate,
+         self.nonlinearity_resetgate) = add_gate_params(resetgate, 'resetgate')
+
+        (self.W_in_to_hidden_update, self.W_hid_to_hidden_update,
+         self.b_hidden_update, self.nonlinearity_hid) = add_gate_params(
+             hidden_update, 'hidden_update')
+
+        # Initialize hidden state
+        if isinstance(hid_init, Layer):
+            self.hid_init = hid_init
+        else:
+            self.hid_init = self.add_param(
+                hid_init, (1, self.num_units), name="hid_init",
+                trainable=learn_init, regularizable=False)
+
+    def get_output_shape_for(self, input_shapes):
+        # The shape of the input to this layer will be the first element
+        # of input_shapes, whether or not a mask input is being used.
+        input_shape = input_shapes[0]
+        # When only_return_final is true, the second (sequence step) dimension
+        # will be flattened
+        if self.only_return_final:
+            return input_shape[0], self.num_units
+        # Otherwise, the shape will be (n_batch, n_steps, num_units)
+        else:
+            return input_shape[0], input_shape[1], self.num_units
+
+    def get_output_for(self, inputs, **kwargs):
+        """
+        Compute this layer's output function given a symbolic input variable
+
+        Parameters
+        ----------
+        inputs : list of theano.TensorType
+            `inputs[0]` should always be the symbolic input variable.  When
+            this layer has a mask input (i.e. was instantiated with
+            `mask_input != None`, indicating that the lengths of sequences in
+            each batch vary), `inputs` should have length 2, where `inputs[1]`
+            is the `mask`.  The `mask` should be supplied as a Theano variable
+            denoting whether each time step in each sequence in the batch is
+            part of the sequence or not.  `mask` should be a matrix of shape
+            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
+            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
+            of sequence i)``. When the hidden state of this layer is to be
+            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
+            should have length at least 2, and `inputs[-1]` is the hidden state
+            to prefill with.
+
+        Returns
+        -------
+        layer_output : theano.TensorType
+            Symbolic output variable.
+        """
+        # Retrieve the layer input
+        input = inputs[0]
+        # Retrieve the mask when it is supplied
+        mask = None
+        hid_init = None
+        if self.mask_incoming_index > 0:
+            mask = inputs[self.mask_incoming_index]
+        if self.hid_init_incoming_index > 0:
+            hid_init = inputs[self.hid_init_incoming_index]
+
+        # Treat all dimensions after the second as flattened feature dimensions
+        if input.ndim > 3:
+            input = T.flatten(input, 3)
+
+        # Because scan iterates over the first dimension we dimshuffle to
+        # (n_time_steps, n_batch, n_features)
+        input = input.dimshuffle(1, 0, 2)
+        seq_len, num_batch, _ = input.shape
+
+        # Stack input weight matrices into a (num_inputs, 3*num_units)
+        # matrix, which speeds up computation
+        W_in_stacked = T.concatenate(
+            [self.W_in_to_resetgate, self.W_in_to_updategate,
+             self.W_in_to_hidden_update], axis=1)
+
+        # Same for hidden weight matrices
+        W_hid_stacked = T.concatenate(
+            [self.W_hid_to_resetgate, self.W_hid_to_updategate,
+             self.W_hid_to_hidden_update], axis=1)
+
+        # Stack gate biases into a (3*num_units) vector
+        b_stacked = T.concatenate(
+            [self.b_resetgate, self.b_updategate,
+             self.b_hidden_update], axis=0)
+
+        if self.precompute_input:
+            # Precompute the dot product of the inputs and the input weights.
+            # W_in_stacked is (n_features, 3*num_units), so input is then
+            # (n_time_steps, n_batch, 3*num_units).
+            input = T.dot(input, W_in_stacked) + b_stacked
+
+        # When theano.scan calls step, input_n will be (n_batch, 3*num_units).
+        # We define a slicing function that extracts the input to each GRU gate
+        def slice_w(x, n):
+            return x[:, n*self.num_units:(n+1)*self.num_units]
+
+        # Create single recurrent computation step function
+        # input_n is the n'th vector of the input
+        def step(input_n, hid_previous, *args):
+            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
+            hid_input = T.dot(hid_previous, W_hid_stacked)
+
+            if self.grad_clipping:
+                input_n = theano.gradient.grad_clip(
+                    input_n, -self.grad_clipping, self.grad_clipping)
+                hid_input = theano.gradient.grad_clip(
+                    hid_input, -self.grad_clipping, self.grad_clipping)
+
+            if not self.precompute_input:
+                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
+                input_n = T.dot(input_n, W_in_stacked) + b_stacked
+
+            # Reset and update gates
+            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
+            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
+            resetgate = self.nonlinearity_resetgate(resetgate)
+            updategate = self.nonlinearity_updategate(updategate)
+
+            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
+            hidden_update_in = slice_w(input_n, 2)
+            hidden_update_hid = slice_w(hid_input, 2)
+            hidden_update = hidden_update_in + resetgate*hidden_update_hid
+            if self.grad_clipping:
+                hidden_update = theano.gradient.grad_clip(
+                    hidden_update, -self.grad_clipping, self.grad_clipping)
+            hidden_update = self.nonlinearity_hid(hidden_update)
+
+            # Compute (1 - u_t)h_{t - 1} + u_t c_t
+            hid = (1 - updategate)*hid_previous + updategate*hidden_update
+            return hid
+
+        def step_masked(input_n, mask_n, hid_previous, *args):
+            hid = step(input_n, hid_previous, *args)
+
+            # Skip over any input with mask 0 by copying the previous
+            # hidden state; proceed normally for any input with mask 1.
+            hid = T.switch(mask_n, hid, hid_previous)
+
+            return hid
+
+        if mask is not None:
+            # mask is given as (batch_size, seq_len). Because scan iterates
+            # over first dimension, we dimshuffle to (seq_len, batch_size) and
+            # add a broadcastable dimension
+            mask = mask.dimshuffle(1, 0, 'x')
+            sequences = [input, mask]
+            step_fun = step_masked
+        else:
+            sequences = [input]
+            step_fun = step
+
+        if not isinstance(self.hid_init, Layer):
+            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
+            hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init)
+
+        # The hidden-to-hidden weight matrix is always used in step
+        non_seqs = [W_hid_stacked]
+        # When we aren't precomputing the input outside of scan, we need to
+        # provide the input weights and biases to the step function
+        if not self.precompute_input:
+            non_seqs += [W_in_stacked, b_stacked]
+
+        if self.unroll_scan:
+            # Retrieve the dimensionality of the incoming layer
+            input_shape = self.input_shapes[0]
+            # Explicitly unroll the recurrence instead of using scan
+            hid_out = unroll_scan(
+                fn=step_fun,
+                sequences=sequences,
+                outputs_info=[hid_init],
+                go_backwards=self.backwards,
+                non_sequences=non_seqs,
+                n_steps=input_shape[1])[0]
+        else:
+            # Scan op iterates over first dimension of input and repeatedly
+            # applies the step function
+            hid_out = theano.scan(
+                fn=step_fun,
+                sequences=sequences,
+                go_backwards=self.backwards,
+                outputs_info=[hid_init],
+                non_sequences=non_seqs,
+                truncate_gradient=self.gradient_steps,
+                strict=True)[0]
+
+        # When it is requested that we only return the final sequence step,
+        # we need to slice it out immediately after scan is applied
+        if self.only_return_final:
+            hid_out = hid_out[-1]
+        else:
+            # dimshuffle back to (n_batch, n_time_steps, n_features)
+            hid_out = hid_out.dimshuffle(1, 0, 2)
+
+            # if scan is backward, reverse the output
+            if self.backwards:
+                hid_out = hid_out[:, ::-1]
+
+        return hid_out
diff --git a/lasagne/layers/shape.py b/lasagne/layers/shape.py
new file mode 100644
index 0000000..4f5e7ef
--- /dev/null
+++ b/lasagne/layers/shape.py
@@ -0,0 +1,397 @@
+import numpy as np
+import theano.tensor as T
+
+from ..theano_extensions import padding
+
+from .base import Layer
+
+
+__all__ = [
+    "FlattenLayer",
+    "flatten",
+    "ReshapeLayer",
+    "reshape",
+    "DimshuffleLayer",
+    "dimshuffle",
+    "PadLayer",
+    "pad",
+    "SliceLayer"
+]
+
+
+class FlattenLayer(Layer):
+    """
+    A layer that flattens its input. The leading ``outdim-1`` dimensions of
+    the output will have the same shape as the corresponding leading
+    dimensions of the input. The remaining dimensions are collapsed into the
+    last dimension.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    outdim : int
+        The number of dimensions in the output.
+
+    See Also
+    --------
+    flatten  : Shortcut
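+
+    Examples
+    --------
+    A small sketch with an arbitrary input shape; with the default
+    ``outdim=2``, all trailing dimensions are collapsed into one:
+
+    >>> from lasagne.layers import InputLayer, FlattenLayer
+    >>> l_in = InputLayer((2, 3, 4, 5))
+    >>> FlattenLayer(l_in).output_shape
+    (2, 60)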
+    """
+    def __init__(self, incoming, outdim=2, **kwargs):
+        super(FlattenLayer, self).__init__(incoming, **kwargs)
+        self.outdim = outdim
+
+        if outdim < 1:
+            raise ValueError('Dim must be >0, was %i' % outdim)
+
+    def get_output_shape_for(self, input_shape):
+        to_flatten = input_shape[self.outdim - 1:]
+
+        if any(s is None for s in to_flatten):
+            flattened = None
+        else:
+            flattened = int(np.prod(to_flatten))
+
+        return input_shape[:self.outdim - 1] + (flattened,)
+
+    def get_output_for(self, input, **kwargs):
+        return input.flatten(self.outdim)
+
+flatten = FlattenLayer  # shortcut
+
+
+class ReshapeLayer(Layer):
+    """
+    A layer reshaping its input tensor to another tensor of the same total
+    number of elements.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    shape : tuple
+        The target shape specification. Each element can be one of:
+
+        * ``i``, a positive integer directly giving the size of the dimension
+        * ``[i]``, a single-element list of int, denoting to use the size
+          of the ``i`` th input dimension
+        * ``-1``, denoting to infer the size for this dimension to match
+          the total number of elements in the input tensor (cannot be used
+          more than once in a specification)
+        * TensorVariable directly giving the size of the dimension
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, ReshapeLayer
+    >>> l_in = InputLayer((32, 100, 20))
+    >>> l1 = ReshapeLayer(l_in, (32, 50, 40))
+    >>> l1.output_shape
+    (32, 50, 40)
+    >>> l_in = InputLayer((None, 100, 20))
+    >>> l1 = ReshapeLayer(l_in, ([0], [1], 5, -1))
+    >>> l1.output_shape
+    (None, 100, 5, 4)
+
+    Notes
+    -----
+    The tensor elements will be fetched and placed in C-like order. That
+    is, reshaping `[1,2,3,4,5,6]` to shape `(2,3)` will result in a matrix
+    `[[1,2,3],[4,5,6]]`, not in `[[1,3,5],[2,4,6]]` (Fortran-like order),
+    regardless of the memory layout of the input tensor. For C-contiguous
+    input, reshaping is cheap, for others it may require copying the data.
+    """
+
+    def __init__(self, incoming, shape, **kwargs):
+        super(ReshapeLayer, self).__init__(incoming, **kwargs)
+        shape = tuple(shape)
+        for s in shape:
+            if isinstance(s, int):
+                if s == 0 or s < -1:
+                    raise ValueError("`shape` integers must be positive or -1")
+            elif isinstance(s, list):
+                if len(s) != 1 or not isinstance(s[0], int) or s[0] < 0:
+                    raise ValueError("`shape` input references must be "
+                                     "single-element lists of int >= 0")
+            elif isinstance(s, T.TensorVariable):
+                if s.ndim != 0:
+                    raise ValueError(
+                        "A symbolic variable in a shape specification must be "
+                        "a scalar, but had %i dimensions" % s.ndim)
+            else:
+                raise ValueError("`shape` must be a tuple of int and/or [int]")
+        if sum(s == -1 for s in shape) > 1:
+            raise ValueError("`shape` cannot contain multiple -1")
+        self.shape = shape
+        # try computing the output shape once as a sanity check
+        self.get_output_shape_for(self.input_shape)
+
+    def get_output_shape_for(self, input_shape, **kwargs):
+        # Initialize output shape from shape specification
+        output_shape = list(self.shape)
+        # First, replace all `[i]` with the corresponding input dimension, and
+        # mask parts of the shapes thus becoming irrelevant for -1 inference
+        masked_input_shape = list(input_shape)
+        masked_output_shape = list(output_shape)
+        for dim, o in enumerate(output_shape):
+            if isinstance(o, list):
+                if o[0] >= len(input_shape):
+                    raise ValueError("specification contains [%d], but input "
+                                     "shape has %d dimensions only" %
+                                     (o[0], len(input_shape)))
+                output_shape[dim] = input_shape[o[0]]
+                masked_output_shape[dim] = input_shape[o[0]]
+                if (input_shape[o[0]] is None) \
+                   and (masked_input_shape[o[0]] is None):
+                        # first time we copied this unknown input size: mask
+                        # it, we have a 1:1 correspondence between out[dim] and
+                        # in[o[0]] and can ignore it for -1 inference even if
+                        # it is unknown.
+                        masked_input_shape[o[0]] = 1
+                        masked_output_shape[dim] = 1
+        # Secondly, replace all symbolic shapes with `None`, as we cannot
+        # infer their size here.
+        for dim, o in enumerate(output_shape):
+            if isinstance(o, T.TensorVariable):
+                output_shape[dim] = None
+                masked_output_shape[dim] = None
+        # From the shapes, compute the sizes of the input and output tensor
+        input_size = (None if any(x is None for x in masked_input_shape)
+                      else np.prod(masked_input_shape))
+        output_size = (None if any(x is None for x in masked_output_shape)
+                       else np.prod(masked_output_shape))
+        del masked_input_shape, masked_output_shape
+        # Finally, infer value for -1 if needed
+        if -1 in output_shape:
+            dim = output_shape.index(-1)
+            if (input_size is None) or (output_size is None):
+                output_shape[dim] = None
+                output_size = None
+            else:
+                output_size *= -1
+                output_shape[dim] = input_size // output_size
+                output_size *= output_shape[dim]
+        # Sanity check
+        if (input_size is not None) and (output_size is not None) \
+           and (input_size != output_size):
+            raise ValueError("%s cannot be reshaped to specification %s. "
+                             "The total size mismatches." %
+                             (input_shape, self.shape))
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        # Replace all `[i]` with the corresponding input dimension
+        output_shape = list(self.shape)
+        for dim, o in enumerate(output_shape):
+            if isinstance(o, list):
+                output_shape[dim] = input.shape[o[0]]
+        # Everything else is handled by Theano
+        return input.reshape(tuple(output_shape))
+
+reshape = ReshapeLayer  # shortcut
+
+
+class DimshuffleLayer(Layer):
+    """
+    A layer that rearranges the dimensions of its input tensor, maintaining
+    the same total number of elements.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        the layer feeding into this layer, or the expected input shape
+
+    pattern : tuple
+        The new dimension order, with each element giving the index
+        of the dimension in the input tensor or `'x'` to broadcast it.
+        For example `(3,2,1,0)` will reverse the order of a 4-dimensional
+        tensor. Use `'x'` to broadcast, e.g. `(3,2,1,'x',0)` will
+        take a 4-dimensional tensor of shape `(2,3,5,7)` as input and
+        produce a tensor of shape `(7,5,3,1,2)` with the 4th dimension being
+        broadcast-able. In general, all dimensions in the input tensor
+        must be used to generate the output tensor. Omitting a dimension
+        attempts to collapse it; this can only be done to broadcast-able
+        dimensions, e.g. a 5-tensor of shape `(7,5,3,1,2)` with the 4th
+        being broadcast-able can be shuffled with the pattern `(4,2,1,0)`
+        collapsing the 4th dimension resulting in a tensor of shape
+        `(2,3,5,7)`.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DimshuffleLayer
+    >>> l_in = InputLayer((2, 3, 5, 7))
+    >>> l1 = DimshuffleLayer(l_in, (3, 2, 1, 'x', 0))
+    >>> l1.output_shape
+    (7, 5, 3, 1, 2)
+    >>> l2 = DimshuffleLayer(l1, (4, 2, 1, 0))
+    >>> l2.output_shape
+    (2, 3, 5, 7)
+    """
+    def __init__(self, incoming, pattern, **kwargs):
+        super(DimshuffleLayer, self).__init__(incoming, **kwargs)
+
+        # Sanity check the pattern
+        used_dims = set()
+        for p in pattern:
+            if isinstance(p, int):
+                # Dimension p
+                if p in used_dims:
+                    raise ValueError("pattern contains dimension {0} more "
+                                     "than once".format(p))
+                used_dims.add(p)
+            elif p == 'x':
+                # Broadcast
+                pass
+            else:
+                raise ValueError("pattern should only contain dimension"
+                                 "indices or 'x', not {0}".format(p))
+
+        self.pattern = pattern
+
+        # try computing the output shape once as a sanity check
+        self.get_output_shape_for(self.input_shape)
+
+    def get_output_shape_for(self, input_shape):
+        # Build output shape while keeping track of the dimensions that we are
+        # attempting to collapse, so we can ensure that they are broadcastable
+        output_shape = []
+        dims_used = [False] * len(input_shape)
+        for p in self.pattern:
+            if isinstance(p, int):
+                if p < 0 or p >= len(input_shape):
+                    raise ValueError("pattern contains {0}, but input shape "
+                                     "has {1} dimensions "
+                                     "only".format(p, len(input_shape)))
+                # Dimension p
+                o = input_shape[p]
+                dims_used[p] = True
+            elif p == 'x':
+                # Broadcast; will be of size 1
+                o = 1
+            output_shape.append(o)
+
+        for i, (dim_size, used) in enumerate(zip(input_shape, dims_used)):
+            if not used and dim_size != 1 and dim_size is not None:
+                raise ValueError(
+                    "pattern attempted to collapse dimension "
+                    "{0} of size {1}; dimensions with size != 1/None are not"
+                    "broadcastable and cannot be "
+                    "collapsed".format(i, dim_size))
+
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        return input.dimshuffle(self.pattern)
+
+dimshuffle = DimshuffleLayer  # shortcut
+
+
+class PadLayer(Layer):
+    """
+    Pad all dimensions except the first ``batch_ndim`` with ``width``
+    zeros on both sides, or with another value specified in ``val``.
+    Individual padding for each dimension or edge can be specified
+    using a tuple or list of tuples for ``width``.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    width : int, iterable of int, or iterable of tuple
+        Padding width. If an int, pads each axis symmetrically with the same
+        amount in the beginning and end. If an iterable of int, defines the
+        symmetric padding width separately for each axis. If an iterable of
+        tuples of two ints, defines a separate padding width for each beginning
+        and end of each axis.
+
+    val : float
+        Value used for padding
+
+    batch_ndim : int
+        Dimensions up to this value are not padded. For padding convolutional
+        layers this should be set to 2 so the sample and filter dimensions are
+        not padded.
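+
+    Examples
+    --------
+    A small sketch with an arbitrary input shape; with the default
+    ``batch_ndim=2``, only the two trailing axes are padded:
+
+    >>> from lasagne.layers import InputLayer, PadLayer
+    >>> l_in = InputLayer((2, 3, 4, 5))
+    >>> PadLayer(l_in, width=1).output_shape
+    (2, 3, 6, 7)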
+    """
+    def __init__(self, incoming, width, val=0, batch_ndim=2, **kwargs):
+        super(PadLayer, self).__init__(incoming, **kwargs)
+        self.width = width
+        self.val = val
+        self.batch_ndim = batch_ndim
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)
+
+        if isinstance(self.width, int):
+            widths = [self.width] * (len(input_shape) - self.batch_ndim)
+        else:
+            widths = self.width
+
+        for k, w in enumerate(widths):
+            if output_shape[k + self.batch_ndim] is None:
+                continue
+            else:
+                try:
+                    l, r = w
+                except TypeError:
+                    l = r = w
+                output_shape[k + self.batch_ndim] += l + r
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        return padding.pad(input, self.width, self.val, self.batch_ndim)
+
+pad = PadLayer  # shortcut
+
+
+class SliceLayer(Layer):
+    """
+    Slices the input at a specific axis and at specific indices.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    indices : int or slice instance
+        If an ``int``, selects a single element from the given axis, dropping
+        the axis. If a slice, selects all elements in the given range, keeping
+        the axis.
+
+    axis : int
+        Specifies the axis from which the indices are selected.
+
+    Examples
+    --------
+    >>> from lasagne.layers import SliceLayer, InputLayer
+    >>> l_in = InputLayer((2, 3, 4))
+    >>> SliceLayer(l_in, indices=0, axis=1).output_shape
+    ... # equals input[:, 0]
+    (2, 4)
+    >>> SliceLayer(l_in, indices=slice(0, 1), axis=1).output_shape
+    ... # equals input[:, 0:1]
+    (2, 1, 4)
+    >>> SliceLayer(l_in, indices=slice(-2, None), axis=-1).output_shape
+    ... # equals input[..., -2:]
+    (2, 3, 2)
+    """
+    def __init__(self, incoming, indices, axis=-1, **kwargs):
+        super(SliceLayer, self).__init__(incoming, **kwargs)
+        self.slice = indices
+        self.axis = axis
+
+    def get_output_shape_for(self, input_shape):
+        output_shape = list(input_shape)
+        if isinstance(self.slice, int):
+            del output_shape[self.axis]
+        elif input_shape[self.axis] is not None:
+            output_shape[self.axis] = len(
+                range(*self.slice.indices(input_shape[self.axis])))
+        else:
+            output_shape[self.axis] = None
+        return tuple(output_shape)
+
+    def get_output_for(self, input, **kwargs):
+        axis = self.axis
+        if axis < 0:
+            axis += input.ndim
+        return input[(slice(None),) * axis + (self.slice,)]
diff --git a/lasagne/layers/special.py b/lasagne/layers/special.py
new file mode 100644
index 0000000..13f7716
--- /dev/null
+++ b/lasagne/layers/special.py
@@ -0,0 +1,1155 @@
+import theano
+import theano.tensor as T
+import numpy as np
+
+from .. import init
+from .. import nonlinearities
+from ..utils import as_tuple, floatX
+from ..random import get_rng
+from .base import Layer, MergeLayer
+from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
+
+
+__all__ = [
+    "NonlinearityLayer",
+    "BiasLayer",
+    "ScaleLayer",
+    "standardize",
+    "ExpressionLayer",
+    "InverseLayer",
+    "TransformerLayer",
+    "TPSTransformerLayer",
+    "ParametricRectifierLayer",
+    "prelu",
+    "RandomizedRectifierLayer",
+    "rrelu",
+]
+
+
+class NonlinearityLayer(Layer):
+    """
+    lasagne.layers.NonlinearityLayer(incoming,
+    nonlinearity=lasagne.nonlinearities.rectify, **kwargs)
+
+    A layer that just applies a nonlinearity.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    nonlinearity : callable or None
+        The nonlinearity that is applied to the layer activations. If None
+        is provided, the layer will be linear.
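+
+    Examples
+    --------
+    A common pattern, sketched here with arbitrary shapes, is to apply a
+    nonlinearity separately from the preceding linear transformation, e.g.
+    a softmax on top of a linear :class:`DenseLayer`:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer
+    >>> from lasagne.nonlinearities import softmax
+    >>> l_in = InputLayer((None, 100))
+    >>> l_linear = DenseLayer(l_in, num_units=10, nonlinearity=None)
+    >>> l_out = NonlinearityLayer(l_linear, nonlinearity=softmax)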
+    """
+    def __init__(self, incoming, nonlinearity=nonlinearities.rectify,
+                 **kwargs):
+        super(NonlinearityLayer, self).__init__(incoming, **kwargs)
+        self.nonlinearity = (nonlinearities.identity if nonlinearity is None
+                             else nonlinearity)
+
+    def get_output_for(self, input, **kwargs):
+        return self.nonlinearity(input)
+
+
+class BiasLayer(Layer):
+    """
+    lasagne.layers.BiasLayer(incoming, b=lasagne.init.Constant(0),
+    shared_axes='auto', **kwargs)
+
+    A layer that just adds a (trainable) bias term.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    b : Theano shared variable, expression, numpy array, callable or ``None``
+        Initial value, expression or initializer for the biases. If set to
+        ``None``, the layer will have no biases and pass through its input
+        unchanged. Otherwise, the bias shape must match the incoming shape,
+        skipping those axes the biases are shared over (see the example below).
+        See :func:`lasagne.utils.create_param` for more information.
+
+    shared_axes : 'auto', int or tuple of int
+        The axis or axes to share biases over. If ``'auto'`` (the default),
+        share over all axes except for the second: this will share biases over
+        the minibatch dimension for dense layers, and additionally over all
+        spatial dimensions for convolutional layers.
+
+    Notes
+    -----
+    The bias parameter dimensionality is the input dimensionality minus the
+    number of axes the biases are shared over, which matches the bias parameter
+    conventions of :class:`DenseLayer` or :class:`Conv2DLayer`. For example:
+
+    >>> layer = BiasLayer((20, 30, 40, 50), shared_axes=(0, 2))
+    >>> layer.b.get_value().shape
+    (30, 50)
+    """
+    def __init__(self, incoming, b=init.Constant(0), shared_axes='auto',
+                 **kwargs):
+        super(BiasLayer, self).__init__(incoming, **kwargs)
+
+        if shared_axes == 'auto':
+            # default: share biases over all but the second axis
+            shared_axes = (0,) + tuple(range(2, len(self.input_shape)))
+        elif isinstance(shared_axes, int):
+            shared_axes = (shared_axes,)
+        self.shared_axes = shared_axes
+
+        if b is None:
+            self.b = None
+        else:
+            # create bias parameter, ignoring all dimensions in shared_axes
+            shape = [size for axis, size in enumerate(self.input_shape)
+                     if axis not in self.shared_axes]
+            if any(size is None for size in shape):
+                raise ValueError("BiasLayer needs specified input sizes for "
+                                 "all axes that biases are not shared over.")
+            self.b = self.add_param(b, shape, 'b', regularizable=False)
+
+    def get_output_for(self, input, **kwargs):
+        if self.b is not None:
+            bias_axes = iter(range(self.b.ndim))
+            pattern = ['x' if input_axis in self.shared_axes
+                       else next(bias_axes)
+                       for input_axis in range(input.ndim)]
+            return input + self.b.dimshuffle(*pattern)
+        else:
+            return input
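+
+# A plain-NumPy sketch of the broadcasting pattern built above (illustrative
+# only): with input shape (20, 30, 40, 50) and shared_axes=(0, 2), the bias has
+# shape (30, 50) and is broadcast by inserting length-1 axes where it is shared.
+# >>> import numpy as np
+# >>> x = np.zeros((20, 30, 40, 50))
+# >>> b = np.ones((30, 50))
+# >>> (x + b[np.newaxis, :, np.newaxis, :]).shape
+# (20, 30, 40, 50)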
+
+
+class ScaleLayer(Layer):
+    """
+    lasagne.layers.ScaleLayer(incoming, scales=lasagne.init.Constant(1),
+    shared_axes='auto', **kwargs)
+
+    A layer that scales its inputs by learned coefficients.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    scales : Theano shared variable, expression, numpy array, or callable
+        Initial value, expression or initializer for the scale.  The scale
+        shape must match the incoming shape, skipping those axes the scales are
+        shared over (see the example below).  See
+        :func:`lasagne.utils.create_param` for more information.
+
+    shared_axes : 'auto', int or tuple of int
+        The axis or axes to share scales over. If ``'auto'`` (the default),
+        share over all axes except for the second: this will share scales over
+        the minibatch dimension for dense layers, and additionally over all
+        spatial dimensions for convolutional layers.
+
+    Notes
+    -----
+    The scales parameter dimensionality is the input dimensionality minus the
+    number of axes the scales are shared over, which matches the bias parameter
+    conventions of :class:`DenseLayer` or :class:`Conv2DLayer`. For example:
+
+    >>> layer = ScaleLayer((20, 30, 40, 50), shared_axes=(0, 2))
+    >>> layer.scales.get_value().shape
+    (30, 50)
+    """
+    def __init__(self, incoming, scales=init.Constant(1), shared_axes='auto',
+                 **kwargs):
+        super(ScaleLayer, self).__init__(incoming, **kwargs)
+
+        if shared_axes == 'auto':
+            # default: share scales over all but the second axis
+            shared_axes = (0,) + tuple(range(2, len(self.input_shape)))
+        elif isinstance(shared_axes, int):
+            shared_axes = (shared_axes,)
+        self.shared_axes = shared_axes
+
+        # create scales parameter, ignoring all dimensions in shared_axes
+        shape = [size for axis, size in enumerate(self.input_shape)
+                 if axis not in self.shared_axes]
+        if any(size is None for size in shape):
+            raise ValueError("ScaleLayer needs specified input sizes for "
+                             "all axes that scales are not shared over.")
+        self.scales = self.add_param(
+            scales, shape, 'scales', regularizable=False)
+
+    def get_output_for(self, input, **kwargs):
+        axes = iter(range(self.scales.ndim))
+        pattern = ['x' if input_axis in self.shared_axes
+                   else next(axes) for input_axis in range(input.ndim)]
+        return input * self.scales.dimshuffle(*pattern)
+
+
+def standardize(layer, offset, scale, shared_axes='auto'):
+    """
+    Convenience function for standardizing inputs by applying a fixed offset
+    and scale.  This is usually useful when you want the input to your network
+    to, say, have zero mean and unit standard deviation over the feature
+    dimensions.  This layer allows you to include the appropriate statistics to
+    achieve this normalization as part of your network, and applies them to its
+    input.  The statistics are supplied as the `offset` and `scale` parameters,
+    which are applied to the input by subtracting `offset` and dividing by
+    `scale`, sharing dimensions as specified by the `shared_axes` argument.
+
+    Parameters
+    ----------
+    layer : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    offset : Theano shared variable or numpy array
+        The offset to apply (via subtraction) to the axis/axes being
+        standardized.
+    scale : Theano shared variable or numpy array
+        The scale to apply (via division) to the axis/axes being standardized.
+    shared_axes : 'auto', int or tuple of int
+        The axis or axes to share the offset and scale over. If ``'auto'`` (the
+        default), share over all axes except for the second: this will share
+        scales over the minibatch dimension for dense layers, and additionally
+        over all spatial dimensions for convolutional layers.
+
+    Examples
+    --------
+    Assuming your training data exists in a 2D numpy ndarray called
+    ``training_data``, you can use this function to scale input features to the
+    [0, 1] range based on the training set statistics like so:
+
+    >>> import lasagne
+    >>> import numpy as np
+    >>> training_data = np.random.standard_normal((100, 20))
+    >>> input_shape = (None, training_data.shape[1])
+    >>> l_in = lasagne.layers.InputLayer(input_shape)
+    >>> offset = training_data.min(axis=0)
+    >>> scale = training_data.max(axis=0) - training_data.min(axis=0)
+    >>> l_std = standardize(l_in, offset, scale, shared_axes=0)
+
+    Alternatively, to z-score your inputs based on training set statistics, you
+    could set ``offset = training_data.mean(axis=0)`` and
+    ``scale = training_data.std(axis=0)`` instead.
+    """
+    # Subtract the offset
+    layer = BiasLayer(layer, -offset, shared_axes)
+    # Do not optimize the offset parameter
+    layer.params[layer.b].remove('trainable')
+    # Divide by the scale
+    layer = ScaleLayer(layer, floatX(1.)/scale, shared_axes)
+    # Do not optimize the scales parameter
+    layer.params[layer.scales].remove('trainable')
+    return layer
+
+
+class ExpressionLayer(Layer):
+    """
+    This layer provides boilerplate for a custom layer that applies a
+    simple transformation to the input.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+
+    function : callable
+        A function to be applied to the output of the previous layer.
+
+    output_shape : None, callable, tuple, or 'auto'
+        Specifies the output shape of this layer. If a tuple, this fixes the
+        output shape for any input shape (the tuple can contain None if some
+        dimensions may vary). If a callable, it should return the calculated
+        output shape given the input shape. If None, the output shape is
+        assumed to be the same as the input shape. If 'auto', an attempt will
+        be made to automatically infer the correct output shape.
+
+    Notes
+    -----
+    An :class:`ExpressionLayer` that does not change the shape of the data
+    (i.e., is constructed with the default setting of ``output_shape=None``)
+    is functionally equivalent to a :class:`NonlinearityLayer`.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, ExpressionLayer
+    >>> l_in = InputLayer((32, 100, 20))
+    >>> l1 = ExpressionLayer(l_in, lambda X: X.mean(-1), output_shape='auto')
+    >>> l1.output_shape
+    (32, 100)
+    """
+    def __init__(self, incoming, function, output_shape=None, **kwargs):
+        super(ExpressionLayer, self).__init__(incoming, **kwargs)
+
+        if output_shape is None:
+            self._output_shape = None
+        elif output_shape == 'auto':
+            self._output_shape = 'auto'
+        elif hasattr(output_shape, '__call__'):
+            self.get_output_shape_for = output_shape
+        else:
+            self._output_shape = tuple(output_shape)
+
+        self.function = function
+
+    def get_output_shape_for(self, input_shape):
+        if self._output_shape is None:
+            return input_shape
+        elif self._output_shape == 'auto':
+            input_shape = (0 if s is None else s for s in input_shape)
+            X = theano.tensor.alloc(0, *input_shape)
+            output_shape = self.function(X).shape.eval()
+            output_shape = tuple(s if s else None for s in output_shape)
+            return output_shape
+        else:
+            return self._output_shape
+
+    def get_output_for(self, input, **kwargs):
+        return self.function(input)
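+
+# A small usage sketch (illustrative only): instead of relying on 'auto'
+# inference, a callable can be passed as ``output_shape``; it receives the
+# input shape and returns the output shape.
+# >>> from lasagne.layers import InputLayer, ExpressionLayer
+# >>> l_in = InputLayer((32, 100, 20))
+# >>> l1 = ExpressionLayer(l_in, lambda X: X.sum(-1),
+# ...                      output_shape=lambda shape: shape[:-1])
+# >>> l1.output_shape
+# (32, 100)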
+
+
+class InverseLayer(MergeLayer):
+    """
+    The :class:`InverseLayer` class performs inverse operations
+    for a single layer of a neural network by applying the
+    partial derivative of the layer to be inverted with
+    respect to its input: transposed layer
+    for a :class:`DenseLayer`, deconvolutional layer for
+    :class:`Conv2DLayer`, :class:`Conv1DLayer`; or
+    an unpooling layer for :class:`MaxPool2DLayer`.
+
+    It is especially useful for building (convolutional)
+    autoencoders with tied parameters.
+
+    Note that if the layer to be inverted contains a nonlinearity
+    and/or a bias, the :class:`InverseLayer` will include the derivative
+    of that in its computation.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape.
+    layer : a :class:`Layer` instance
+        The layer whose operation this :class:`InverseLayer`
+        instance inverts.
+
+    Examples
+    --------
+    >>> import lasagne
+    >>> from lasagne.layers import InputLayer, Conv2DLayer, DenseLayer
+    >>> from lasagne.layers import InverseLayer
+    >>> l_in = InputLayer((100, 3, 28, 28))
+    >>> l1 = Conv2DLayer(l_in, num_filters=16, filter_size=5)
+    >>> l2 = DenseLayer(l1, num_units=20)
+    >>> l_u2 = InverseLayer(l2, l2)  # backprop through l2
+    >>> l_u1 = InverseLayer(l_u2, l1)  # backprop through l1
+    """
+    def __init__(self, incoming, layer, **kwargs):
+
+        super(InverseLayer, self).__init__(
+            [incoming, layer, layer.input_layer], **kwargs)
+
+    def get_output_shape_for(self, input_shapes):
+        return input_shapes[2]
+
+    def get_output_for(self, inputs, **kwargs):
+        input, layer_out, layer_in = inputs
+        return theano.grad(None, wrt=layer_in, known_grads={layer_out: input})
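+
+# For intuition (illustrative only): with a purely linear dense layer
+# y = x.dot(W), the gradient of y w.r.t. x under a known upstream gradient g is
+# g.dot(W.T), i.e. the tied-weight transposed layer that InverseLayer realizes.
+# >>> import numpy as np
+# >>> W = np.random.randn(100, 20)          # forward weights: 100 -> 20 units
+# >>> g = np.random.randn(5, 20)            # what is fed into the InverseLayer
+# >>> g.dot(W.T).shape                      # inverse maps back to 100 features
+# (5, 100)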
+
+
+class TransformerLayer(MergeLayer):
+    """
+    Spatial transformer layer
+
+    The layer applies an affine transformation on the input. The affine
+    transformation is parameterized with six learned parameters [1]_.
+    The output is interpolated with a bilinear transformation.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    localization_network : a :class:`Layer` instance
+        The network that calculates the parameters of the affine
+        transformation. See the example for how to initialize to the identity
+        transform.
+
+    downsample_factor : float or iterable of float
+        A float or a 2-element tuple specifying the downsample factor for the
+        output image (in both spatial dimensions). A value of 1 will keep the
+        original size of the input. Values larger than 1 will downsample the
+        input. Values below 1 will upsample the input.
+
+    References
+    ----------
+    .. [1]  Max Jaderberg, Karen Simonyan, Andrew Zisserman,
+            Koray Kavukcuoglu (2015):
+            Spatial Transformer Networks. NIPS 2015,
+            http://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf
+
+    Examples
+    --------
+    Here we set up the layer to initially do the identity transform, similarly
+    to [1]_. Note that you will want to use a localization network with a
+    linear output. If the output from the localization network is
+    [t1, t2, t3, t4, t5, t6], then t1 and t5 determine zoom, t2 and t4
+    determine skewness, and t3 and t6 move the center position.
+
+    >>> import numpy as np
+    >>> import lasagne
+    >>> b = np.zeros((2, 3), dtype='float32')
+    >>> b[0, 0] = 1
+    >>> b[1, 1] = 1
+    >>> b = b.flatten()  # identity transform
+    >>> W = lasagne.init.Constant(0.0)
+    >>> l_in = lasagne.layers.InputLayer((None, 3, 28, 28))
+    >>> l_loc = lasagne.layers.DenseLayer(l_in, num_units=6, W=W, b=b,
+    ... nonlinearity=None)
+    >>> l_trans = lasagne.layers.TransformerLayer(l_in, l_loc)
+    """
+    def __init__(self, incoming, localization_network, downsample_factor=1,
+                 **kwargs):
+        super(TransformerLayer, self).__init__(
+            [incoming, localization_network], **kwargs)
+        self.downsample_factor = as_tuple(downsample_factor, 2)
+
+        input_shp, loc_shp = self.input_shapes
+
+        if loc_shp[-1] != 6 or len(loc_shp) != 2:
+            raise ValueError("The localization network must have "
+                             "output shape: (batch_size, 6)")
+        if len(input_shp) != 4:
+            raise ValueError("The input network must have a 4-dimensional "
+                             "output shape: (batch_size, num_input_channels, "
+                             "input_rows, input_columns)")
+
+    def get_output_shape_for(self, input_shapes):
+        shape = input_shapes[0]
+        factors = self.downsample_factor
+        return (shape[:2] + tuple(None if s is None else int(s / f)
+                                  for s, f in zip(shape[2:], factors)))
+
+    def get_output_for(self, inputs, **kwargs):
+        # see eq. (1) and sec 3.1 in [1]
+        input, theta = inputs
+        return _transform_affine(theta, input, self.downsample_factor)
+
+
+def _transform_affine(theta, input, downsample_factor):
+    num_batch, num_channels, height, width = input.shape
+    theta = T.reshape(theta, (-1, 2, 3))
+
+    # grid of (x_t, y_t, 1), eq (1) in ref [1]
+    out_height = T.cast(height / downsample_factor[0], 'int64')
+    out_width = T.cast(width / downsample_factor[1], 'int64')
+    grid = _meshgrid(out_height, out_width)
+
+    # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
+    T_g = T.dot(theta, grid)
+    x_s = T_g[:, 0]
+    y_s = T_g[:, 1]
+    x_s_flat = x_s.flatten()
+    y_s_flat = y_s.flatten()
+
+    # dimshuffle input to  (bs, height, width, channels)
+    input_dim = input.dimshuffle(0, 2, 3, 1)
+    input_transformed = _interpolate(
+        input_dim, x_s_flat, y_s_flat,
+        out_height, out_width)
+
+    output = T.reshape(
+        input_transformed, (num_batch, out_height, out_width, num_channels))
+    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
+    return output
+
+
+def _interpolate(im, x, y, out_height, out_width):
+    # *_f are floats
+    num_batch, height, width, channels = im.shape
+    height_f = T.cast(height, theano.config.floatX)
+    width_f = T.cast(width, theano.config.floatX)
+
+    # clip coordinates to [-1, 1]
+    x = T.clip(x, -1, 1)
+    y = T.clip(y, -1, 1)
+
+    # scale coordinates from [-1, 1] to [0, width/height - 1]
+    x = (x + 1) / 2 * (width_f - 1)
+    y = (y + 1) / 2 * (height_f - 1)
+
+    # obtain indices of the 2x2 pixel neighborhood surrounding the coordinates;
+    # we need those in floatX for interpolation and in int64 for indexing. for
+    # indexing, we need to take care they do not extend past the image.
+    x0_f = T.floor(x)
+    y0_f = T.floor(y)
+    x1_f = x0_f + 1
+    y1_f = y0_f + 1
+    x0 = T.cast(x0_f, 'int64')
+    y0 = T.cast(y0_f, 'int64')
+    x1 = T.cast(T.minimum(x1_f, width_f - 1), 'int64')
+    y1 = T.cast(T.minimum(y1_f, height_f - 1), 'int64')
+
+    # The input is [num_batch, height, width, channels]. We do the lookup in
+    # the flattened input, i.e [num_batch*height*width, channels]. We need
+    # to offset all indices to match the flat version
+    dim2 = width
+    dim1 = width*height
+    base = T.repeat(
+        T.arange(num_batch, dtype='int64')*dim1, out_height*out_width)
+    base_y0 = base + y0*dim2
+    base_y1 = base + y1*dim2
+    idx_a = base_y0 + x0
+    idx_b = base_y1 + x0
+    idx_c = base_y0 + x1
+    idx_d = base_y1 + x1
+
+    # use indices to lookup pixels for all samples
+    im_flat = im.reshape((-1, channels))
+    Ia = im_flat[idx_a]
+    Ib = im_flat[idx_b]
+    Ic = im_flat[idx_c]
+    Id = im_flat[idx_d]
+
+    # calculate interpolated values
+    wa = ((x1_f-x) * (y1_f-y)).dimshuffle(0, 'x')
+    wb = ((x1_f-x) * (y-y0_f)).dimshuffle(0, 'x')
+    wc = ((x-x0_f) * (y1_f-y)).dimshuffle(0, 'x')
+    wd = ((x-x0_f) * (y-y0_f)).dimshuffle(0, 'x')
+    output = T.sum([wa*Ia, wb*Ib, wc*Ic, wd*Id], axis=0)
+    return output
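+
+# A plain-NumPy sketch of the bilinear lookup above for a single image and a
+# single sampling point (illustrative only): the four neighbouring pixels are
+# blended with weights given by the opposite rectangle areas.
+# >>> import numpy as np
+# >>> im = np.arange(16, dtype=float).reshape(4, 4)  # im[row, col] = 4*row + col
+# >>> x, y = 1.5, 2.25                                # already rescaled to pixels
+# >>> x0, y0 = int(np.floor(x)), int(np.floor(y))
+# >>> x1, y1 = x0 + 1, y0 + 1
+# >>> wa = (x1 - x) * (y1 - y); wb = (x1 - x) * (y - y0)
+# >>> wc = (x - x0) * (y1 - y); wd = (x - x0) * (y - y0)
+# >>> float(wa*im[y0, x0] + wb*im[y1, x0] + wc*im[y0, x1] + wd*im[y1, x1])
+# 10.5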
+
+
+def _linspace(start, stop, num):
+    # Theano linspace. Behaves similar to np.linspace
+    start = T.cast(start, theano.config.floatX)
+    stop = T.cast(stop, theano.config.floatX)
+    num = T.cast(num, theano.config.floatX)
+    step = (stop-start)/(num-1)
+    return T.arange(num, dtype=theano.config.floatX)*step+start
+
+
+def _meshgrid(height, width):
+    # This function is the grid generator from eq. (1) in reference [1].
+    # It is equivalent to the following numpy code:
+    #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
+    #                         np.linspace(-1, 1, height))
+    #  ones = np.ones(np.prod(x_t.shape))
+    #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
+    # It is implemented in Theano instead to support symbolic grid sizes.
+    # Note: If the image size is known at layer construction time, we could
+    # compute the meshgrid offline in numpy instead of doing it dynamically
+    # in Theano. However, it hardly affected performance when we tried.
+    x_t = T.dot(T.ones((height, 1)),
+                _linspace(-1.0, 1.0, width).dimshuffle('x', 0))
+    y_t = T.dot(_linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
+                T.ones((1, width)))
+
+    x_t_flat = x_t.reshape((1, -1))
+    y_t_flat = y_t.reshape((1, -1))
+    ones = T.ones_like(x_t_flat)
+    grid = T.concatenate([x_t_flat, y_t_flat, ones], axis=0)
+    return grid
+
+
+class TPSTransformerLayer(MergeLayer):
+    """
+    Spatial transformer layer
+
+    The layer applies a thin plate spline transformation [2]_ on the input
+    as in [1]_. The thin plate spline transform is determined based on the
+    movement of some number of control points. The starting positions for
+    these control points are fixed. The output is interpolated with a
+    bilinear transformation.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape. The
+        output of this layer should be a 4D tensor, with shape
+        ``(batch_size, num_input_channels, input_rows, input_columns)``.
+
+    localization_network : a :class:`Layer` instance
+        The network that calculates the parameters of the thin plate spline
+        transformation as the x and y coordinates of the destination offsets of
+        each control point. The output of the localization network should
+        be a 2D tensor, with shape ``(batch_size, 2 * num_control_points)``.
+
+    downsample_factor : float or iterable of float
+        A float or a 2-element tuple specifying the downsample factor for the
+        output image (in both spatial dimensions). A value of 1 will keep the
+        original size of the input. Values larger than 1 will downsample the
+        input. Values below 1 will upsample the input.
+
+    control_points : integer
+        The number of control points to be used for the thin plate spline
+        transformation. These points will be arranged as a grid along the
+        image, so the value must be a perfect square. Default is 16.
+
+    precompute_grid : 'auto' or boolean
+        Flag to precompute the U function [2]_ for the grid and source
+        points. If ``'auto'``, it will be set to ``True`` as long as the input
+        height and width are specified. If ``True``, the U function is computed
+        at layer construction time for the fixed input shape. If ``False``, the
+        grid is computed as part of the Theano computational graph, which is
+        substantially slower because this computation scales with
+        num_pixels*num_control_points. Default is ``'auto'``.
+
+    References
+    ----------
+    .. [1]  Max Jaderberg, Karen Simonyan, Andrew Zisserman,
+            Koray Kavukcuoglu (2015):
+            Spatial Transformer Networks. NIPS 2015,
+            http://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf
+    .. [2]  Fred L. Bookstein (1989):
+            Principal warps: thin-plate splines and the decomposition of
+            deformations. IEEE Transactions on
+            Pattern Analysis and Machine Intelligence.
+            http://doi.org/10.1109/34.24792
+
+    Examples
+    --------
+    Here, we'll implement an identity transform using a thin plate spline
+    transform. First we'll create the destination control point offsets. To
+    make everything invariant to the shape of the image, the x and y range
+    of the image is normalized to [-1, 1] as in ref [1]_. To replicate an
+    identity transform, we'll set the bias to have all offsets be 0. More
+    complicated transformations can easily be implemented using different x
+    and y offsets (importantly, each control point can have its own pair of
+    offsets).
+
+    >>> import numpy as np
+    >>> import lasagne
+    >>>
+    >>> # Create the network
+    >>> # we'll initialize the weights and biases to zero, so it starts
+    >>> # as the identity transform (all control point offsets are zero)
+    >>> W = b = lasagne.init.Constant(0.0)
+    >>>
+    >>> # Set the number of points
+    >>> num_points = 16
+    >>>
+    >>> l_in = lasagne.layers.InputLayer((None, 3, 28, 28))
+    >>> l_loc = lasagne.layers.DenseLayer(l_in, num_units=2*num_points,
+    ...                                   W=W, b=b, nonlinearity=None)
+    >>> l_trans = lasagne.layers.TPSTransformerLayer(l_in, l_loc,
+    ...                                          control_points=num_points)
+    """
+
+    def __init__(self, incoming, localization_network, downsample_factor=1,
+                 control_points=16, precompute_grid='auto', **kwargs):
+        super(TPSTransformerLayer, self).__init__(
+                [incoming, localization_network], **kwargs)
+
+        self.downsample_factor = as_tuple(downsample_factor, 2)
+        self.control_points = control_points
+
+        input_shp, loc_shp = self.input_shapes
+
+        # Error checking
+        if loc_shp[-1] != 2 * control_points or len(loc_shp) != 2:
+            raise ValueError("The localization network must have "
+                             "output shape: (batch_size, "
+                             "2*control_points)")
+
+        if round(np.sqrt(control_points)) != np.sqrt(
+                control_points):
+            raise ValueError("The number of control points must be"
+                             " a perfect square.")
+
+        if len(input_shp) != 4:
+            raise ValueError("The input network must have a 4-dimensional "
+                             "output shape: (batch_size, num_input_channels, "
+                             "input_rows, input_columns)")
+
+        # Process precompute grid
+        can_precompute_grid = all(s is not None for s in input_shp[2:])
+        if precompute_grid == 'auto':
+            precompute_grid = can_precompute_grid
+        elif precompute_grid and not can_precompute_grid:
+            raise ValueError("Grid can only be precomputed if the input "
+                             "height and width are pre-specified.")
+        self.precompute_grid = precompute_grid
+
+        # Create source points and L matrix
+        self.right_mat, self.L_inv, self.source_points, self.out_height, \
+            self.out_width = _initialize_tps(
+                control_points, input_shp, self.downsample_factor,
+                precompute_grid)
+
+    def get_output_shape_for(self, input_shapes):
+        shape = input_shapes[0]
+        factors = self.downsample_factor
+        return (shape[:2] + tuple(None if s is None else int(s / f)
+                                  for s, f in zip(shape[2:], factors)))
+
+    def get_output_for(self, inputs, **kwargs):
+        # see eq. (1) and sec 3.1 in [1]
+        # Get input and destination control points
+        input, dest_offsets = inputs
+        return _transform_thin_plate_spline(
+                dest_offsets, input, self.right_mat, self.L_inv,
+                self.source_points, self.out_height, self.out_width,
+                self.precompute_grid, self.downsample_factor)
+
+
+def _transform_thin_plate_spline(
+        dest_offsets, input, right_mat, L_inv, source_points, out_height,
+        out_width, precompute_grid, downsample_factor):
+
+    num_batch, num_channels, height, width = input.shape
+    num_control_points = source_points.shape[1]
+
+    # reshape destination offsets to be (num_batch, 2, num_control_points)
+    # and add to source_points
+    dest_points = source_points + T.reshape(
+            dest_offsets, (num_batch, 2, num_control_points))
+
+    # Solve as in ref [2]
+    coefficients = T.dot(dest_points, L_inv[:, 3:].T)
+
+    if precompute_grid:
+
+        # Transform each point on the source grid (image_size x image_size)
+        right_mat = T.tile(right_mat.dimshuffle('x', 0, 1), (num_batch, 1, 1))
+        transformed_points = T.batched_dot(coefficients, right_mat)
+
+    else:
+
+        # Transformed grid
+        out_height = T.cast(height / downsample_factor[0], 'int64')
+        out_width = T.cast(width / downsample_factor[1], 'int64')
+        orig_grid = _meshgrid(out_height, out_width)
+        orig_grid = orig_grid[0:2, :]
+        orig_grid = T.tile(orig_grid, (num_batch, 1, 1))
+
+        # Transform each point on the source grid (image_size x image_size)
+        transformed_points = _get_transformed_points_tps(
+                orig_grid, source_points, coefficients, num_control_points,
+                num_batch)
+
+    # Get out new points
+    x_transformed = transformed_points[:, 0].flatten()
+    y_transformed = transformed_points[:, 1].flatten()
+
+    # dimshuffle input to  (bs, height, width, channels)
+    input_dim = input.dimshuffle(0, 2, 3, 1)
+    input_transformed = _interpolate(
+            input_dim, x_transformed, y_transformed,
+            out_height, out_width)
+
+    output = T.reshape(input_transformed,
+                       (num_batch, out_height, out_width, num_channels))
+    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
+    return output
+
+
+def _get_transformed_points_tps(new_points, source_points, coefficients,
+                                num_points, batch_size):
+    """
+    Calculates the transformed points' value using the provided coefficients
+
+    :param new_points: num_batch x 2 x num_to_transform tensor
+    :param source_points: 2 x num_points array of source points
+    :param coefficients: coefficients (should be shape (num_batch, 2,
+        control_points + 3))
+    :param num_points: the number of points
+
+    :return: the x and y coordinates of each transformed point. Shape (
+        num_batch, 2, num_to_transform)
+    """
+
+    # Calculate the U function for the new point and each source point as in
+    # ref [2]
+    # The U function is simply U(r) = r^2 * log(r^2), where r^2 is the
+    # squared distance
+
+    # Calculate the squared dist between the new point and the source points
+    to_transform = new_points.dimshuffle(0, 'x', 1, 2)
+    stacked_transform = T.tile(to_transform, (1, num_points, 1, 1))
+    r_2 = T.sum(((stacked_transform - source_points.dimshuffle(
+            'x', 1, 0, 'x')) ** 2), axis=2)
+
+    # Take the product (r^2 * log(r^2)), being careful to avoid NaNs where
+    # r^2 is zero (as in the precomputed NumPy path in _initialize_tps)
+    log_r_2 = T.log(r_2)
+    distances = T.switch(T.isinf(log_r_2), 0., r_2 * log_r_2)
+
+    # Add in the coefficients for the affine translation (1, x, and y,
+    # corresponding to a_1, a_x, and a_y)
+    upper_array = T.concatenate([T.ones((batch_size, 1, new_points.shape[2]),
+                                        dtype=theano.config.floatX),
+                                 new_points], axis=1)
+    right_mat = T.concatenate([upper_array, distances], axis=1)
+
+    # Calculate the new value as the dot product
+    new_value = T.batched_dot(coefficients, right_mat)
+    return new_value
+
+
+def _U_func_numpy(x1, y1, x2, y2):
+    """
+    Function which implements the U function from Bookstein paper
+    :param x1: x coordinate of the first point
+    :param y1: y coordinate of the first point
+    :param x2: x coordinate of the second point
+    :param y2: y coordinate of the second point
+    :return: the value of U(r) = r^2 * log(r^2) for the two points
+    """
+
+    # Return zero if same point
+    if x1 == x2 and y1 == y2:
+        return 0.
+
+    # Calculate the squared Euclidean norm (r^2)
+    r_2 = (x2 - x1) ** 2 + (y2 - y1) ** 2
+
+    # Return the value U(r) = r^2 * log(r^2)
+    return r_2 * np.log(r_2)
+
+
+def _initialize_tps(num_control_points, input_shape, downsample_factor,
+                    precompute_grid):
+    """
+    Initializes the thin plate spline calculation by creating the source
+    point array and the inverted L matrix used for calculating the
+    transformations as in ref [2]_
+
+    :param num_control_points: the number of control points. Must be a
+        perfect square. Points will be used to generate an evenly spaced grid.
+    :param input_shape: tuple with 4 elements specifying the input shape
+    :param downsample_factor: tuple with 2 elements specifying the
+        downsample for the height and width, respectively
+    :param precompute_grid: boolean specifying whether to precompute the
+        grid matrix
+    :return:
+        right_mat: shape (num_control_points + 3, out_height*out_width) tensor
+        L_inv: shape (num_control_points + 3, num_control_points + 3) tensor
+        source_points: shape (2, num_control_points) tensor
+        out_height: tensor constant specifying the output height
+        out_width: tensor constant specifying the output width
+
+    """
+
+    # break out input_shape
+    _, _, height, width = input_shape
+
+    # Create source grid
+    grid_size = np.sqrt(num_control_points)
+    x_control_source, y_control_source = np.meshgrid(
+        np.linspace(-1, 1, grid_size),
+        np.linspace(-1, 1, grid_size))
+
+    # Create 2 x num_points array of source points
+    source_points = np.vstack(
+            (x_control_source.flatten(), y_control_source.flatten()))
+
+    # Convert to floatX
+    source_points = source_points.astype(theano.config.floatX)
+
+    # Get number of equations
+    num_equations = num_control_points + 3
+
+    # Initialize L to be num_equations square matrix
+    L = np.zeros((num_equations, num_equations), dtype=theano.config.floatX)
+
+    # Create P matrix components
+    L[0, 3:num_equations] = 1.
+    L[1:3, 3:num_equations] = source_points
+    L[3:num_equations, 0] = 1.
+    L[3:num_equations, 1:3] = source_points.T
+
+    # Loop through each pair of points and create the K matrix
+    for point_1 in range(num_control_points):
+        for point_2 in range(point_1, num_control_points):
+
+            L[point_1 + 3, point_2 + 3] = _U_func_numpy(
+                    source_points[0, point_1], source_points[1, point_1],
+                    source_points[0, point_2], source_points[1, point_2])
+
+            if point_1 != point_2:
+                L[point_2 + 3, point_1 + 3] = L[point_1 + 3, point_2 + 3]
+
+    # Invert
+    L_inv = np.linalg.inv(L)
+
+    if precompute_grid:
+        # Construct grid
+        out_height = np.array(height / downsample_factor[0]).astype('int64')
+        out_width = np.array(width / downsample_factor[1]).astype('int64')
+        x_t, y_t = np.meshgrid(np.linspace(-1, 1, out_width),
+                               np.linspace(-1, 1, out_height))
+        ones = np.ones(np.prod(x_t.shape))
+        orig_grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
+        orig_grid = orig_grid[0:2, :]
+        orig_grid = orig_grid.astype(theano.config.floatX)
+
+        # Construct right mat
+
+        # First Calculate the U function for the new point and each source
+        # point as in ref [2]
+        # The U function is simply U(r) = r^2 * log(r^2), where r^2 is the
+        # squared distance
+        to_transform = orig_grid[:, :, np.newaxis].transpose(2, 0, 1)
+        stacked_transform = np.tile(to_transform, (num_control_points, 1, 1))
+        stacked_source_points = \
+            source_points[:, :, np.newaxis].transpose(1, 0, 2)
+        r_2 = np.sum((stacked_transform - stacked_source_points) ** 2, axis=1)
+
+        # Take the product (r^2 * log(r^2)), being careful to avoid NaNs
+        log_r_2 = np.log(r_2)
+        log_r_2[np.isinf(log_r_2)] = 0.
+        distances = r_2 * log_r_2
+
+        # Add in the coefficients for the affine translation (1, x, and y,
+        # corresponding to a_1, a_x, and a_y)
+        upper_array = np.ones(shape=(1, orig_grid.shape[1]),
+                              dtype=theano.config.floatX)
+        upper_array = np.concatenate([upper_array, orig_grid], axis=0)
+        right_mat = np.concatenate([upper_array, distances], axis=0)
+
+        # Convert to tensors
+        out_height = T.as_tensor_variable(out_height)
+        out_width = T.as_tensor_variable(out_width)
+        right_mat = T.as_tensor_variable(right_mat)
+
+    else:
+        out_height = None
+        out_width = None
+        right_mat = None
+
+    # Convert to tensors
+    L_inv = T.as_tensor_variable(L_inv)
+    source_points = T.as_tensor_variable(source_points)
+
+    return right_mat, L_inv, source_points, out_height, out_width
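+
+# A quick NumPy sketch of the system assembled above (illustrative only): for a
+# 2x2 control grid the L matrix is (K+3) x (K+3) with a symmetric K block, and
+# it is invertible, so the spline coefficients are well defined.
+# >>> import numpy as np
+# >>> pts = np.array([[-1., -1., 1., 1.], [-1., 1., -1., 1.]])  # 2 x K sources
+# >>> K = pts.shape[1]
+# >>> L = np.zeros((K + 3, K + 3))
+# >>> L[0, 3:] = 1.; L[1:3, 3:] = pts; L[3:, 0] = 1.; L[3:, 1:3] = pts.T
+# >>> for i in range(K):
+# ...     for j in range(K):
+# ...         r2 = ((pts[:, i] - pts[:, j]) ** 2).sum()
+# ...         L[i + 3, j + 3] = r2 * np.log(r2) if r2 else 0.
+# >>> np.allclose(L[3:, 3:], L[3:, 3:].T)
+# True
+# >>> int(np.linalg.matrix_rank(L)) == K + 3
+# True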
+
+
+class ParametricRectifierLayer(Layer):
+    """
+    lasagne.layers.ParametricRectifierLayer(incoming,
+    alpha=init.Constant(0.25), shared_axes='auto', **kwargs)
+
+    A layer that applies parametric rectify nonlinearity to its input
+    following [1]_.
+
+    Equation for the parametric rectifier linear unit:
+    :math:`\\varphi(x) = \\max(x,0) + \\alpha \\min(x,0)`
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    alpha : Theano shared variable, expression, numpy array or callable
+        Initial value, expression or initializer for the alpha values. The
+        shape must match the incoming shape, skipping those axes the alpha
+        values are shared over (see the example below).
+        See :func:`lasagne.utils.create_param` for more information.
+
+    shared_axes : 'auto', 'all', int or tuple of int
+        The axes along which the parameters of the rectifier units are
+        going to be shared. If ``'auto'`` (the default), share over all axes
+        except for the second - this will share the parameter over the
+        minibatch dimension for dense layers, and additionally over all
+        spatial dimensions for convolutional layers. If ``'all'``, share over
+        all axes, which corresponds to a single scalar parameter.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    References
+    ----------
+    .. [1] K He, X Zhang et al. (2015):
+       Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+       ImageNet Classification,
+       http://arxiv.org/abs/1502.01852
+
+    Notes
+    -----
+    The alpha parameter dimensionality is the input dimensionality minus the
+    number of axes it is shared over, which matches the convention of
+    :class:`BiasLayer`. For example:
+
+    >>> layer = ParametricRectifierLayer((20, 3, 28, 28), shared_axes=(0, 3))
+    >>> layer.alpha.get_value().shape
+    (3, 28)
+    """
+    def __init__(self, incoming, alpha=init.Constant(0.25), shared_axes='auto',
+                 **kwargs):
+        super(ParametricRectifierLayer, self).__init__(incoming, **kwargs)
+        if shared_axes == 'auto':
+            self.shared_axes = (0,) + tuple(range(2, len(self.input_shape)))
+        elif shared_axes == 'all':
+            self.shared_axes = tuple(range(len(self.input_shape)))
+        elif isinstance(shared_axes, int):
+            self.shared_axes = (shared_axes,)
+        else:
+            self.shared_axes = shared_axes
+
+        shape = [size for axis, size in enumerate(self.input_shape)
+                 if axis not in self.shared_axes]
+        if any(size is None for size in shape):
+            raise ValueError("ParametricRectifierLayer needs input sizes for "
+                             "all axes that alpha's are not shared over.")
+        self.alpha = self.add_param(alpha, shape, name="alpha",
+                                    regularizable=False)
+
+    def get_output_for(self, input, **kwargs):
+        axes = iter(range(self.alpha.ndim))
+        pattern = ['x' if input_axis in self.shared_axes
+                   else next(axes)
+                   for input_axis in range(input.ndim)]
+        alpha = self.alpha.dimshuffle(pattern)
+        return theano.tensor.nnet.relu(input, alpha)
+
+
+def prelu(layer, **kwargs):
+    """
+    Convenience function to apply a parametric rectifier to a given layer's
+    output. If the layer has a nonlinearity, it is set to the identity and the
+    parametric rectifier is applied instead.
+
+    Parameters
+    ----------
+    layer: a :class:`Layer` instance
+        The `Layer` instance to apply the parametric rectifier layer to;
+        note that it will be irreversibly modified as specified above
+
+    **kwargs
+        Any additional keyword arguments are passed to the
+        :class:`ParametricRectifierLayer`
+
+    Examples
+    --------
+    Note that this function modifies an existing layer, like this:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, prelu
+    >>> layer = InputLayer((32, 100))
+    >>> layer = DenseLayer(layer, num_units=200)
+    >>> layer = prelu(layer)
+
+    In particular, :func:`prelu` can *not* be passed as a nonlinearity.
+    """
+    nonlinearity = getattr(layer, 'nonlinearity', None)
+    if nonlinearity is not None:
+        layer.nonlinearity = nonlinearities.identity
+    return ParametricRectifierLayer(layer, **kwargs)
+
+
+class RandomizedRectifierLayer(Layer):
+    """
+    A layer that applies a randomized leaky rectify nonlinearity to its input.
+
+    The randomized leaky rectifier was first proposed and used in the Kaggle
+    NDSB Competition, and later evaluated in [1]_. Compared to the standard
+    leaky rectifier :func:`leaky_rectify`, it has a randomly sampled slope
+    for negative input during training, and a fixed slope during evaluation.
+
+    Equation for the randomized rectifier linear unit during training:
+    :math:`\\varphi(x) = \\max((\\sim U(lower, upper)) \\cdot x, x)`
+
+    During evaluation, the factor is fixed to the arithmetic mean of `lower`
+    and `upper`.
+
+    Parameters
+    ----------
+    incoming : a :class:`Layer` instance or a tuple
+        The layer feeding into this layer, or the expected input shape
+
+    lower : Theano shared variable, expression, or constant
+        The lower bound for the randomly chosen slopes.
+
+    upper : Theano shared variable, expression, or constant
+        The upper bound for the randomly chosen slopes.
+
+    shared_axes : 'auto', 'all', int or tuple of int
+        The axes along which the random slopes of the rectifier units are
+        going to be shared. If ``'auto'`` (the default), share over all axes
+        except for the second - this will share the random slope over the
+        minibatch dimension for dense layers, and additionally over all
+        spatial dimensions for convolutional layers. If ``'all'``, share over
+        all axes, thus using a single random slope.
+
+    **kwargs
+        Any additional keyword arguments are passed to the `Layer` superclass.
+
+    References
+    ----------
+    .. [1] Bing Xu, Naiyan Wang et al. (2015):
+       Empirical Evaluation of Rectified Activations in Convolutional Network,
+       http://arxiv.org/abs/1505.00853
+    """
+    def __init__(self, incoming, lower=0.3, upper=0.8, shared_axes='auto',
+                 **kwargs):
+        super(RandomizedRectifierLayer, self).__init__(incoming, **kwargs)
+        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
+        self.lower = lower
+        self.upper = upper
+
+        if not isinstance(lower > upper, theano.Variable) and lower > upper:
+            raise ValueError("Upper bound for RandomizedRectifierLayer needs "
+                             "to be higher than lower bound.")
+
+        if shared_axes == 'auto':
+            self.shared_axes = (0,) + tuple(range(2, len(self.input_shape)))
+        elif shared_axes == 'all':
+            self.shared_axes = tuple(range(len(self.input_shape)))
+        elif isinstance(shared_axes, int):
+            self.shared_axes = (shared_axes,)
+        else:
+            self.shared_axes = shared_axes
+
+    def get_output_for(self, input, deterministic=False, **kwargs):
+        """
+        Parameters
+        ----------
+        input : tensor
+            output from the previous layer
+        deterministic : bool
+            If ``True``, the arithmetic mean of `lower` and `upper` is used
+            as the leaky slope.
+        """
+        if deterministic or self.upper == self.lower:
+            return theano.tensor.nnet.relu(input, (self.upper+self.lower)/2.0)
+        else:
+            shape = list(self.input_shape)
+            if any(s is None for s in shape):
+                shape = list(input.shape)
+            for ax in self.shared_axes:
+                shape[ax] = 1
+
+            rnd = self._srng.uniform(tuple(shape),
+                                     low=self.lower,
+                                     high=self.upper,
+                                     dtype=theano.config.floatX)
+            rnd = theano.tensor.addbroadcast(rnd, *self.shared_axes)
+            return theano.tensor.nnet.relu(input, rnd)
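+
+# The behaviour above in plain NumPy (illustrative only): during training each
+# unit's negative part is scaled by a slope drawn from U(lower, upper); at test
+# time the fixed slope (lower + upper) / 2 is used instead.
+# >>> import numpy as np
+# >>> x, lower, upper = np.array([-2., -1., 3.]), 0.3, 0.8
+# >>> slope = np.random.RandomState(0).uniform(lower, upper, size=x.shape)
+# >>> train_out = np.where(x > 0, x, slope * x)        # random per call
+# >>> test_out = np.where(x > 0, x, (lower + upper) / 2.0 * x)
+# >>> np.allclose(test_out, [-1.1, -0.55, 3.0])
+# True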
+
+
+def rrelu(layer, **kwargs):
+    """
+    Convenience function to apply a randomized rectifier to a given layer's
+    output. If the layer has a nonlinearity, it is set to the identity and the
+    randomized rectifier is applied instead.
+
+    Parameters
+    ----------
+    layer: a :class:`Layer` instance
+        The `Layer` instance to apply the randomized rectifier layer to;
+        note that it will be irreversibly modified as specified above
+
+    **kwargs
+        Any additional keyword arguments are passed to the
+        :class:`RandomizedRectifierLayer`
+
+    Examples
+    --------
+    Note that this function modifies an existing layer, like this:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer, rrelu
+    >>> layer = InputLayer((32, 100))
+    >>> layer = DenseLayer(layer, num_units=200)
+    >>> layer = rrelu(layer)
+
+    In particular, :func:`rrelu` can *not* be passed as a nonlinearity.
+    """
+    nonlinearity = getattr(layer, 'nonlinearity', None)
+    if nonlinearity is not None:
+        layer.nonlinearity = nonlinearities.identity
+    return RandomizedRectifierLayer(layer, **kwargs)
diff --git a/lasagne/nonlinearities.py b/lasagne/nonlinearities.py
new file mode 100644
index 0000000..b734bac
--- /dev/null
+++ b/lasagne/nonlinearities.py
@@ -0,0 +1,305 @@
+# -*- coding: utf-8 -*-
+"""
+Non-linear activation functions for artificial neurons.
+"""
+
+import theano.tensor
+
+
+# sigmoid
+def sigmoid(x):
+    """Sigmoid activation function :math:`\\varphi(x) = \\frac{1}{1 + e^{-x}}`
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32 in [0, 1]
+        The output of the sigmoid function applied to the activation.
+    """
+    return theano.tensor.nnet.sigmoid(x)
+
+
+# softmax (row-wise)
+def softmax(x):
+    """Softmax activation function
+    :math:`\\varphi(\\mathbf{x})_j =
+    \\frac{e^{\\mathbf{x}_j}}{\\sum_{k=1}^K e^{\\mathbf{x}_k}}`
+    where :math:`K` is the total number of neurons in the layer. This
+    activation function gets applied row-wise.
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32 where the sum of the row is 1 and each single value is in [0, 1]
+        The output of the softmax function applied to the activation.
+    """
+    return theano.tensor.nnet.softmax(x)
+
+
+# tanh
+def tanh(x):
+    """Tanh activation function :math:`\\varphi(x) = \\tanh(x)`
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32 in [-1, 1]
+        The output of the tanh function applied to the activation.
+    """
+    return theano.tensor.tanh(x)
+
+
+# scaled tanh
+class ScaledTanH(object):
+    """Scaled tanh :math:`\\varphi(x) = \\tanh(\\alpha \\cdot x) \\cdot \\beta`
+
+    This is a modified tanh function which allows rescaling both the input
+    and the output of the activation.
+
+    Scaling the input down will result in decreasing the maximum slope of the
+    tanh and as a result it will be in the linear regime in a larger interval
+    of the input space. Scaling the input up will increase the maximum slope
+    of the tanh and thus bring it closer to a step function.
+
+    Scaling the output changes the output interval to :math:`[-\\beta,\\beta]`.
+
+    Parameters
+    ----------
+    scale_in : float32
+        The scale parameter :math:`\\alpha` for the input
+
+    scale_out : float32
+        The scale parameter :math:`\\beta` for the output
+
+    Methods
+    -------
+    __call__(x)
+        Apply the scaled tanh function to the activation `x`.
+
+    Examples
+    --------
+    In contrast to other activation functions in this module, this is
+    a class that needs to be instantiated to obtain a callable:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((None, 100))
+    >>> from lasagne.nonlinearities import ScaledTanH
+    >>> scaled_tanh = ScaledTanH(scale_in=0.5, scale_out=2.27)
+    >>> l1 = DenseLayer(l_in, num_units=200, nonlinearity=scaled_tanh)
+
+    Notes
+    -----
+    LeCun et al. (in [1]_, Section 4.4) suggest ``scale_in=2./3`` and
+    ``scale_out=1.7159``, which has :math:`\\varphi(\\pm 1) = \\pm 1`,
+    maximum second derivative at 1, and an effective gain close to 1.
+
+    By carefully matching :math:`\\alpha` and :math:`\\beta`, the nonlinearity
+    can also be tuned to preserve the mean and variance of its input:
+
+      * ``scale_in=0.5``, ``scale_out=2.4``: If the input is a random normal
+        variable, the output will have zero mean and unit variance.
+      * ``scale_in=1``, ``scale_out=1.6``: Same property, but with a smaller
+        linear regime in input space.
+      * ``scale_in=0.5``, ``scale_out=2.27``: If the input is a uniform random
+        variable, the output will have zero mean and unit variance.
+      * ``scale_in=1``, ``scale_out=1.48``: Same property, but with a smaller
+        linear regime in input space.
+
+    References
+    ----------
+    .. [1] LeCun, Yann A., et al. (1998):
+       Efficient BackProp,
+       http://link.springer.com/chapter/10.1007/3-540-49430-8_2,
+       http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+    .. [2] Masci, Jonathan, et al. (2011):
+       Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction,
+       http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7,
+       http://people.idsia.ch/~ciresan/data/icann2011.pdf
+    """
+
+    def __init__(self, scale_in=1, scale_out=1):
+        self.scale_in = scale_in
+        self.scale_out = scale_out
+
+    def __call__(self, x):
+        return theano.tensor.tanh(x * self.scale_in) * self.scale_out
+
+
+ScaledTanh = ScaledTanH  # alias with alternative capitalization
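+
+# A quick Monte-Carlo check of the note above (illustrative only): with
+# scale_in=0.5 and scale_out=2.4, a standard normal input keeps roughly zero
+# mean and unit variance after the nonlinearity.
+# >>> import numpy as np
+# >>> x = np.random.RandomState(42).randn(100000)
+# >>> y = np.tanh(0.5 * x) * 2.4
+# >>> bool(abs(y.mean()) < 0.02), bool(abs(y.var() - 1.0) < 0.05)
+# (True, True)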
+
+
+# rectify
+def rectify(x):
+    """Rectify activation function :math:`\\varphi(x) = \\max(0, x)`
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32
+        The output of the rectify function applied to the activation.
+    """
+    return theano.tensor.nnet.relu(x)
+
+
+# leaky rectify
+class LeakyRectify(object):
+    """Leaky rectifier :math:`\\varphi(x) = \\max(\\alpha \\cdot x, x)`
+
+    The leaky rectifier was introduced in [1]_. Compared to the standard
+    rectifier :func:`rectify`, it has a nonzero gradient for negative input,
+    which often helps convergence.
+
+    Parameters
+    ----------
+    leakiness : float
+        Slope for negative input, usually between 0 and 1.
+        A leakiness of 0 will lead to the standard rectifier,
+        a leakiness of 1 will lead to a linear activation function,
+        and any value in between will give a leaky rectifier.
+
+    Methods
+    -------
+    __call__(x)
+        Apply the leaky rectify function to the activation `x`.
+
+    Examples
+    --------
+    In contrast to other activation functions in this module, this is
+    a class that needs to be instantiated to obtain a callable:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> l_in = InputLayer((None, 100))
+    >>> from lasagne.nonlinearities import LeakyRectify
+    >>> custom_rectify = LeakyRectify(0.1)
+    >>> l1 = DenseLayer(l_in, num_units=200, nonlinearity=custom_rectify)
+
+    Alternatively, you can use the provided instance for leakiness=0.01:
+
+    >>> from lasagne.nonlinearities import leaky_rectify
+    >>> l2 = DenseLayer(l_in, num_units=200, nonlinearity=leaky_rectify)
+
+    Or the one for a high leakiness of 1/3:
+
+    >>> from lasagne.nonlinearities import very_leaky_rectify
+    >>> l3 = DenseLayer(l_in, num_units=200, nonlinearity=very_leaky_rectify)
+
+    See Also
+    --------
+    leaky_rectify: Instance with default leakiness of 0.01, as in [1]_.
+    very_leaky_rectify: Instance with high leakiness of 1/3, as in [2]_.
+
+    References
+    ----------
+    .. [1] Maas et al. (2013):
+       Rectifier Nonlinearities Improve Neural Network Acoustic Models,
+       http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf
+    .. [2] Graham, Benjamin (2014):
+       Spatially-sparse convolutional neural networks,
+       http://arxiv.org/abs/1409.6070
+    """
+    def __init__(self, leakiness=0.01):
+        self.leakiness = leakiness
+
+    def __call__(self, x):
+        return theano.tensor.nnet.relu(x, self.leakiness)
+
+
+leaky_rectify = LeakyRectify()  # shortcut with default leakiness
+leaky_rectify.__doc__ = """leaky_rectify(x)
+
+    Instance of :class:`LeakyRectify` with leakiness :math:`\\alpha=0.01`
+    """
+
+
+very_leaky_rectify = LeakyRectify(1./3)  # shortcut with high leakiness
+very_leaky_rectify.__doc__ = """very_leaky_rectify(x)
+
+    Instance of :class:`LeakyRectify` with leakiness :math:`\\alpha=1/3`
+    """
+
+
+# elu
+def elu(x):
+    """Exponential Linear Unit :math:`\\varphi(x) = (x > 0) ? x : e^x - 1`
+
+    The Exponential Linear Unit (ELU) was introduced in [1]_. Compared to the
+    linear rectifier :func:`rectify`, it has a mean activation closer to zero
+    and nonzero gradient for negative input, which can help convergence.
+    Compared to the leaky rectifier :class:`LeakyRectify`, it saturates for
+    highly negative inputs.
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32
+        The output of the exponential linear unit for the activation.
+
+    Notes
+    -----
+    In [1]_, an additional parameter :math:`\\alpha` controls the (negative)
+    saturation value for negative inputs, but is set to 1 for all experiments.
+    It is omitted here.
+
+    References
+    ----------
+    .. [1] Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter (2015):
+       Fast and Accurate Deep Network Learning by Exponential Linear Units
+       (ELUs), http://arxiv.org/abs/1511.07289
+    """
+    return theano.tensor.switch(x > 0, x, theano.tensor.exp(x) - 1)
+
+
+# softplus
+def softplus(x):
+    """Softplus activation function :math:`\\varphi(x) = \\log(1 + e^x)`
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32
+        The output of the softplus function applied to the activation.
+    """
+    return theano.tensor.nnet.softplus(x)
+
+
+# linear
+def linear(x):
+    """Linear activation function :math:`\\varphi(x) = x`
+
+    Parameters
+    ----------
+    x : float32
+        The activation (the summed, weighted input of a neuron).
+
+    Returns
+    -------
+    float32
+        The output of the identity applied to the activation.
+    """
+    return x
+
+identity = linear
diff --git a/lasagne/objectives.py b/lasagne/objectives.py
new file mode 100644
index 0000000..5da3f16
--- /dev/null
+++ b/lasagne/objectives.py
@@ -0,0 +1,379 @@
+"""
+Provides some minimal help with building loss expressions for training or
+validating a neural network.
+
+Five functions build element- or item-wise loss expressions from network
+predictions and targets:
+
+.. autosummary::
+    :nosignatures:
+
+    binary_crossentropy
+    categorical_crossentropy
+    squared_error
+    binary_hinge_loss
+    multiclass_hinge_loss
+
+A convenience function aggregates such losses into a scalar expression
+suitable for differentiation:
+
+.. autosummary::
+    :nosignatures:
+
+    aggregate
+
+Note that these functions only serve to write more readable code, but are
+completely optional. Essentially, any differentiable scalar Theano expression
+can be used as a training objective.
+
+Finally, two functions compute evaluation measures that are useful for
+validation and testing only, not for training:
+
+.. autosummary::
+   :nosignatures:
+
+   binary_accuracy
+   categorical_accuracy
+
+Those can also be aggregated into a scalar expression if needed.
+
+Examples
+--------
+Assuming you have a simple neural network for 3-way classification:
+
+>>> from lasagne.layers import InputLayer, DenseLayer, get_output
+>>> from lasagne.nonlinearities import softmax, rectify
+>>> l_in = InputLayer((100, 20))
+>>> l_hid = DenseLayer(l_in, num_units=30, nonlinearity=rectify)
+>>> l_out = DenseLayer(l_hid, num_units=3, nonlinearity=softmax)
+
+And Theano variables representing your network input and targets:
+
+>>> import theano
+>>> data = theano.tensor.matrix('data')
+>>> targets = theano.tensor.matrix('targets')
+
+You'd first construct an element-wise loss expression:
+
+>>> from lasagne.objectives import categorical_crossentropy, aggregate
+>>> predictions = get_output(l_out, data)
+>>> loss = categorical_crossentropy(predictions, targets)
+
+Then aggregate it into a scalar (you could also just call ``mean()`` on it):
+
+>>> loss = aggregate(loss, mode='mean')
+
+Finally, this gives a loss expression you can pass to any of the update
+methods in :mod:`lasagne.updates`. For validation of a network, you will
+usually want to repeat these steps with deterministic network output, i.e.,
+without dropout or any other nondeterministic computation in between:
+
+>>> test_predictions = get_output(l_out, data, deterministic=True)
+>>> test_loss = categorical_crossentropy(test_predictions, targets)
+>>> test_loss = aggregate(test_loss)
+
+This gives a loss expression good for monitoring validation error.
+"""
+
+import theano.tensor.nnet
+
+from lasagne.layers import get_output
+
+__all__ = [
+    "binary_crossentropy",
+    "categorical_crossentropy",
+    "squared_error",
+    "aggregate",
+    "binary_hinge_loss",
+    "multiclass_hinge_loss",
+    "binary_accuracy",
+    "categorical_accuracy"
+]
+
+
+def binary_crossentropy(predictions, targets):
+    """Computes the binary cross-entropy between predictions and targets.
+
+    .. math:: L = -t \\log(p) - (1 - t) \\log(1 - p)
+
+    Parameters
+    ----------
+    predictions : Theano tensor
+        Predictions in (0, 1), such as sigmoidal output of a neural network.
+    targets : Theano tensor
+        Targets in [0, 1], such as ground truth labels.
+
+    Returns
+    -------
+    Theano tensor
+        An expression for the element-wise binary cross-entropy.
+
+    Notes
+    -----
+    This is the loss function of choice for binary classification problems
+    and sigmoid output units.
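+
+    Examples
+    --------
+    A minimal sketch with symbolic prediction and target matrices:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import binary_crossentropy, aggregate
+    >>> predictions, targets = T.matrix('p'), T.matrix('t')
+    >>> loss = aggregate(binary_crossentropy(predictions, targets))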
+    """
+    return theano.tensor.nnet.binary_crossentropy(predictions, targets)
+
+
+def categorical_crossentropy(predictions, targets):
+    """Computes the categorical cross-entropy between predictions and targets.
+
+    .. math:: L_i = - \\sum_j{t_{i,j} \\log(p_{i,j})}
+
+    Parameters
+    ----------
+    predictions : Theano 2D tensor
+        Predictions in (0, 1), such as softmax output of a neural network,
+        with data points in rows and class probabilities in columns.
+    targets : Theano 2D tensor or 1D tensor
+        Either targets in [0, 1] matching the layout of `predictions`, or
+        a vector of int giving the correct class index per data point.
+
+    Returns
+    -------
+    Theano 1D tensor
+        An expression for the item-wise categorical cross-entropy.
+
+    Notes
+    -----
+    This is the loss function of choice for multi-class classification
+    problems and softmax output units. For hard targets, i.e., targets
+    that assign all of the probability to a single class per data point,
+    providing a vector of int for the targets is usually slightly more
+    efficient than providing a matrix with a single 1.0 per row.
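+
+    Examples
+    --------
+    A minimal sketch with a softmax prediction matrix and integer (hard)
+    class targets:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import categorical_crossentropy
+    >>> predictions, targets = T.matrix('p'), T.ivector('t')
+    >>> loss = categorical_crossentropy(predictions, targets)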
+    """
+    return theano.tensor.nnet.categorical_crossentropy(predictions, targets)
+
+
+def squared_error(a, b):
+    """Computes the element-wise squared difference between two tensors.
+
+    .. math:: L = (p - t)^2
+
+    Parameters
+    ----------
+    a, b : Theano tensor
+        The tensors to compute the squared difference between.
+
+    Returns
+    -------
+    Theano tensor
+        An expression for the element-wise squared difference.
+
+    Notes
+    -----
+    This is the loss function of choice for many regression problems
+    or auto-encoders with linear output units.
+    """
+    return (a - b)**2
+
+
+def aggregate(loss, weights=None, mode='mean'):
+    """Aggregates an element- or item-wise loss to a scalar loss.
+
+    Parameters
+    ----------
+    loss : Theano tensor
+        The loss expression to aggregate.
+    weights : Theano tensor, optional
+        The weights for each element or item, must be broadcastable to
+        the same shape as `loss` if given. If omitted, all elements will
+        be weighted the same.
+    mode : {'mean', 'sum', 'normalized_sum'}
+        Whether to aggregate by averaging, by summing or by summing and
+        dividing by the total weights (which requires `weights` to be given).
+
+    Returns
+    -------
+    Theano scalar
+        A scalar loss expression suitable for differentiation.
+
+    Notes
+    -----
+    By supplying binary weights (i.e., only using values 0 and 1), this
+    function can also be used for masking out particular entries in the
+    loss expression. Note that masked entries still need to be valid
+    values; not-a-numbers (NaNs) will propagate through.
+
+    When applied to batch-wise loss expressions, setting `mode` to
+    ``'normalized_sum'`` ensures that the loss per batch is of a similar
+    magnitude, independent of associated weights. However, it means that
+    a given data point contributes more to the loss when it shares a batch
+    with low-weighted or masked data points than with high-weighted ones.
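+
+    Examples
+    --------
+    A minimal masking sketch, assuming a per-element loss and a binary mask
+    of the same shape:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import aggregate
+    >>> loss, mask = T.matrix('loss'), T.matrix('mask')
+    >>> scalar_loss = aggregate(loss, weights=mask, mode='normalized_sum')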
+    """
+    if weights is not None:
+        loss = loss * weights
+    if mode == 'mean':
+        return loss.mean()
+    elif mode == 'sum':
+        return loss.sum()
+    elif mode == 'normalized_sum':
+        if weights is None:
+            raise ValueError("require weights for mode='normalized_sum'")
+        return loss.sum() / weights.sum()
+    else:
+        raise ValueError("mode must be 'mean', 'sum' or 'normalized_sum', "
+                         "got %r" % mode)
+
+
+def binary_hinge_loss(predictions, targets, binary=True, delta=1):
+    """Computes the binary hinge loss between predictions and targets.
+
+    .. math:: L_i = \\max(0, \\delta - t_i p_i)
+
+    Parameters
+    ----------
+    predictions : Theano tensor
+        Predictions in (0, 1), such as sigmoidal output of a neural network.
+    targets : Theano tensor
+        Targets in {0, 1} (or in {-1, 1} depending on `binary`), such as
+        ground truth labels.
+    binary : bool, default True
+        ``True`` if targets are in {0, 1}, ``False`` if they are in {-1, 1}
+    delta : scalar, default 1
+        The hinge loss margin
+
+    Returns
+    -------
+    Theano tensor
+        An expression for the element-wise binary hinge loss
+
+    Notes
+    -----
+    This is an alternative to the binary cross-entropy loss for binary
+    classification problems
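+
+    Examples
+    --------
+    A minimal sketch with sigmoidal predictions and {0, 1} targets:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import binary_hinge_loss
+    >>> predictions, targets = T.matrix('p'), T.matrix('t')
+    >>> loss = binary_hinge_loss(predictions, targets)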
+    """
+    if binary:
+        targets = 2 * targets - 1
+    return theano.tensor.nnet.relu(delta - predictions * targets)
+
+
+def multiclass_hinge_loss(predictions, targets, delta=1):
+    """Computes the multi-class hinge loss between predictions and targets.
+
+    .. math:: L_i = \\max_{j \\not = t_i} (0, p_{i,j} - p_{i,t_i} + \\delta)
+
+    Parameters
+    ----------
+    predictions : Theano 2D tensor
+        Predictions in (0, 1), such as softmax output of a neural network,
+        with data points in rows and class probabilities in columns.
+    targets : Theano 2D tensor or 1D tensor
+        Either a vector of int giving the correct class index per data point
+        or a 2D tensor of one-hot encoding of the correct class in the same
+        layout as predictions (non-binary targets in [0, 1] do not work!)
+    delta : scalar, default 1
+        The hinge loss margin
+
+    Returns
+    -------
+    Theano 1D tensor
+        An expression for the item-wise multi-class hinge loss
+
+    Notes
+    -----
+    This is an alternative to the categorical cross-entropy loss for
+    multi-class classification problems
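+
+    Examples
+    --------
+    A minimal sketch for softmax predictions and integer class targets:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import multiclass_hinge_loss
+    >>> predictions, targets = T.matrix('p'), T.ivector('t')
+    >>> loss = multiclass_hinge_loss(predictions, targets)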
+    """
+    num_cls = predictions.shape[1]
+    if targets.ndim == predictions.ndim - 1:
+        targets = theano.tensor.extra_ops.to_one_hot(targets, num_cls)
+    elif targets.ndim != predictions.ndim:
+        raise TypeError('rank mismatch between targets and predictions')
+    corrects = predictions[targets.nonzero()]
+    rest = theano.tensor.reshape(predictions[(1-targets).nonzero()],
+                                 (-1, num_cls-1))
+    rest = theano.tensor.max(rest, axis=1)
+    return theano.tensor.nnet.relu(rest - corrects + delta)
+
+
+def binary_accuracy(predictions, targets, threshold=0.5):
+    """Computes the binary accuracy between predictions and targets.
+
+    .. math:: L_i = \\mathbb{I}(t_i = \\mathbb{I}(p_i \\ge \\alpha))
+
+    Parameters
+    ----------
+    predictions : Theano tensor
+        Predictions in [0, 1], such as a sigmoidal output of a neural network,
+        giving the probability of the positive class
+    targets : Theano tensor
+        Targets in {0, 1}, such as ground truth labels.
+    threshold : scalar, default: 0.5
+        Predictions at or above this threshold are considered to be of the
+        positive class
+
+    Returns
+    -------
+    Theano tensor
+        An expression for the element-wise binary accuracy in {0, 1}
+
+    Notes
+    -----
+    This objective function should not be used with a gradient calculation;
+    its gradient is zero everywhere. It is intended as a convenience for
+    validation and testing, not training.
+
+    To obtain the average accuracy, call :func:`theano.tensor.mean` on the
+    result, passing ``dtype=theano.config.floatX`` to compute the mean on GPU.
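+
+    Examples
+    --------
+    A minimal sketch, computing the mean accuracy over a batch:
+
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import binary_accuracy
+    >>> predictions, targets = T.matrix('p'), T.matrix('t')
+    >>> acc = binary_accuracy(predictions, targets)
+    >>> mean_acc = acc.mean(dtype=theano.config.floatX)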
+    """
+    predictions = theano.tensor.ge(predictions, threshold)
+    return theano.tensor.eq(predictions, targets)
+
+
+def categorical_accuracy(predictions, targets, top_k=1):
+    """Computes the categorical accuracy between predictions and targets.
+
+    .. math:: L_i = \\mathbb{I}(t_i = \\operatorname{argmax}_c p_{i,c})
+
+    Can be relaxed to allow matches among the top :math:`k` predictions:
+
+    .. math::
+        L_i = \\mathbb{I}(t_i \\in \\operatorname{argsort}_c (-p_{i,c})_{:k})
+
+    Parameters
+    ----------
+    predictions : Theano 2D tensor
+        Predictions in (0, 1), such as softmax output of a neural network,
+        with data points in rows and class probabilities in columns.
+    targets : Theano 2D tensor or 1D tensor
+        Either a vector of int giving the correct class index per data point
+        or a 2D tensor of one-hot encodings of the correct class in the same
+        layout as predictions
+    top_k : int
+        Regard a prediction to be correct if the target class is among the
+        `top_k` largest class probabilities. For the default value of 1, a
+        prediction is correct only if the target class is the most probable.
+
+    Returns
+    -------
+    Theano 1D tensor
+        An expression for the item-wise categorical accuracy in {0, 1}
+
+    Notes
+    -----
+    This function is not differentiable, as it involves an argmax; its
+    gradient is zero everywhere. It should never be used with a gradient
+    calculation. It is intended as a convenience for validation and testing,
+    not training.
+
+    To obtain the average accuracy, call :func:`theano.tensor.mean` on the
+    result, passing ``dtype=theano.config.floatX`` to compute the mean on GPU.
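+
+    Examples
+    --------
+    A minimal sketch of top-5 accuracy for softmax predictions and integer
+    class targets:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.objectives import categorical_accuracy
+    >>> predictions, targets = T.matrix('p'), T.ivector('t')
+    >>> acc = categorical_accuracy(predictions, targets, top_k=5)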
+    """
+    if targets.ndim == predictions.ndim:
+        targets = theano.tensor.argmax(targets, axis=-1)
+    elif targets.ndim != predictions.ndim - 1:
+        raise TypeError('rank mismatch between targets and predictions')
+
+    if top_k == 1:
+        # standard categorical accuracy
+        top = theano.tensor.argmax(predictions, axis=-1)
+        return theano.tensor.eq(top, targets)
+    else:
+        # top-k accuracy
+        top = theano.tensor.argsort(predictions, axis=-1)
+        # (Theano cannot index with [..., -top_k:], so we simulate that)
+        top = top[[slice(None) for _ in range(top.ndim - 1)] +
+                  [slice(-top_k, None)]]
+        targets = theano.tensor.shape_padaxis(targets, axis=-1)
+        return theano.tensor.any(theano.tensor.eq(top, targets), axis=-1)
diff --git a/lasagne/random.py b/lasagne/random.py
new file mode 100644
index 0000000..65a0e70
--- /dev/null
+++ b/lasagne/random.py
@@ -0,0 +1,36 @@
+"""
+A module with a package-wide random number generator,
+used for weight initialization and seeding noise layers.
+This can be replaced by a :class:`numpy.random.RandomState` instance with a
+particular seed to facilitate reproducibility.
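+
+For example, to make weight initializations and noise layers reproducible
+(a minimal sketch):
+
+>>> import numpy as np
+>>> import lasagne.random
+>>> lasagne.random.set_rng(np.random.RandomState(1234))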
+"""
+
+import numpy as np
+
+
+_rng = np.random
+
+
+def get_rng():
+    """Get the package-level random number generator.
+
+    Returns
+    -------
+    :class:`numpy.random.RandomState` instance
+        The :class:`numpy.random.RandomState` instance passed to the most
+        recent call of :func:`set_rng`, or ``numpy.random`` if :func:`set_rng`
+        has never been called.
+    """
+    return _rng
+
+
+def set_rng(new_rng):
+    """Set the package-level random number generator.
+
+    Parameters
+    ----------
+    new_rng : ``numpy.random`` or a :class:`numpy.random.RandomState` instance
+        The random number generator to use.
+    """
+    global _rng
+    _rng = new_rng
diff --git a/lasagne/regularization.py b/lasagne/regularization.py
new file mode 100644
index 0000000..d3672a9
--- /dev/null
+++ b/lasagne/regularization.py
@@ -0,0 +1,189 @@
+"""
+Functions to apply regularization to the weights in a network.
+
+We provide functions to calculate the L1 and L2 penalty. Penalty functions
+take a tensor as input and calculate the penalty contribution from that tensor:
+
+.. autosummary::
+    :nosignatures:
+
+    l1
+    l2
+
+A helper function can be used to apply a penalty function to a tensor or a
+list of tensors:
+
+.. autosummary::
+    :nosignatures:
+
+    apply_penalty
+
+Finally, we provide two helper functions for applying a penalty function to
+the parameters in a layer or the parameters in a group of layers:
+
+.. autosummary::
+    :nosignatures:
+
+    regularize_layer_params_weighted
+    regularize_network_params
+
+Examples
+--------
+>>> import lasagne
+>>> import theano.tensor as T
+>>> import theano
+>>> from lasagne.nonlinearities import softmax
+>>> from lasagne.layers import InputLayer, DenseLayer, get_output
+>>> from lasagne.regularization import regularize_layer_params_weighted, l2, l1
+>>> from lasagne.regularization import regularize_layer_params
+>>> layer_in = InputLayer((100, 20))
+>>> layer1 = DenseLayer(layer_in, num_units=3)
+>>> layer2 = DenseLayer(layer1, num_units=5, nonlinearity=softmax)
+>>> x = T.matrix('x')  # shp: num_batch x num_features
+>>> y = T.ivector('y') # shp: num_batch
+>>> l_out = get_output(layer2, x)
+>>> loss = T.mean(T.nnet.categorical_crossentropy(l_out, y))
+>>> layers = {layer1: 0.1, layer2: 0.5}
+>>> l2_penalty = regularize_layer_params_weighted(layers, l2)
+>>> l1_penalty = regularize_layer_params(layer2, l1) * 1e-4
+>>> loss = loss + l2_penalty + l1_penalty
+"""
+import theano.tensor as T
+from .layers import Layer, get_all_params
+
+
+def l1(x):
+    """Computes the L1 norm of a tensor
+
+    Parameters
+    ----------
+    x : Theano tensor
+
+    Returns
+    -------
+    Theano scalar
+        l1 norm (sum of absolute values of elements)
+    """
+    return T.sum(abs(x))
+
+
+def l2(x):
+    """Computes the squared L2 norm of a tensor
+
+    Parameters
+    ----------
+    x : Theano tensor
+
+    Returns
+    -------
+    Theano scalar
+        squared l2 norm (sum of squared values of elements)
+    """
+    return T.sum(x**2)
+
+
+def apply_penalty(tensor_or_tensors, penalty, **kwargs):
+    """
+    Computes the total cost for applying a specified penalty
+    to a tensor or group of tensors.
+
+    Parameters
+    ----------
+    tensor_or_tensors : Theano tensor or list of tensors
+    penalty : callable
+    **kwargs
+        keyword arguments passed to penalty.
+
+    Returns
+    -------
+    Theano scalar
+        a scalar expression for the total penalty cost
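+
+    Examples
+    --------
+    A minimal sketch, penalizing two symbolic tensors with the L2 penalty:
+
+    >>> import theano.tensor as T
+    >>> from lasagne.regularization import apply_penalty, l2
+    >>> W1, W2 = T.matrix('W1'), T.matrix('W2')
+    >>> cost = apply_penalty([W1, W2], l2)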
+    """
+    try:
+        return sum(penalty(x, **kwargs) for x in tensor_or_tensors)
+    except (TypeError, ValueError):
+        return penalty(tensor_or_tensors, **kwargs)
+
+
+def regularize_layer_params(layer, penalty,
+                            tags={'regularizable': True}, **kwargs):
+    """
+    Computes a regularization cost by applying a penalty to the parameters
+    of a layer or group of layers.
+
+    Parameters
+    ----------
+    layer : a :class:`Layer` instance or list of layers.
+    penalty : callable
+    tags: dict
+        Tag specifications which filter the parameters of the layer or layers.
+        By default, only parameters with the `regularizable` tag are included.
+    **kwargs
+        keyword arguments passed to penalty.
+
+    Returns
+    -------
+    Theano scalar
+        a scalar expression for the cost
+    """
+    layers = [layer, ] if isinstance(layer, Layer) else layer
+    all_params = []
+
+    for layer in layers:
+        all_params += layer.get_params(**tags)
+
+    return apply_penalty(all_params, penalty, **kwargs)
+
+
+def regularize_layer_params_weighted(layers, penalty,
+                                     tags={'regularizable': True}, **kwargs):
+    """
+    Computes a regularization cost by applying a penalty to the parameters
+    of a layer or group of layers, weighted by a coefficient for each layer.
+
+    Parameters
+    ----------
+    layers : dict
+        A mapping from :class:`Layer` instances to coefficients.
+    penalty : callable
+    tags: dict
+        Tag specifications which filter the parameters of the layer or layers.
+        By default, only parameters with the `regularizable` tag are included.
+    **kwargs
+        keyword arguments passed to penalty.
+
+    Returns
+    -------
+    Theano scalar
+        a scalar expression for the cost
+    """
+    return sum(coeff * apply_penalty(layer.get_params(**tags),
+                                     penalty,
+                                     **kwargs)
+               for layer, coeff in layers.items()
+               )
+
+
+def regularize_network_params(layer, penalty,
+                              tags={'regularizable': True}, **kwargs):
+    """
+    Computes a regularization cost by applying a penalty to the parameters
+    of all layers in a network.
+
+    Parameters
+    ----------
+    layer : a :class:`Layer` instance.
+        Parameters of this layer and all layers below it will be penalized.
+    penalty : callable
+    tags: dict
+        Tag specifications which filter the parameters of the layer or layers.
+        By default, only parameters with the `regularizable` tag are included.
+    **kwargs
+        keyword arguments passed to penalty.
+
+    Returns
+    -------
+    Theano scalar
+        a scalar expression for the cost
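+
+    Examples
+    --------
+    A minimal sketch penalizing all regularizable parameters of a small
+    network:
+
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> from lasagne.regularization import regularize_network_params, l2
+    >>> l_in = InputLayer((100, 20))
+    >>> l_out = DenseLayer(l_in, num_units=3)
+    >>> l2_penalty = regularize_network_params(l_out, l2) * 1e-4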
+    """
+    return apply_penalty(get_all_params(layer, **tags), penalty, **kwargs)
diff --git a/lasagne/tests/conftest.py b/lasagne/tests/conftest.py
new file mode 100644
index 0000000..9e3c776
--- /dev/null
+++ b/lasagne/tests/conftest.py
@@ -0,0 +1,10 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption("--runslow", action="store_true", help="run slow tests")
+
+
+def pytest_runtest_setup(item):
+    if 'slow' in item.keywords and not item.config.getoption("--runslow"):
+        pytest.skip("need --runslow option to run")
diff --git a/lasagne/tests/layers/conftest.py b/lasagne/tests/layers/conftest.py
new file mode 100644
index 0000000..38114cf
--- /dev/null
+++ b/lasagne/tests/layers/conftest.py
@@ -0,0 +1,13 @@
+from mock import Mock
+import pytest
+
+
+@pytest.fixture
+def dummy_input_layer():
+    from lasagne.layers.input import InputLayer
+    input_layer = InputLayer((2, 3, 4))
+    mock = Mock(input_layer)
+    mock.shape = input_layer.shape
+    mock.input_var = input_layer.input_var
+    mock.output_shape = input_layer.output_shape
+    return mock
diff --git a/lasagne/tests/layers/test_base.py b/lasagne/tests/layers/test_base.py
new file mode 100644
index 0000000..b234b1b
--- /dev/null
+++ b/lasagne/tests/layers/test_base.py
@@ -0,0 +1,180 @@
+from mock import Mock
+import numpy
+import pytest
+import theano
+
+
+class TestLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.base import Layer
+        return Layer(Mock(output_shape=(None,)))
+
+    @pytest.fixture
+    def named_layer(self):
+        from lasagne.layers.base import Layer
+        return Layer(Mock(output_shape=(None,)), name='layer_name')
+
+    def test_input_shape(self, layer):
+        assert layer.input_shape == layer.input_layer.output_shape
+
+    def test_get_output_shape_for(self, layer):
+        shape = Mock()
+        assert layer.get_output_shape_for(shape) == shape
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.base import Layer
+        return Layer((None, 20))
+
+    def test_layer_from_shape(self, layer_from_shape):
+        layer = layer_from_shape
+        assert layer.input_layer is None
+        assert layer.input_shape == (None, 20)
+
+    def test_named_layer(self, named_layer):
+        assert named_layer.name == 'layer_name'
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == []
+
+    def test_get_params_tags(self, layer):
+        a_shape = (20, 50)
+        a = numpy.random.normal(0, 1, a_shape)
+        A = layer.add_param(a, a_shape, name='A', tag1=True, tag2=False)
+
+        b_shape = (30, 20)
+        b = numpy.random.normal(0, 1, b_shape)
+        B = layer.add_param(b, b_shape, name='B', tag1=True, tag2=True)
+
+        c_shape = (40, 10)
+        c = numpy.random.normal(0, 1, c_shape)
+        C = layer.add_param(c, c_shape, name='C', tag2=True)
+
+        assert layer.get_params() == [A, B, C]
+        assert layer.get_params(tag1=True) == [A, B]
+        assert layer.get_params(tag1=False) == [C]
+        assert layer.get_params(tag2=True) == [B, C]
+        assert layer.get_params(tag2=False) == [A]
+        assert layer.get_params(tag1=True, tag2=True) == [B]
+
+    def test_get_params_expressions(self, layer):
+        x, y, z = (theano.shared(0, name=n) for n in 'xyz')
+        W1 = layer.add_param(x**2 + theano.tensor.log(y), (), tag1=True)
+        W2 = layer.add_param(theano.tensor.matrix(), (10, 10), tag1=True)
+        W3 = layer.add_param(z.T, (), tag2=True)
+        # layer.params stores the parameter expressions:
+        assert list(layer.params.keys()) == [W1, W2, W3]
+        # layer.get_params() returns the underlying shared variables:
+        assert layer.get_params() == [x, y, z]
+        # filtering acts on the parameter expressions:
+        assert layer.get_params(tag1=True) == [x, y]
+        assert layer.get_params(tag2=True) == [z]
+
+    def test_add_param_tags(self, layer):
+        a_shape = (20, 50)
+        a = numpy.random.normal(0, 1, a_shape)
+        A = layer.add_param(a, a_shape)
+        assert A in layer.params
+        assert 'trainable' in layer.params[A]
+        assert 'regularizable' in layer.params[A]
+
+        b_shape = (30, 20)
+        b = numpy.random.normal(0, 1, b_shape)
+        B = layer.add_param(b, b_shape, trainable=False)
+        assert B in layer.params
+        assert 'trainable' not in layer.params[B]
+        assert 'regularizable' in layer.params[B]
+
+        c_shape = (40, 10)
+        c = numpy.random.normal(0, 1, c_shape)
+        C = layer.add_param(c, c_shape, tag1=True)
+        assert C in layer.params
+        assert 'trainable' in layer.params[C]
+        assert 'regularizable' in layer.params[C]
+        assert 'tag1' in layer.params[C]
+
+    def test_add_param_name(self, layer):
+        a_shape = (20, 50)
+        a = numpy.random.normal(0, 1, a_shape)
+        A = layer.add_param(a, a_shape, name='A')
+        assert A.name == 'A'
+
+    def test_add_param_named_layer_name(self, named_layer):
+        a_shape = (20, 50)
+        a = numpy.random.normal(0, 1, a_shape)
+        A = named_layer.add_param(a, a_shape, name='A')
+        assert A.name == 'layer_name.A'
+
+    def test_get_output_for_notimplemented(self, layer):
+        with pytest.raises(NotImplementedError):
+            layer.get_output_for(Mock())
+
+    def test_nonpositive_input_dims_raises_value_error(self, layer):
+        from lasagne.layers.base import Layer
+        neg_input_layer = Mock(output_shape=(None, -1, -1))
+        zero_input_layer = Mock(output_shape=(None, 0, 0))
+        pos_input_layer = Mock(output_shape=(None, 1, 1))
+        with pytest.raises(ValueError):
+            Layer(neg_input_layer)
+        with pytest.raises(ValueError):
+            Layer(zero_input_layer)
+        Layer(pos_input_layer)
+
+    def test_symbolic_output_shape(self):
+        from lasagne.layers.base import Layer
+
+        class WrongLayer(Layer):
+            def get_output_shape_for(self, input_shape):
+                return theano.tensor.vector().shape
+        with pytest.raises(ValueError) as exc:
+            WrongLayer((None,)).output_shape
+        assert "symbolic output shape" in exc.value.args[0]
+
+
+class TestMergeLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.base import MergeLayer
+        return MergeLayer([Mock(), Mock()])
+
+    def test_input_shapes(self, layer):
+        assert layer.input_shapes == [l.output_shape
+                                      for l in layer.input_layers]
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.input import InputLayer
+        from lasagne.layers.base import MergeLayer
+        return MergeLayer(
+            [(None, 20),
+             Mock(InputLayer((None,)), output_shape=(None,))]
+        )
+
+    def test_layer_from_shape(self, layer_from_shape):
+        layer = layer_from_shape
+        assert layer.input_layers[0] is None
+        assert layer.input_shapes[0] == (None, 20)
+        assert layer.input_layers[1] is not None
+        assert (layer.input_shapes[1] == layer.input_layers[1].output_shape)
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == []
+
+    def test_get_output_shape_for_notimplemented(self, layer):
+        with pytest.raises(NotImplementedError):
+            layer.get_output_shape_for(Mock())
+
+    def test_get_output_for_notimplemented(self, layer):
+        with pytest.raises(NotImplementedError):
+            layer.get_output_for(Mock())
+
+    def test_symbolic_output_shape(self):
+        from lasagne.layers.base import MergeLayer
+
+        class WrongLayer(MergeLayer):
+            def get_output_shape_for(self, input_shapes):
+                return theano.tensor.vector().shape
+        with pytest.raises(ValueError) as exc:
+            WrongLayer([(None,)]).output_shape
+        assert "symbolic output shape" in exc.value.args[0]
diff --git a/lasagne/tests/layers/test_conv.py b/lasagne/tests/layers/test_conv.py
new file mode 100644
index 0000000..369091d
--- /dev/null
+++ b/lasagne/tests/layers/test_conv.py
@@ -0,0 +1,781 @@
+import numpy as np
+import pytest
+import importlib
+import theano
+
+import lasagne
+from lasagne.utils import floatX, as_tuple
+
+
+def convNd(input, kernel, pad, stride=1, n=None):
+    """Execute a batch of a stack of N-dimensional convolutions, as a slow
+    NumPy reference implementation for the tests below.
+
+    Parameters
+    ----------
+    input : numpy array
+        Input of shape ``(batch_size, channels) + spatial_shape``.
+    kernel : numpy array
+        Filters of shape ``(num_filters, channels) + filter_shape``.
+    pad : {0, 'valid', 'same', 'full'}, int or tuple of int
+        Zero-padding mode, or amount of padding per spatial dimension.
+    stride : int or tuple of int
+        Stride per spatial dimension.
+    n : int
+        Number of spatial dimensions; inferred from `input` if not given.
+
+    Returns
+    -------
+    numpy array
+    """
+    if n is None:
+        n = input.ndim - 2
+    if pad not in ['valid', 'same', 'full']:
+        pad = as_tuple(pad, n, int)
+        input = np.pad(input, [(p, p) for p in (0, 0) + pad], mode='constant')
+        pad = 'valid'
+
+    output = np.zeros((input.shape[0], kernel.shape[0]) +
+                      tuple(i + k - 1 for i, k in zip(input.shape[2:],
+                                                      kernel.shape[2:])))
+
+    if n == 1:
+        for i in range(kernel.shape[2]):
+            f = kernel[:, :, i:i+1]
+            c = (input[:, np.newaxis] * f).sum(axis=2)
+            output[:, :,
+                   i:i + input.shape[2]] += c
+    elif n == 2:
+        for i in range(kernel.shape[2]):
+            for j in range(kernel.shape[3]):
+                f = kernel[:, :, i:i+1, j:j+1]
+                c = (input[:, np.newaxis] * f).sum(axis=2)
+                output[:, :,
+                       i:i + input.shape[2],
+                       j:j + input.shape[3]] += c
+    elif n == 3:
+        for i in range(kernel.shape[2]):
+            for j in range(kernel.shape[3]):
+                for k in range(kernel.shape[4]):
+                    f = kernel[:, :, i:i+1, j:j+1, k:k+1]
+                    c = (input[:, np.newaxis] * f).sum(axis=2)
+                    output[:, :,
+                           i:i + input.shape[2],
+                           j:j + input.shape[3],
+                           k:k + input.shape[4]] += c
+    else:
+        raise NotImplementedError("convNd() only supports n in (1, 2, 3)")
+
+    if pad == 'valid':
+        trim = tuple(k - 1 for k in kernel.shape[2:])
+        slices = [slice(None), slice(None)]
+        slices += [slice(t, -t or None) for t in trim]
+        output = output[slices]
+    elif pad == 'same':
+        shift = tuple((k - 1) // 2 for k in kernel.shape[2:])
+        slices = [slice(None), slice(None)]
+        slices += [slice(s, s + i) for s, i in zip(shift, input.shape[2:])]
+        output = output[slices]
+
+    stride = as_tuple(stride, n, int)
+    if any(s > 1 for s in stride):
+        slices = [slice(None), slice(None)]
+        slices += [slice(None, None, s) for s in stride]
+        output = output[slices]
+
+    return output
+
+
+def dilate(input, factors):
+    """Inserts `factors[i] - 1` zeros between input elements on axis i."""
+    output = np.zeros(tuple((s-1)*f + 1 for s, f in zip(input.shape, factors)),
+                      dtype=input.dtype)
+    output[[slice(None, None, factor) for factor in factors]] = input
+    return output
+
+
+def transposed_convNd(input, kernel, crop, stride=1, n=None):
+    if n is None:
+        n = input.ndim - 2
+    if crop == 'valid':
+        pad = 'full'
+    elif crop == 'full':
+        pad = 'valid'
+    elif crop == 'same':
+        pad = 'same'
+    else:
+        crop = as_tuple(crop, n, int)
+        pad = tuple(f - 1 - c for f, c in zip(kernel.shape[2:], crop))
+    stride = as_tuple(stride, n, int)
+    dilated_input = dilate(input, (1, 1) + stride)
+    return convNd(dilated_input, kernel, pad, stride=1, n=n)
+
+
+def dilated_convNd(input, kernel, pad, dilation=1, n=None):
+    if n is None:
+        n = input.ndim - 2
+    dilation = as_tuple(dilation, n, int)
+    dilated_kernel = dilate(kernel, (1, 1) + dilation)
+    return convNd(input, dilated_kernel, pad, stride=1, n=n)
+
+
+def convNd_test_sets(n):
+    def _convert(input, kernel, output, kwargs):
+        return [theano.shared(floatX(input)), floatX(kernel), output, kwargs]
+
+    extra_shape = (11, 16, 23)
+    input_shape = (3, 1) + extra_shape[-n:]
+
+    for pad in (0, 1, 2, 'full', 'same'):
+        for stride in (1, 2, 3):
+            for filter_size in (1, 3):
+                if stride > filter_size:
+                    continue
+                input = np.random.random(input_shape)
+                kernel = np.random.random((16, 1) + (filter_size,) * n)
+                output = convNd(input, kernel, pad, stride, n=n)
+                yield _convert(input, kernel, output, {'pad': pad,
+                                                       'stride': stride,
+                                                       'flip_filters': True,
+                                                       })
+
+    # bias-less case
+    input = np.random.random(input_shape)
+    kernel = np.random.random((16, 1) + (3,) * n)
+    output = convNd(input, kernel, pad='valid')
+    yield _convert(input, kernel, output, {'b': None, 'flip_filters': True})
+    # untie_biases=True case
+    yield _convert(input, kernel, output, {'untie_biases': True,
+                                           'flip_filters': True})
+    # pad='valid' case
+    yield _convert(input, kernel, output, {'pad': 'valid',
+                                           'flip_filters': True})
+    # flip_filters=False case
+    flip = (slice(None), slice(None)) + (slice(None, None, -1),) * n
+    output = convNd(input, kernel[flip], pad='valid')
+    yield _convert(input, kernel, output, {'flip_filters': False})
+
+
+def conv3d_test_sets():
+    return convNd_test_sets(3)
+
+
+def conv2d_test_sets():
+    return convNd_test_sets(2)
+
+
+def conv1d_test_sets():
+    return convNd_test_sets(1)
+
+
+def transp_conv2d_test_sets():
+    def _convert(input, kernel, output, kwargs):
+        return [floatX(input), floatX(kernel), output, kwargs]
+
+    input_shape = (3, 1, 11, 16)
+    for crop in (0, 1, 2, 'full', 'same'):
+        for stride in (1, 2, 3):
+            for filter_size in (1, 3):
+                if stride > filter_size:
+                    continue
+                if crop not in ('full', 'same') and crop > (filter_size - 1):
+                    continue
+                input = np.random.random(input_shape)
+                kernel = np.random.random((16, 1, filter_size, filter_size))
+                output = transposed_convNd(input, kernel, crop, stride, 2)
+                yield _convert(input, kernel, output, {'crop': crop,
+                                                       'stride': stride,
+                                                       'flip_filters': True})
+
+    # bias-less case
+    input = np.random.random(input_shape)
+    kernel = np.random.random((16, 1, 3, 3))
+    output = transposed_convNd(input, kernel, 'valid')
+    yield _convert(input, kernel, output, {'b': None, 'flip_filters': True})
+    # untie_biases=True case
+    yield _convert(input, kernel, output, {'untie_biases': True,
+                                           'flip_filters': True})
+    # crop='valid' case
+    yield _convert(input, kernel, output, {'crop': 'valid',
+                                           'flip_filters': True})
+    # flip_filters=False case
+    output = transposed_convNd(input, kernel[:, :, ::-1, ::-1], 'valid')
+    yield _convert(input, kernel, output, {'flip_filters': False})
+
+
+def dilated_conv2d_test_sets():
+    def _convert(input, kernel, output, kwargs):
+        return [floatX(input), floatX(kernel), output, kwargs]
+
+    input_shape = (3, 1, 11, 16)
+    for dilation in (1, 2, 3):
+        for filter_size in (1, 3):
+            input = np.random.random(input_shape)
+            kernel = np.random.random((16, 1, filter_size, filter_size))
+            kernel_flip = kernel[:, :, ::-1, ::-1]
+            output = dilated_convNd(input, kernel_flip, 'valid', dilation, 2)
+            yield _convert(input, kernel, output, {'dilation': dilation})
+
+    # bias-less case
+    input = np.random.random(input_shape)
+    kernel = np.random.random((16, 1, 3, 3))
+    output = dilated_convNd(input, kernel[:, :, ::-1, ::-1], pad='valid')
+    yield _convert(input, kernel, output, {'b': None})
+    # untie_biases=True case
+    yield _convert(input, kernel, output, {'untie_biases': True})
+
+
+def test_conv_output_length():
+    from lasagne.layers.conv import conv_output_length
+
+    assert conv_output_length(13, 5, 3, 'valid') == 3
+    assert conv_output_length(13, 5, 3, 0) == 3
+    assert conv_output_length(13, 5, 3, 'full') == 6
+    assert conv_output_length(13, 5, 3, 'same') == 5
+    assert conv_output_length(13, 5, 3, 2) == 5
+
+    with pytest.raises(ValueError) as exc:
+        conv_output_length(13, 5, 3, '_nonexistent_mode')
+    assert "Invalid pad: " in exc.value.args[0]
+
+
+def test_conv_input_length():
+    from lasagne.layers.conv import conv_input_length
+
+    # using the examples from https://github.com/vdumoulin/conv_arithmetic
+    # no padding, no strides
+    assert conv_input_length(2, 3, 1, 'valid') == 4
+    assert conv_input_length(2, 3, 1, 0) == 4
+    # padding, no strides
+    assert conv_input_length(6, 4, 1, 2) == 5
+    # no padding, strides
+    assert conv_input_length(2, 3, 2, 0) == 5
+    # padding, strides
+    assert conv_input_length(3, 3, 2, 'same') == 5
+    # full convolution
+    assert conv_input_length(3, 3, 2, 'full') == 3
+
+    with pytest.raises(ValueError) as exc:
+        conv_input_length(3, 5, 3, '_nonexistent_mode')
+    assert "Invalid pad: " in exc.value.args[0]
+
+
+@pytest.fixture
+def DummyInputLayer():
+    def factory(shape):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(shape)
+    return factory
+
+
+class TestBaseConvLayer:
+
+    def test_infer_dimensionality(self):
+        from lasagne.layers.conv import BaseConvLayer
+        shape = (10, 20, 30, 40, 50, 60)
+        for n in range(1, 4):
+            layer = BaseConvLayer(shape[:n+2], 1, 3)
+            assert layer.n == n
+
+    def test_convolve_not_implemented(self):
+        from lasagne.layers.conv import BaseConvLayer
+        layer = BaseConvLayer((10, 20, 30), 1, 3)
+        with pytest.raises(NotImplementedError):
+            layer.convolve(theano.tensor.tensor3())
+
+    def test_fail_on_mismatching_dimensionality(self):
+        from lasagne.layers.conv import BaseConvLayer
+        with pytest.raises(ValueError) as exc:
+            BaseConvLayer((10, 20, 30), 1, 3, n=2)
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            BaseConvLayer((10, 20, 30, 40), 1, 3, n=1)
+        assert "Expected 3 input dimensions" in exc.value.args[0]
+
+
+class TestConv1DLayer:
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(conv1d_test_sets()))
+    def test_defaults(self, DummyInputLayer,
+                      input, kernel, output, kwargs):
+        b, c, w = input.shape.eval()
+        input_layer = DummyInputLayer((b, c, w))
+        try:
+            from lasagne.layers.conv import Conv1DLayer
+            layer = Conv1DLayer(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2],
+                W=kernel,
+                **kwargs
+            )
+            actual = layer.get_output_for(input).eval()
+            assert actual.shape == output.shape
+            assert actual.shape == layer.output_shape
+            assert np.allclose(actual, output)
+
+        except NotImplementedError:
+            pass
+
+    def test_init_none_nonlinearity_bias(self, DummyInputLayer):
+        from lasagne.layers.conv import Conv1DLayer
+        input_layer = DummyInputLayer((1, 2, 3))
+        layer = Conv1DLayer(input_layer, num_filters=16, filter_size=(3,),
+                            nonlinearity=None, b=None)
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_invalid_pad(self, DummyInputLayer):
+        from lasagne.layers.conv import Conv1DLayer
+        input_layer = DummyInputLayer((1, 2, 3))
+        with pytest.raises(TypeError) as exc:
+            layer = Conv1DLayer(input_layer, num_filters=16, filter_size=(3,),
+                                pad='_nonexistent_mode')
+        assert "iterable of int" in exc.value.args[0]
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = Conv1DLayer(input_layer, num_filters=16, filter_size=(4,),
+                                pad='same')
+        assert "requires odd filter size" in exc.value.args[0]
+
+
+class TestConv2DLayerImplementations:
+
+    @pytest.fixture(
+        params=[
+            ('lasagne.layers', 'Conv2DLayer'),
+            ('lasagne.layers.cuda_convnet', 'Conv2DCCLayer'),
+            ('lasagne.layers.corrmm', 'Conv2DMMLayer'),
+            ('lasagne.layers.dnn', 'Conv2DDNNLayer'),
+        ],
+    )
+    def Conv2DImpl(self, request):
+        impl_module_name, impl_name = request.param
+        try:
+            mod = importlib.import_module(impl_module_name)
+        except ImportError:
+            pytest.skip("{} not available".format(impl_module_name))
+
+        return getattr(mod, impl_name)
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(conv2d_test_sets()))
+    def test_defaults(self, Conv2DImpl, DummyInputLayer,
+                      input, kernel, output, kwargs):
+        b, c, h, w = input.shape.eval()
+        input_layer = DummyInputLayer((b, c, h, w))
+        try:
+            layer = Conv2DImpl(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel,
+                **kwargs
+            )
+            actual = layer.get_output_for(input).eval()
+            assert actual.shape == output.shape
+            assert actual.shape == layer.output_shape
+            assert np.allclose(actual, output)
+
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(conv2d_test_sets()))
+    def test_with_nones(self, Conv2DImpl, DummyInputLayer,
+                        input, kernel, output, kwargs):
+        if kwargs.get('untie_biases', False):
+            pytest.skip()
+        b, c, h, w = input.shape.eval()
+        input_layer = DummyInputLayer((None, c, None, None))
+        try:
+            layer = Conv2DImpl(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel,
+                **kwargs
+            )
+            actual = layer.get_output_for(input).eval()
+
+            assert layer.output_shape == (None,
+                                          kernel.shape[0],
+                                          None,
+                                          None)
+            assert actual.shape == output.shape
+            assert np.allclose(actual, output)
+
+        except NotImplementedError:
+            pytest.skip()
+
+    def test_init_none_nonlinearity_bias(self, Conv2DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((1, 2, 3, 3))
+        layer = Conv2DImpl(input_layer, num_filters=16, filter_size=(3, 3),
+                           nonlinearity=None, b=None)
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_invalid_pad(self, Conv2DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((1, 2, 3, 3))
+        with pytest.raises(TypeError) as exc:
+            layer = Conv2DImpl(input_layer, num_filters=16, filter_size=(3, 3),
+                               pad='_nonexistent_mode')
+        assert "iterable of int" in exc.value.args[0]
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = Conv2DImpl(input_layer, num_filters=16, filter_size=(4, 4),
+                               pad='same')
+        assert "requires odd filter size" in exc.value.args[0]
+
+    def test_get_params(self, Conv2DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((128, 3, 32, 32))
+        layer = Conv2DImpl(input_layer, num_filters=16, filter_size=(3, 3))
+        assert layer.get_params() == [layer.W, layer.b]
+        assert layer.get_params(regularizable=False) == [layer.b]
+        assert layer.get_params(regularizable=True) == [layer.W]
+        assert layer.get_params(trainable=True) == [layer.W, layer.b]
+        assert layer.get_params(trainable=False) == []
+        assert layer.get_params(_nonexistent_tag=True) == []
+        assert layer.get_params(_nonexistent_tag=False) == [layer.W, layer.b]
+
+
+class TestConv3DLayerImplementations:
+
+    @pytest.fixture(
+        params=[
+            ('lasagne.layers.dnn', 'Conv3DDNNLayer'),
+        ],
+    )
+    def Conv3DImpl(self, request):
+        impl_module_name, impl_name = request.param
+        try:
+            mod = importlib.import_module(impl_module_name)
+        except ImportError:
+            pytest.skip("{} not available".format(impl_module_name))
+
+        return getattr(mod, impl_name)
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(conv3d_test_sets()))
+    def test_defaults(self, Conv3DImpl, DummyInputLayer,
+                      input, kernel, output, kwargs):
+        b, c, d, h, w = input.shape.eval()
+        input_layer = DummyInputLayer((b, c, d, h, w))
+        try:
+            layer = Conv3DImpl(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel,
+                **kwargs
+            )
+            actual = layer.get_output_for(input).eval()
+            assert actual.shape == output.shape
+            assert actual.shape == layer.output_shape
+            assert np.allclose(actual, output)
+
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(conv3d_test_sets()))
+    def test_with_nones(self, Conv3DImpl, DummyInputLayer,
+                        input, kernel, output, kwargs):
+        if kwargs.get('untie_biases', False):
+            pytest.skip()
+        b, c, d, h, w = input.shape.eval()
+        input_layer = DummyInputLayer((None, c, None, None, None))
+        try:
+            layer = Conv3DImpl(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel,
+                **kwargs
+            )
+            actual = layer.get_output_for(input).eval()
+
+            assert layer.output_shape == (None,
+                                          kernel.shape[0],
+                                          None,
+                                          None,
+                                          None)
+            assert actual.shape == output.shape
+            assert np.allclose(actual, output)
+
+        except NotImplementedError:
+            pytest.skip()
+
+    def test_init_none_nonlinearity_bias(self, Conv3DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((1, 2, 3, 3, 3))
+        layer = Conv3DImpl(input_layer, num_filters=16, filter_size=(3, 3, 3),
+                           nonlinearity=None, b=None)
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_invalid_pad(self, Conv3DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((1, 2, 3, 3, 3))
+        with pytest.raises(TypeError) as exc:
+            layer = Conv3DImpl(input_layer, num_filters=16,
+                               filter_size=(3, 3, 3),
+                               pad='_nonexistent_mode')
+        assert "iterable of int" in exc.value.args[0]
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = Conv3DImpl(input_layer, num_filters=16,
+                               filter_size=(4, 4, 4),
+                               pad='same')
+        assert "requires odd filter size" in exc.value.args[0]
+
+    def test_get_params(self, Conv3DImpl, DummyInputLayer):
+        input_layer = DummyInputLayer((128, 3, 32, 32, 32))
+        layer = Conv3DImpl(input_layer, num_filters=16, filter_size=(3, 3, 3))
+        assert layer.get_params() == [layer.W, layer.b]
+        assert layer.get_params(regularizable=False) == [layer.b]
+        assert layer.get_params(regularizable=True) == [layer.W]
+        assert layer.get_params(trainable=True) == [layer.W, layer.b]
+        assert layer.get_params(trainable=False) == []
+        assert layer.get_params(_nonexistent_tag=True) == []
+        assert layer.get_params(_nonexistent_tag=False) == [layer.W, layer.b]
+
+
+class TestTransposedConv2DLayer:
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(transp_conv2d_test_sets()))
+    def test_defaults(self, DummyInputLayer, input, kernel, output, kwargs):
+        from lasagne.layers import TransposedConv2DLayer
+        b, c, h, w = input.shape
+        input_layer = DummyInputLayer((b, c, h, w))
+        layer = TransposedConv2DLayer(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel.transpose(1, 0, 2, 3),
+                **kwargs)
+        actual = layer.get_output_for(input).eval()
+        assert actual.shape == output.shape
+        assert actual.shape == layer.output_shape
+        assert np.allclose(actual, output)
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(transp_conv2d_test_sets()))
+    def test_with_nones(self, DummyInputLayer, input, kernel, output, kwargs):
+        if kwargs.get('untie_biases', False):
+            pytest.skip()
+        from lasagne.layers import TransposedConv2DLayer
+        b, c, h, w = input.shape
+        input_layer = DummyInputLayer((None, c, None, None))
+        layer = TransposedConv2DLayer(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel.transpose(1, 0, 2, 3),
+                **kwargs)
+        assert layer.output_shape == (None, output.shape[1], None, None)
+        actual = layer.get_output_for(input).eval()
+        assert actual.shape == output.shape
+        assert np.allclose(actual, output)
+
+
+class TestDilatedConv2DLayer:
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(dilated_conv2d_test_sets()))
+    def test_defaults(self, DummyInputLayer, input, kernel, output, kwargs):
+        from lasagne.layers import DilatedConv2DLayer
+        b, c, h, w = input.shape
+        input_layer = DummyInputLayer((b, c, h, w))
+        layer = DilatedConv2DLayer(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel.transpose(1, 0, 2, 3),
+                **kwargs)
+        actual = layer.get_output_for(theano.shared(input)).eval()
+        assert actual.shape == output.shape
+        assert actual.shape == layer.output_shape
+        assert np.allclose(actual, output)
+
+    @pytest.mark.parametrize(
+        "input, kernel, output, kwargs", list(dilated_conv2d_test_sets()))
+    def test_with_nones(self, DummyInputLayer, input, kernel, output, kwargs):
+        if kwargs.get('untie_biases', False):
+            pytest.skip()
+        from lasagne.layers import DilatedConv2DLayer
+        b, c, h, w = input.shape
+        input_layer = DummyInputLayer((None, c, None, None))
+        layer = DilatedConv2DLayer(
+                input_layer,
+                num_filters=kernel.shape[0],
+                filter_size=kernel.shape[2:],
+                W=kernel.transpose(1, 0, 2, 3),
+                **kwargs)
+        assert layer.output_shape == (None, output.shape[1], None, None)
+        actual = layer.get_output_for(input).eval()
+        assert actual.shape == output.shape
+        assert np.allclose(actual, output)
+
+    def test_unsupported_settings(self, DummyInputLayer):
+        from lasagne.layers import DilatedConv2DLayer
+        input_layer = DummyInputLayer((10, 20, 30, 40))
+        for pad in 'same', 'full', 1:
+            with pytest.raises(NotImplementedError) as exc:
+                DilatedConv2DLayer(input_layer, 2, 3, pad=pad)
+            assert "requires pad=0" in exc.value.args[0]
+        with pytest.raises(NotImplementedError) as exc:
+            DilatedConv2DLayer(input_layer, 2, 3, flip_filters=True)
+        assert "requires flip_filters=False" in exc.value.args[0]
+
+
+class TestConv2DDNNLayer:
+    def test_import_without_gpu_or_cudnn_raises(self):
+        from theano.sandbox import cuda
+        if cuda.cuda_enabled and cuda.dnn.dnn_available():
+            pytest.skip()
+        else:
+            with pytest.raises(ImportError):
+                import lasagne.layers.dnn
+
+
+class TestConv2DMMLayer:
+    def test_import_without_gpu_raises(self):
+        from theano.sandbox import cuda
+        if cuda.cuda_enabled:
+            pytest.skip()
+        else:
+            with pytest.raises(ImportError):
+                import lasagne.layers.corrmm
+
+
+class TestConv2DCCLayer:
+    def test_import_without_gpu_raises(self):
+        from theano.sandbox import cuda
+        if cuda.cuda_enabled:
+            pytest.skip()
+        else:
+            with pytest.raises(ImportError):
+                import lasagne.layers.cuda_convnet
+
+    def test_unsupported_settings(self, DummyInputLayer):
+        try:
+            from lasagne.layers.cuda_convnet import Conv2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = DummyInputLayer((128, 3, 32, 32))
+
+        with pytest.raises(RuntimeError) as exc:
+            layer = Conv2DCCLayer(input_layer, num_filters=16,
+                                  filter_size=(3, 5))
+        assert ("Conv2DCCLayer only supports square filters" in
+                exc.value.args[0])
+
+        with pytest.raises(RuntimeError) as exc:
+            layer = Conv2DCCLayer(input_layer, num_filters=16,
+                                  filter_size=(3, 3), stride=(1, 2))
+        assert ("Conv2DCCLayer only supports square strides" in
+                exc.value.args[0])
+
+        with pytest.raises(RuntimeError) as exc:
+            layer = Conv2DCCLayer(input_layer, num_filters=15,
+                                  filter_size=(3, 3))
+        assert ("Conv2DCCLayer requires num_filters to be a multiple of 16" in
+                exc.value.args[0])
+
+        with pytest.raises(RuntimeError) as exc:
+            layer = Conv2DCCLayer(input_layer, num_filters=16,
+                                  filter_size=(3, 3), pad=(1, 2))
+        assert ("Conv2DCCLayer only supports square padding" in
+                exc.value.args[0])
+
+        input_layer = DummyInputLayer((128, 7, 32, 32))
+
+        with pytest.raises(RuntimeError) as exc:
+            layer = Conv2DCCLayer(input_layer, num_filters=16,
+                                  filter_size=(3, 3))
+        assert ("Conv2DCCLayer requires the number of input channels to be "
+                "1, 2, 3 or a multiple of 4" in exc.value.args[0])
+
+    def test_pad(self, DummyInputLayer):
+        try:
+            from lasagne.layers.cuda_convnet import Conv2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = DummyInputLayer((128, 3, 32, 32))
+        layer = Conv2DCCLayer(input_layer, num_filters=16, filter_size=(3, 3),
+                              pad=(3, 3))
+        assert layer.output_shape == (128, 16, 36, 36)
+
+    def test_dimshuffle_false_shapes(self, DummyInputLayer):
+        try:
+            from lasagne.layers.cuda_convnet import Conv2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = DummyInputLayer((4, 32, 32, 128))  # c01b instead of bc01
+        layer = Conv2DCCLayer(input_layer, num_filters=16, filter_size=(3, 3),
+                              dimshuffle=False)
+        assert layer.W.get_value().shape == (4, 3, 3, 16)
+        assert layer.b.get_value().shape == (16,)
+
+        layer = Conv2DCCLayer(input_layer, num_filters=16, filter_size=(3, 3),
+                              dimshuffle=False, untie_biases=True)
+        assert layer.W.get_value().shape == (4, 3, 3, 16)
+        assert layer.b.get_value().shape == (16, 30, 30)
+
+    def test_dimshuffle_false_get_output_for(self, DummyInputLayer):
+        try:
+            from lasagne.layers.cuda_convnet import Conv2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        # this implementation is tested against FilterActs instead of
+        # theano.tensor.nnet.conv.conv2d because using the latter leads to
+        # numerical precision errors.
+        from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
+        filter_acts = FilterActs(stride=1, pad=0, partial_sum=1)
+
+        input = theano.shared(floatX(np.random.random((4, 5, 5, 8))))
+        kernel = theano.shared(floatX(np.random.random((4, 3, 3, 16))))
+
+        input_layer = DummyInputLayer((4, 5, 5, 8))  # c01b instead of bc01
+        layer = Conv2DCCLayer(input_layer, num_filters=16, filter_size=(3, 3),
+                              dimshuffle=False, W=kernel, b=None,
+                              nonlinearity=None)
+
+        output = np.array(filter_acts(input, kernel).eval())
+
+        actual = layer.get_output_for(input).eval()
+        actual = np.array(actual)
+        assert actual.shape == output.shape
+        assert actual.shape == layer.output_shape
+        assert np.allclose(actual, output)
+
+
+class TestShuffleLayers:
+    def test_bc01_to_c01b(self):
+        from lasagne.layers.input import InputLayer
+        try:
+            from lasagne.layers.cuda_convnet import ShuffleBC01ToC01BLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = InputLayer((1, 2, 3, 4))
+        layer = ShuffleBC01ToC01BLayer(input_layer)
+        assert layer.output_shape == (2, 3, 4, 1)
+
+        input = floatX(np.random.random((1, 2, 3, 4)))
+        output = input.transpose(1, 2, 3, 0)
+        actual = layer.get_output_for(theano.shared(input)).eval()
+        assert np.allclose(output, actual)
+
+    def test_c01b_to_bc01(self):
+        from lasagne.layers.input import InputLayer
+        try:
+            from lasagne.layers.cuda_convnet import ShuffleC01BToBC01Layer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = InputLayer((1, 2, 3, 4))
+        layer = ShuffleC01BToBC01Layer(input_layer)
+        assert layer.output_shape == (4, 1, 2, 3)
+
+        input = floatX(np.random.random((1, 2, 3, 4)))
+        output = input.transpose(3, 0, 1, 2)
+        actual = layer.get_output_for(theano.shared(input)).eval()
+        assert np.allclose(output, actual)
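
A minimal usage sketch of the cuda_convnet layers exercised above (illustrative only; it assumes a CUDA-enabled Theano with pylearn2's cuda_convnet available, and the shapes follow the tests)::

    from lasagne.layers import InputLayer
    from lasagne.layers.cuda_convnet import (Conv2DCCLayer,
                                             ShuffleBC01ToC01BLayer,
                                             ShuffleC01BToBC01Layer)

    # bc01 input; the shuffle layers convert to c01b and back, so the rest
    # of the network can keep the usual layout
    l_in = InputLayer((128, 3, 32, 32))
    l_c01b = ShuffleBC01ToC01BLayer(l_in)          # shape (3, 32, 32, 128)
    # num_filters must be a multiple of 16; filter size, stride and padding
    # must be square; input channels must be 1, 2, 3 or a multiple of 4
    l_conv = Conv2DCCLayer(l_c01b, num_filters=16, filter_size=(3, 3),
                           dimshuffle=False)
    l_out = ShuffleC01BToBC01Layer(l_conv)         # shape (128, 16, 30, 30)
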
diff --git a/lasagne/tests/layers/test_dense.py b/lasagne/tests/layers/test_dense.py
new file mode 100644
index 0000000..58aca9f
--- /dev/null
+++ b/lasagne/tests/layers/test_dense.py
@@ -0,0 +1,361 @@
+from mock import Mock
+import numpy as np
+import pytest
+import theano
+
+
+import lasagne
+
+
+class TestDenseLayer:
+    @pytest.fixture
+    def DenseLayer(self):
+        from lasagne.layers.dense import DenseLayer
+        return DenseLayer
+
+    @pytest.fixture
+    def layer_vars(self, dummy_input_layer):
+        from lasagne.layers.dense import DenseLayer
+        W = Mock()
+        b = Mock()
+        nonlinearity = Mock()
+
+        W.return_value = np.ones((12, 3))
+        b.return_value = np.ones((3,)) * 3
+        layer = DenseLayer(
+            dummy_input_layer,
+            num_units=3,
+            W=W,
+            b=b,
+            nonlinearity=nonlinearity,
+            )
+
+        return {
+            'W': W,
+            'b': b,
+            'nonlinearity': nonlinearity,
+            'layer': layer,
+            }
+
+    @pytest.fixture
+    def layer(self, layer_vars):
+        return layer_vars['layer']
+
+    def test_init(self, layer_vars):
+        layer = layer_vars['layer']
+        assert (layer.W.get_value() == layer_vars['W'].return_value).all()
+        assert (layer.b.get_value() == layer_vars['b'].return_value).all()
+        layer_vars['W'].assert_called_with((12, 3))
+        layer_vars['b'].assert_called_with((3,))
+
+    def test_init_none_nonlinearity_bias(self, DenseLayer, dummy_input_layer):
+        layer = DenseLayer(
+            dummy_input_layer,
+            num_units=3,
+            nonlinearity=None,
+            b=None,
+            )
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == [layer.W, layer.b]
+        assert layer.get_params(regularizable=False) == [layer.b]
+        assert layer.get_params(regularizable=True) == [layer.W]
+        assert layer.get_params(trainable=True) == [layer.W, layer.b]
+        assert layer.get_params(trainable=False) == []
+        assert layer.get_params(_nonexistent_tag=True) == []
+        assert layer.get_params(_nonexistent_tag=False) == [layer.W, layer.b]
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for((5, 6, 7)) == (5, 3)
+
+    def test_get_output_for(self, layer_vars):
+        layer = layer_vars['layer']
+        nonlinearity = layer_vars['nonlinearity']
+        W = layer_vars['W']()
+        b = layer_vars['b']()
+
+        input = theano.shared(np.ones((2, 12)))
+        result = layer.get_output_for(input)
+        assert result is nonlinearity.return_value
+
+        # Check that the input to the nonlinearity is what we expect
+        # from the dense layer, i.e. the dot product plus the bias
+        nonlinearity_arg = nonlinearity.call_args[0][0]
+        assert (nonlinearity_arg.eval() ==
+                np.dot(input.get_value(), W) + b).all()
+
+    def test_get_output_for_flattens_input(self, layer_vars):
+        layer = layer_vars['layer']
+        nonlinearity = layer_vars['nonlinearity']
+        W = layer_vars['W']()
+        b = layer_vars['b']()
+
+        input = theano.shared(np.ones((2, 3, 4)))
+        result = layer.get_output_for(input)
+        assert result is nonlinearity.return_value
+
+        # Check that the input to the nonlinearity is what we expect
+        # from the dense layer, i.e. the dot product plus the bias
+        nonlinearity_arg = nonlinearity.call_args[0][0]
+        assert np.allclose(nonlinearity_arg.eval(),
+                           np.dot(input.get_value().reshape(2, -1), W) + b)
+
+    def test_param_names(self, layer):
+        assert layer.W.name == "W"
+        assert layer.b.name == "b"
+
+    def test_named_layer_param_names(self, DenseLayer, dummy_input_layer):
+        layer = DenseLayer(
+            dummy_input_layer,
+            num_units=3,
+            name="foo"
+            )
+
+        assert layer.W.name == "foo.W"
+        assert layer.b.name == "foo.b"
+
+
+class TestNINLayer:
+    @pytest.fixture
+    def dummy_input_layer(self):
+        from lasagne.layers.input import InputLayer
+        input_layer = InputLayer((2, 3, 4, 5))
+        mock = Mock(input_layer)
+        mock.shape = input_layer.shape
+        mock.input_var = input_layer.input_var
+        mock.output_shape = input_layer.output_shape
+        return mock
+
+    @pytest.fixture
+    def NINLayer(self):
+        from lasagne.layers.dense import NINLayer
+        return NINLayer
+
+    @pytest.fixture
+    def layer_vars(self, NINLayer, dummy_input_layer):
+        W = Mock()
+        b = Mock()
+        nonlinearity = Mock()
+
+        W.return_value = np.ones((3, 5))
+        b.return_value = np.ones((5,))
+        layer = NINLayer(
+            dummy_input_layer,
+            num_units=5,
+            W=W,
+            b=b,
+            nonlinearity=nonlinearity,
+            )
+
+        return {
+            'W': W,
+            'b': b,
+            'nonlinearity': nonlinearity,
+            'layer': layer,
+            }
+
+    @pytest.fixture
+    def layer(self, layer_vars):
+        return layer_vars['layer']
+
+    def test_init(self, layer_vars):
+        layer = layer_vars['layer']
+        assert (layer.W.get_value() == layer_vars['W'].return_value).all()
+        assert (layer.b.get_value() == layer_vars['b'].return_value).all()
+        layer_vars['W'].assert_called_with((3, 5))
+        layer_vars['b'].assert_called_with((5,))
+
+    def test_init_none_nonlinearity_bias(self, NINLayer, dummy_input_layer):
+        layer = NINLayer(
+            dummy_input_layer,
+            num_units=3,
+            nonlinearity=None,
+            b=None,
+            )
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_init_untie_biases(self, NINLayer, dummy_input_layer):
+        layer = NINLayer(
+            dummy_input_layer,
+            num_units=5,
+            untie_biases=True,
+            )
+        assert (layer.b.shape.eval() == (5, 4, 5)).all()
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == [layer.W, layer.b]
+        assert layer.get_params(regularizable=False) == [layer.b]
+        assert layer.get_params(regularizable=True) == [layer.W]
+        assert layer.get_params(trainable=True) == [layer.W, layer.b]
+        assert layer.get_params(trainable=False) == []
+        assert layer.get_params(_nonexistent_tag=True) == []
+        assert layer.get_params(_nonexistent_tag=False) == [layer.W, layer.b]
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for((5, 6, 7, 8)) == (5, 5, 7, 8)
+
+    @pytest.mark.parametrize("extra_kwargs", [
+        {},
+        {'untie_biases': True},
+        {'b': None},
+    ])
+    def test_get_output_for(self, dummy_input_layer, extra_kwargs):
+        from lasagne.layers.dense import NINLayer
+        nonlinearity = Mock()
+
+        layer = NINLayer(
+            dummy_input_layer,
+            num_units=6,
+            nonlinearity=nonlinearity,
+            **extra_kwargs
+            )
+
+        input = theano.shared(np.random.uniform(-1, 1, (2, 3, 4, 5)))
+        result = layer.get_output_for(input)
+        assert result is nonlinearity.return_value
+
+        nonlinearity_arg = nonlinearity.call_args[0][0]
+        X = input.get_value()
+        X = np.rollaxis(X, 1).T
+        X = np.dot(X, layer.W.get_value())
+        if layer.b is not None:
+            if layer.untie_biases:
+                X += layer.b.get_value()[:, np.newaxis].T
+            else:
+                X += layer.b.get_value()
+        X = np.rollaxis(X.T, 0, 2)
+        assert np.allclose(nonlinearity_arg.eval(), X)
+
+    def test_param_names(self, layer):
+        assert layer.W.name == "W"
+        assert layer.b.name == "b"
+
+    def test_named_layer_param_names(self, NINLayer, dummy_input_layer):
+        layer = NINLayer(
+            dummy_input_layer,
+            num_units=3,
+            name="foo"
+            )
+
+        assert layer.W.name == "foo.W"
+        assert layer.b.name == "foo.b"
+
+
+class TestNINLayer_c01b:
+    @pytest.fixture
+    def dummy_input_layer(self):
+        from lasagne.layers.input import InputLayer
+        input_layer = InputLayer((3, 4, 5, 2))
+        mock = Mock(input_layer)
+        mock.shape = input_layer.shape
+        mock.input_var = input_layer.input_var
+        mock.output_shape = input_layer.output_shape
+        return mock
+
+    @pytest.fixture
+    def NINLayer_c01b(self):
+        try:
+            from lasagne.layers.cuda_convnet import NINLayer_c01b
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+        return NINLayer_c01b
+
+    @pytest.fixture
+    def layer_vars(self, NINLayer_c01b, dummy_input_layer):
+        W = Mock()
+        b = Mock()
+        nonlinearity = Mock()
+
+        W.return_value = np.ones((5, 3))
+        b.return_value = np.ones((5,))
+        layer = NINLayer_c01b(
+            dummy_input_layer,
+            num_units=5,
+            W=W,
+            b=b,
+            nonlinearity=nonlinearity,
+            )
+
+        return {
+            'W': W,
+            'b': b,
+            'nonlinearity': nonlinearity,
+            'layer': layer,
+            }
+
+    @pytest.fixture
+    def layer(self, layer_vars):
+        return layer_vars['layer']
+
+    def test_init(self, layer_vars):
+        layer = layer_vars['layer']
+        assert (layer.W.get_value() == layer_vars['W'].return_value).all()
+        assert (layer.b.get_value() == layer_vars['b'].return_value).all()
+        layer_vars['W'].assert_called_with((5, 3))
+        layer_vars['b'].assert_called_with((5,))
+
+    def test_init_none_nonlinearity_bias(self, NINLayer_c01b,
+                                         dummy_input_layer):
+        layer = NINLayer_c01b(
+            dummy_input_layer,
+            num_units=3,
+            nonlinearity=None,
+            b=None,
+            )
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+        assert layer.b is None
+
+    def test_init_untie_biases(self, NINLayer_c01b, dummy_input_layer):
+        layer = NINLayer_c01b(
+            dummy_input_layer,
+            num_units=5,
+            untie_biases=True,
+            )
+        assert (layer.b.shape.eval() == (5, 4, 5)).all()
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == [layer.W, layer.b]
+        assert layer.get_params(regularizable=False) == [layer.b]
+        assert layer.get_params(regularizable=True) == [layer.W]
+        assert layer.get_params(trainable=True) == [layer.W, layer.b]
+        assert layer.get_params(trainable=False) == []
+        assert layer.get_params(_nonexistent_tag=True) == []
+        assert layer.get_params(_nonexistent_tag=False) == [layer.W, layer.b]
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for((6, 7, 8, 5)) == (5, 7, 8, 5)
+
+    @pytest.mark.parametrize("extra_kwargs", [
+        {},
+        {'untie_biases': True},
+        {'b': None},
+    ])
+    def test_get_output_for(self, dummy_input_layer, NINLayer_c01b,
+                            extra_kwargs):
+        nonlinearity = Mock()
+
+        layer = NINLayer_c01b(
+            dummy_input_layer,
+            num_units=6,
+            nonlinearity=nonlinearity,
+            **extra_kwargs
+            )
+
+        input = theano.shared(np.random.uniform(-1, 1, (3, 4, 5, 2)))
+        result = layer.get_output_for(input)
+        assert result is nonlinearity.return_value
+
+        nonlinearity_arg = nonlinearity.call_args[0][0]
+        X = input.get_value()
+        W = layer.W.get_value()
+        out = np.dot(W, X.reshape(X.shape[0], -1))
+        out = out.reshape(W.shape[0], X.shape[1], X.shape[2], X.shape[3])
+        if layer.b is not None:
+            if layer.untie_biases:
+                out += layer.b.get_value()[..., None]
+            else:
+                out += layer.b.get_value()[:, None, None, None]
+        assert np.allclose(nonlinearity_arg.eval(), out)
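
A rough sketch of the two layer types tested in this file (illustrative shapes, standard Lasagne API): DenseLayer flattens all trailing axes of its input, while NINLayer shares one weight matrix across every spatial position::

    import numpy as np
    import theano
    from lasagne.layers import InputLayer, DenseLayer, NINLayer, get_output

    l_in = InputLayer((2, 3, 4, 5))
    l_dense = DenseLayer(l_in, num_units=6)   # output shape (2, 6)
    l_nin = NINLayer(l_in, num_units=6)       # output shape (2, 6, 4, 5)

    x = np.random.rand(2, 3, 4, 5).astype(theano.config.floatX)
    print(get_output(l_dense, x).eval().shape)   # (2, 6)
    print(get_output(l_nin, x).eval().shape)     # (2, 6, 4, 5)
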
diff --git a/lasagne/tests/layers/test_embedding.py b/lasagne/tests/layers/test_embedding.py
new file mode 100644
index 0000000..36c8d00
--- /dev/null
+++ b/lasagne/tests/layers/test_embedding.py
@@ -0,0 +1,56 @@
+import numpy
+import pytest
+import theano
+
+
+def test_embedding_2D_input():
+    import numpy as np
+    import theano
+    import theano.tensor as T
+    from lasagne.layers import EmbeddingLayer, InputLayer, helper
+    x = T.imatrix()
+    batch_size = 2
+    seq_len = 3
+    emb_size = 5
+    vocab_size = 3
+    l_in = InputLayer((None, seq_len))
+    W = np.arange(
+        vocab_size*emb_size).reshape((vocab_size, emb_size)).astype('float32')
+    l1 = EmbeddingLayer(l_in, input_size=vocab_size, output_size=emb_size,
+                        W=W)
+
+    x_test = np.array([[0, 1, 2], [0, 0, 2]], dtype='int32')
+
+    # check output shape
+    assert helper.get_output_shape(
+        l1, (batch_size, seq_len)) == (batch_size, seq_len, emb_size)
+
+    output = helper.get_output(l1, x)
+    f = theano.function([x], output)
+    np.testing.assert_array_almost_equal(f(x_test), W[x_test])
+
+
+def test_embedding_1D_input():
+    import numpy as np
+    import theano
+    import theano.tensor as T
+    from lasagne.layers import EmbeddingLayer, InputLayer, helper
+    x = T.ivector()
+    batch_size = 2
+    emb_size = 10
+    vocab_size = 3
+    l_in = InputLayer((None,))
+    W = np.arange(
+        vocab_size*emb_size).reshape((vocab_size, emb_size)).astype('float32')
+    l1 = EmbeddingLayer(l_in, input_size=vocab_size, output_size=emb_size,
+                        W=W)
+
+    x_test = np.array([0, 1, 2], dtype='int32')
+
+    # check output shape
+    assert helper.get_output_shape(
+        l1, (batch_size, )) == (batch_size, emb_size)
+
+    output = helper.get_output(l1, x)
+    f = theano.function([x], output)
+    np.testing.assert_array_almost_equal(f(x_test), W[x_test])
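
A minimal sketch of the lookup these tests verify: the embedding output is simply W[x], adding one trailing axis of size output_size (illustrative sizes)::

    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.layers import InputLayer, EmbeddingLayer, get_output

    x = T.imatrix()
    W = np.arange(3 * 5, dtype='float32').reshape((3, 5))  # vocab 3, emb 5
    l_in = InputLayer((None, 2))
    l_emb = EmbeddingLayer(l_in, input_size=3, output_size=5, W=W)
    f = theano.function([x], get_output(l_emb, x))
    out = f(np.array([[0, 2]], dtype='int32'))   # equals W[[[0, 2]]], (1, 2, 5)
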
diff --git a/lasagne/tests/layers/test_helper.py b/lasagne/tests/layers/test_helper.py
new file mode 100644
index 0000000..326a96e
--- /dev/null
+++ b/lasagne/tests/layers/test_helper.py
@@ -0,0 +1,791 @@
+import warnings
+from mock import Mock, PropertyMock
+import pytest
+import numpy
+import theano
+
+
+class TestGetAllLayers:
+    def test_stack(self):
+        from lasagne.layers import InputLayer, DenseLayer, get_all_layers
+        from itertools import permutations
+        # l1 --> l2 --> l3
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+        # try all possible combinations and orders for a query
+        for count in (0, 1, 2, 3):
+            for query in permutations([l1, l2, l3], count):
+                if l3 in query:
+                    expected = [l1, l2, l3]
+                elif l2 in query:
+                    expected = [l1, l2]
+                elif l1 in query:
+                    expected = [l1]
+                else:
+                    expected = []
+                assert get_all_layers(query) == expected
+        # treat_as_input=[l2] should block l1 from appearing
+        assert get_all_layers(l3, treat_as_input=[l2]) == [l2, l3]
+
+    def test_merge(self):
+        from lasagne.layers import (InputLayer, DenseLayer, ElemwiseSumLayer,
+                                    get_all_layers)
+        # l1 --> l2 --> l3 --> l6
+        #        l4 --> l5 ----^
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+        l4 = InputLayer((10, 30))
+        l5 = DenseLayer(l4, 40)
+        l6 = ElemwiseSumLayer([l3, l5])
+        # try various combinations and orders for a query
+        assert get_all_layers(l6) == [l1, l2, l3, l4, l5, l6]
+        assert get_all_layers([l4, l6]) == [l4, l1, l2, l3, l5, l6]
+        assert get_all_layers([l5, l6]) == [l4, l5, l1, l2, l3, l6]
+        assert get_all_layers([l4, l2, l5, l6]) == [l4, l1, l2, l5, l3, l6]
+        # check that treat_as_input correctly blocks the search
+        assert get_all_layers(l6, treat_as_input=[l2]) == [l2, l3, l4, l5, l6]
+        assert get_all_layers(l6, treat_as_input=[l3, l5]) == [l3, l5, l6]
+        assert get_all_layers([l6, l2], treat_as_input=[l6]) == [l6, l1, l2]
+
+    def test_split(self):
+        from lasagne.layers import InputLayer, DenseLayer, get_all_layers
+        # l1 --> l2 --> l3
+        #  \---> l4
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+        l4 = DenseLayer(l1, 50)
+        # try various combinations and orders for a query
+        assert get_all_layers(l3) == [l1, l2, l3]
+        assert get_all_layers(l4) == [l1, l4]
+        assert get_all_layers([l3, l4]) == [l1, l2, l3, l4]
+        assert get_all_layers([l4, l3]) == [l1, l4, l2, l3]
+        # check that treat_as_input correctly blocks the search
+        assert get_all_layers(l3, treat_as_input=[l2]) == [l2, l3]
+        assert get_all_layers([l3, l4], treat_as_input=[l2]) == [l2, l3,
+                                                                 l1, l4]
+
+    def test_bridge(self):
+        from lasagne.layers import (InputLayer, DenseLayer, ElemwiseSumLayer,
+                                    get_all_layers)
+        # l1 --> l2 --> l3 --> l4 --> l5
+        #         \------------^
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 30)
+        l4 = ElemwiseSumLayer([l2, l3])
+        l5 = DenseLayer(l4, 40)
+        # check for correct topological order
+        assert get_all_layers(l5) == [l1, l2, l3, l4, l5]
+        # check that treat_as_input=[l4] blocks the search and =[l3] does not
+        assert get_all_layers(l5, treat_as_input=[l4]) == [l4, l5]
+        assert get_all_layers(l5, treat_as_input=[l3]) == [l1, l2, l3, l4, l5]
+
+
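
A small sketch of the traversal pinned down above: get_all_layers returns the network in topological order, and treat_as_input stops the upward search at the given layers (taken directly from the stack case)::

    from lasagne.layers import InputLayer, DenseLayer, get_all_layers

    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    l3 = DenseLayer(l2, 40)
    assert get_all_layers(l3) == [l1, l2, l3]
    assert get_all_layers(l3, treat_as_input=[l2]) == [l2, l3]
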
+class TestGetOutput_InputLayer:
+    @pytest.fixture
+    def get_output(self):
+        from lasagne.layers.helper import get_output
+        return get_output
+
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.input import InputLayer
+        return InputLayer((3, 2))
+
+    def test_get_output_without_arguments(self, layer, get_output):
+        assert get_output(layer) is layer.input_var
+
+    def test_get_output_input_is_variable(self, layer, get_output):
+        variable = theano.Variable("myvariable")
+        assert get_output(layer, variable) is variable
+
+    def test_get_output_input_is_array(self, layer, get_output):
+        inputs = [[1, 2, 3]]
+        output = get_output(layer, inputs)
+        assert numpy.all(output.eval() == inputs)
+
+    def test_get_output_input_is_a_mapping(self, layer, get_output):
+        inputs = {layer: theano.tensor.matrix()}
+        assert get_output(layer, inputs) is inputs[layer]
+
+
+class TestGetOutput_Layer:
+    @pytest.fixture
+    def get_output(self):
+        from lasagne.layers.helper import get_output
+        return get_output
+
+    @pytest.fixture
+    def layers(self):
+        from lasagne.layers.base import Layer
+        from lasagne.layers.input import InputLayer
+        # create a mock that has the same attributes as an InputLayer instance
+        l1 = Mock(InputLayer((None,)), output_shape=(None,),
+                  get_output_kwargs=[])
+        # create a mock that has the same attributes as a Layer instance
+        l2 = Mock(Layer(l1), output_shape=(None,), get_output_kwargs=[])
+        # link it to the InputLayer mock
+        l2.input_layer = l1
+        # create another mock that has the same attributes as a Layer instance
+        l3 = Mock(Layer(l2), output_shape=(None,), get_output_kwargs=['kwarg'])
+        # link it to the first mock, to get an "l1 --> l2 --> l3" chain
+        l3.input_layer = l2
+        return l1, l2, l3
+
+    def test_get_output_without_arguments(self, layers, get_output):
+        l1, l2, l3 = layers
+        output = get_output(l3)
+        # expected: l3.get_output_for(l2.get_output_for(l1.input_var))
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with(
+            l2.get_output_for.return_value)
+        l2.get_output_for.assert_called_with(
+            l1.input_var)
+
+    def test_get_output_with_single_argument(self, layers, get_output):
+        l1, l2, l3 = layers
+        inputs, kwarg = theano.tensor.matrix(), object()
+        output = get_output(l3, inputs, kwarg=kwarg)
+        # expected: l3.get_output_for(l2.get_output_for(inputs, kwarg=kwarg),
+        #                             kwarg=kwarg)
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with(
+            l2.get_output_for.return_value, kwarg=kwarg)
+        l2.get_output_for.assert_called_with(
+            inputs, kwarg=kwarg)
+
+    def test_get_output_input_is_a_mapping(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1).input_var = p
+        inputs = {l3: theano.tensor.matrix()}
+        # expected: inputs[l3]
+        assert get_output(l3, inputs) is inputs[l3]
+        # l3.get_output_for, l2.get_output_for should not have been called
+        assert l3.get_output_for.call_count == 0
+        assert l2.get_output_for.call_count == 0
+        # l1.input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_no_key(self, layers, get_output):
+        l1, l2, l3 = layers
+        output = get_output(l3, {})
+        # expected: l3.get_output_for(l2.get_output_for(l1.input_var))
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with(
+            l2.get_output_for.return_value)
+        l2.get_output_for.assert_called_with(
+            l1.input_var)
+
+    def test_get_output_input_is_a_mapping_to_array(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1).input_var = p
+        inputs = {l3: [[1, 2, 3]]}
+        output = get_output(l3, inputs)
+        # expected: inputs[l3]
+        assert numpy.all(output.eval() == inputs[l3])
+        # l3.get_output_for, l2.get_output_for should not have been called
+        assert l3.get_output_for.call_count == 0
+        assert l2.get_output_for.call_count == 0
+        # l1.input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_for_layer(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1).input_var = p
+        input_expr, kwarg = theano.tensor.matrix(), object()
+        inputs = {l2: input_expr}
+        output = get_output(l3, inputs, kwarg=kwarg)
+        # expected: l3.get_output_for(input_expr, kwarg=kwarg)
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with(input_expr, kwarg=kwarg)
+        # l2.get_output_for should not have been called
+        assert l2.get_output_for.call_count == 0
+        # l1.input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_for_input_layer(self, layers,
+                                                           get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1).input_var = p
+        input_expr, kwarg = theano.tensor.matrix(), object()
+        inputs = {l1: input_expr}
+        output = get_output(l3, inputs, kwarg=kwarg)
+        # expected: l3.get_output_for(l2.get_output_for(input_expr,
+        #                                               kwarg=kwarg),
+        #                             kwarg=kwarg)
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with(
+            l2.get_output_for.return_value, kwarg=kwarg)
+        l2.get_output_for.assert_called_with(
+            input_expr, kwarg=kwarg)
+        # l1.input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_with_unused_kwarg(self, layers, get_output):
+        l1, l2, l3 = layers
+        unused_kwarg = object()
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            get_output(l3, kwagg=unused_kwarg)
+            assert len(w) == 1
+            assert issubclass(w[0].category, UserWarning)
+            assert 'perhaps you meant kwarg' in str(w[0].message)
+
+    def test_get_output_with_no_unused_kwarg(self, layers, get_output):
+        l1, l2, l3 = layers
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            get_output(l3)
+            assert len(w) == 0
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.base import Layer
+        return Layer((None, 20))
+
+    def test_layer_from_shape_invalid_get_output(self, layer_from_shape,
+                                                 get_output):
+        layer = layer_from_shape
+        with pytest.raises(ValueError):
+            get_output(layer)
+        with pytest.raises(ValueError):
+            get_output(layer, [1, 2])
+        with pytest.raises(ValueError):
+            get_output(layer, {Mock(): [1, 2]})
+
+    def test_layer_from_shape_valid_get_output(self, layer_from_shape,
+                                               get_output):
+        layer = layer_from_shape
+        inputs = {layer: theano.tensor.matrix()}
+        assert get_output(layer, inputs) is inputs[layer]
+        inputs = {None: theano.tensor.matrix()}
+        layer.get_output_for = Mock()
+        assert get_output(layer, inputs) is layer.get_output_for.return_value
+        layer.get_output_for.assert_called_with(inputs[None])
+
+
+class TestGetOutput_MergeLayer:
+    @pytest.fixture
+    def get_output(self):
+        from lasagne.layers.helper import get_output
+        return get_output
+
+    @pytest.fixture
+    def layers(self):
+        from lasagne.layers.base import Layer, MergeLayer
+        from lasagne.layers.input import InputLayer
+        # create two mocks with the same attributes as an InputLayer instance
+        l1 = [Mock(InputLayer((None,)), output_shape=(None,),
+                   get_output_kwargs=[]),
+              Mock(InputLayer((None,)), output_shape=(None,),
+                   get_output_kwargs=[])]
+        # create two mocks with the same attributes as a Layer instance
+        l2 = [Mock(Layer(l1[0]), output_shape=(None,),
+                   get_output_kwargs=[]),
+              Mock(Layer(l1[1]), output_shape=(None,),
+                   get_output_kwargs=[])]
+        # link them to the InputLayer mocks
+        l2[0].input_layer = l1[0]
+        l2[1].input_layer = l1[1]
+        # create a mock that has the same attributes as a MergeLayer
+        l3 = Mock(MergeLayer(l2), get_output_kwargs=['kwarg'])
+        # link it to the two layer mocks, to get the following network:
+        # l1[0] --> l2[0] --> l3
+        # l1[1] --> l2[1] ----^
+        l3.input_layers = l2
+        return l1, l2, l3
+
+    def test_get_output_without_arguments(self, layers, get_output):
+        l1, l2, l3 = layers
+        output = get_output(l3)
+        # expected: l3.get_output_for([l2[0].get_output_for(l1[0].input_var),
+        #                              l2[1].get_output_for(l1[1].input_var)])
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with([
+            l2[0].get_output_for.return_value,
+            l2[1].get_output_for.return_value,
+            ])
+        l2[0].get_output_for.assert_called_with(
+            l1[0].input_var)
+        l2[1].get_output_for.assert_called_with(
+            l1[1].input_var)
+
+    def test_get_output_with_single_argument_fails(self, layers, get_output):
+        l1, l2, l3 = layers
+        inputs, kwarg = theano.tensor.matrix(), object()
+        # expected to fail: only gave one expression for two input layers
+        with pytest.raises(ValueError):
+            output = get_output(l3, inputs, kwarg=kwarg)
+
+    def test_get_output_input_is_a_mapping(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1[0]).input_var = p
+        type(l1[1]).input_var = p
+        inputs = {l3: theano.tensor.matrix()}
+        # expected: inputs[l3]
+        assert get_output(l3, inputs) is inputs[l3]
+        # l3.get_output_for, l2[*].get_output_for should not have been called
+        assert l3.get_output_for.call_count == 0
+        assert l2[0].get_output_for.call_count == 0
+        assert l2[1].get_output_for.call_count == 0
+        # l1[*].input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_no_key(self, layers, get_output):
+        l1, l2, l3 = layers
+        output = get_output(l3, {})
+        # expected: l3.get_output_for([l2[0].get_output_for(l1[0].input_var),
+        #                              l2[1].get_output_for(l1[1].input_var)])
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with([
+            l2[0].get_output_for.return_value,
+            l2[1].get_output_for.return_value,
+            ])
+        l2[0].get_output_for.assert_called_with(
+            l1[0].input_var)
+        l2[1].get_output_for.assert_called_with(
+            l1[1].input_var)
+
+    def test_get_output_input_is_a_mapping_to_array(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1[0]).input_var = p
+        type(l1[1]).input_var = p
+        inputs = {l3: [[1, 2, 3]]}
+        output = get_output(l3, inputs)
+        # expected: inputs[l3]
+        assert numpy.all(output.eval() == inputs[l3])
+        # l3.get_output_for, l2[*].get_output_for should not have been called
+        assert l3.get_output_for.call_count == 0
+        assert l2[0].get_output_for.call_count == 0
+        assert l2[1].get_output_for.call_count == 0
+        # l1[*].input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_for_layer(self, layers, get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1[0]).input_var = p
+        input_expr, kwarg = theano.tensor.matrix(), object()
+        inputs = {l2[0]: input_expr}
+        output = get_output(l3, inputs, kwarg=kwarg)
+        # expected: l3.get_output_for([input_expr,
+        #                              l2[1].get_output_for(l1[1].input_var,
+        #                                                   kwarg=kwarg)],
+        #                              kwarg=kwarg)
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with([
+            input_expr,
+            l2[1].get_output_for.return_value,
+            ], kwarg=kwarg)
+        l2[1].get_output_for.assert_called_with(
+            l1[1].input_var, kwarg=kwarg)
+        # l2[0].get_output_for should not have been called
+        assert l2[0].get_output_for.call_count == 0
+        # l1[0].input_var should not have been accessed
+        assert p.call_count == 0
+
+    def test_get_output_input_is_a_mapping_for_input_layer(self, layers,
+                                                           get_output):
+        l1, l2, l3 = layers
+        p = PropertyMock()
+        type(l1[0]).input_var = p
+        input_expr, kwarg = theano.tensor.matrix(), object()
+        inputs = {l1[0]: input_expr}
+        output = get_output(l3, inputs, kwarg=kwarg)
+        # expected: l3.get_output_for([l2[0].get_output_for(input_expr,
+        #                                                   kwarg=kwarg),
+        #                              l2[1].get_output_for(l1[1].input_var,
+        #                                                   kwarg=kwarg)],
+        #                              kwarg=kwarg)
+        assert output is l3.get_output_for.return_value
+        l3.get_output_for.assert_called_with([
+            l2[0].get_output_for.return_value,
+            l2[1].get_output_for.return_value,
+            ], kwarg=kwarg)
+        l2[0].get_output_for.assert_called_with(
+            input_expr, kwarg=kwarg)
+        l2[1].get_output_for.assert_called_with(
+            l1[1].input_var, kwarg=kwarg)
+        # l1[0].input_var should not have been accessed
+        assert p.call_count == 0
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.input import InputLayer
+        from lasagne.layers.base import MergeLayer
+        return MergeLayer([
+            (None, 20),
+            Mock(InputLayer((None,)), output_shape=(None,))])
+
+    def test_layer_from_shape_invalid_get_output(self, layer_from_shape,
+                                                 get_output):
+        layer = layer_from_shape
+        with pytest.raises(ValueError):
+            get_output(layer)
+        with pytest.raises(ValueError):
+            get_output(layer, [1, 2])
+        with pytest.raises(ValueError):
+            get_output(layer, {layer.input_layers[1]: [1, 2]})
+
+    def test_layer_from_shape_valid_get_output(self, layer_from_shape,
+                                               get_output):
+        layer = layer_from_shape
+        inputs = {layer: theano.tensor.matrix()}
+        assert get_output(layer, inputs) is inputs[layer]
+        inputs = {None: theano.tensor.matrix()}
+        layer.get_output_for = Mock()
+        assert get_output(layer, inputs) is layer.get_output_for.return_value
+        layer.get_output_for.assert_called_with(
+            [inputs[None], layer.input_layers[1].input_var])
+
+
+class TestGetOutputShape_InputLayer:
+    @pytest.fixture
+    def get_output_shape(self):
+        from lasagne.layers.helper import get_output_shape
+        return get_output_shape
+
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.input import InputLayer
+        return InputLayer((3, 2))
+
+    def test_get_output_shape_without_arguments(self, layer, get_output_shape):
+        assert get_output_shape(layer) == (3, 2)
+
+    def test_get_output_shape_input_is_tuple(self, layer, get_output_shape):
+        shp = (4, 5, 6)
+        assert get_output_shape(layer, shp) == shp
+
+    def test_get_output_shape_input_is_a_mapping(self, layer,
+                                                 get_output_shape):
+        input_shapes = {layer: (4, 5, 6)}
+        assert get_output_shape(layer, input_shapes) == input_shapes[layer]
+
+
+class TestGetOutputShape_Layer:
+    @pytest.fixture
+    def get_output_shape(self):
+        from lasagne.layers.helper import get_output_shape
+        return get_output_shape
+
+    @pytest.fixture
+    def layers(self):
+        from lasagne.layers.base import Layer
+        from lasagne.layers.input import InputLayer
+        # create a mock that has the same attributes as an InputLayer instance
+        l1 = Mock(InputLayer((None,)), output_shape=(None,))
+        # create a mock that has the same attributes as a Layer instance
+        l2 = Mock(Layer(l1), output_shape=(None,))
+        # link it to the InputLayer mock
+        l2.input_layer = l1
+        # create another mock that has the same attributes as a Layer instance
+        l3 = Mock(Layer(l2), output_shape=(None,))
+        # link it to the first mock, to get an "l1 --> l2 --> l3" chain
+        l3.input_layer = l2
+        return l1, l2, l3
+
+    def test_get_output_shape_without_arguments(self, layers,
+                                                get_output_shape):
+        l1, l2, l3 = layers
+        output_shape = get_output_shape(l3)
+        # expected: l3.output_shape
+        assert output_shape is l3.output_shape
+        # l3.get_output_shape_for, l2.get_output_shape_for should not have been
+        # called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2.get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_with_single_argument(self, layers,
+                                                   get_output_shape):
+        l1, l2, l3 = layers
+        shp = (3, 4, 5)
+        output_shape = get_output_shape(l3, shp)
+        # expected: l3.get_output_shape_for(l2.get_output_shape_for(shp))
+        assert output_shape is l3.get_output_shape_for.return_value
+        l3.get_output_shape_for.assert_called_with(
+            l2.get_output_shape_for.return_value)
+        l2.get_output_shape_for.assert_called_with(shp)
+
+    def test_get_output_shape_input_is_a_mapping(self, layers,
+                                                 get_output_shape):
+        l1, l2, l3 = layers
+        input_shapes = {l3: (4, 5, 6)}
+        # expected: input_shapes[l3]
+        assert get_output_shape(l3, input_shapes) is input_shapes[l3]
+        # l3.get_output_shape_for, l2.get_output_shape_for should not have been
+        # called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2.get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_no_key(self, layers,
+                                                        get_output_shape):
+        l1, l2, l3 = layers
+        output_shape = get_output_shape(l3, {})
+        # expected: l3.output_shape
+        assert output_shape is l3.output_shape
+        # l3.get_output_shape_for, l2.get_output_shape_for should not have been
+        # called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2.get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_for_layer(self, layers,
+                                                           get_output_shape):
+        l1, l2, l3 = layers
+        shp = (4, 5, 6)
+        input_shapes = {l2: shp}
+        output_shape = get_output_shape(l3, input_shapes)
+        # expected: l3.get_output_shape_for(shp)
+        assert output_shape is l3.get_output_shape_for.return_value
+        l3.get_output_shape_for.assert_called_with(shp)
+        # l2.get_output_shape_for should not have been called
+        assert l2.get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_for_input_layer(
+            self, layers, get_output_shape):
+        l1, l2, l3 = layers
+        shp = (4, 5, 6)
+        input_shapes = {l1: shp}
+        output_shape = get_output_shape(l3, input_shapes)
+        # expected: l3.get_output_shape_for(l2.get_output_shape_for(shp))
+        assert output_shape is l3.get_output_shape_for.return_value
+        l3.get_output_shape_for.assert_called_with(
+            l2.get_output_shape_for.return_value)
+        l2.get_output_shape_for.assert_called_with(shp)
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.base import Layer
+        return Layer((None, 20))
+
+    def test_layer_from_shape(self, layer_from_shape, get_output_shape):
+        layer = layer_from_shape
+        input_shapes = {layer: (4, 5, 6)}
+        assert get_output_shape(layer, input_shapes) is input_shapes[layer]
+        input_shapes = {None: (4, 5, 6)}
+        layer.get_output_shape_for = Mock()
+        assert (get_output_shape(layer, input_shapes) is
+                layer.get_output_shape_for.return_value)
+        layer.get_output_shape_for.assert_called_with(input_shapes[None])
+
+
+class TestGetOutputShape_MergeLayer:
+    @pytest.fixture
+    def get_output_shape(self):
+        from lasagne.layers.helper import get_output_shape
+        return get_output_shape
+
+    @pytest.fixture
+    def layers(self):
+        from lasagne.layers.base import Layer, MergeLayer
+        from lasagne.layers.input import InputLayer
+        # create two mocks with the same attributes as an InputLayer instance
+        l1 = [Mock(InputLayer((None,)), output_shape=(None,)),
+              Mock(InputLayer((None,)), output_shape=(None,))]
+        # create two mocks with the same attributes as a Layer instance
+        l2 = [Mock(Layer(l1[0]), output_shape=(None,)),
+              Mock(Layer(l1[1]), output_shape=(None,))]
+        # link them to the InputLayer mocks
+        l2[0].input_layer = l1[0]
+        l2[1].input_layer = l1[1]
+        # create a mock that has the same attributes as a MergeLayer
+        l3 = Mock(MergeLayer(l2))
+        # link it to the two layer mocks, to get the following network:
+        # l1[0] --> l2[0] --> l3
+        # l1[1] --> l2[1] ----^
+        l3.input_layers = l2
+        return l1, l2, l3
+
+    def test_get_output_shape_without_arguments(self, layers,
+                                                get_output_shape):
+        l1, l2, l3 = layers
+        output_shape = get_output_shape(l3)
+        # expected: l3.output_shape
+        assert output_shape is l3.output_shape
+        # l3.get_output_shape_for, l2[*].get_output_shape_for should not have
+        # been called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2[0].get_output_shape_for.call_count == 0
+        assert l2[1].get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_with_single_argument_fails(self, layers,
+                                                         get_output_shape):
+        l1, l2, l3 = layers
+        shp = (4, 5, 6)
+        # expected to fail: only gave one shape tuple for two input layers
+        with pytest.raises(ValueError):
+            output_shape = get_output_shape(l3, shp)
+
+    def test_get_output_shape_input_is_a_mapping(self, layers,
+                                                 get_output_shape):
+        l1, l2, l3 = layers
+        input_shapes = {l3: (4, 5, 6)}
+        # expected: input_shapes[l3]
+        assert get_output_shape(l3, input_shapes) is input_shapes[l3]
+        # l3.get_output_shape_for, l2[*].get_output_shape_for should not have
+        # been called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2[0].get_output_shape_for.call_count == 0
+        assert l2[1].get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_no_key(self, layers,
+                                                        get_output_shape):
+        l1, l2, l3 = layers
+        output_shape = get_output_shape(l3, {})
+        # expected: l3.output_shape
+        assert output_shape is l3.output_shape
+        # l3.get_output_shape_for, l2[*].get_output_shape_for should not have
+        # been called
+        assert l3.get_output_shape_for.call_count == 0
+        assert l2[0].get_output_shape_for.call_count == 0
+        assert l2[1].get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_for_layer(self, layers,
+                                                           get_output_shape):
+        l1, l2, l3 = layers
+        shp = (4, 5, 6)
+        input_shapes = {l2[0]: shp}
+        output = get_output_shape(l3, input_shapes)
+        # expected: l3.get_output_shape_for(
+        #     [shp, l2[1].get_output_shape_for(l1[1].shape)])
+        assert output is l3.get_output_shape_for.return_value
+        l3.get_output_shape_for.assert_called_with([
+            shp, l2[1].get_output_shape_for.return_value])
+        l2[1].get_output_shape_for.assert_called_with(l1[1].shape)
+        # l2[0].get_output_shape_for should not have been called
+        assert l2[0].get_output_shape_for.call_count == 0
+
+    def test_get_output_shape_input_is_a_mapping_for_input_layer(
+            self, layers, get_output_shape):
+        l1, l2, l3 = layers
+        shp = (4, 5, 6)
+        input_shapes = {l1[0]: shp}
+        output = get_output_shape(l3, input_shapes)
+        # expected: l3.get_output_shape_for(
+        #     [l2[0].get_output_shape_for(shp),
+        #      l2[1].get_output_shape_for(l1[1].shape)])
+        assert output is l3.get_output_shape_for.return_value
+        l3.get_output_shape_for.assert_called_with([
+            l2[0].get_output_shape_for.return_value,
+            l2[1].get_output_shape_for.return_value,
+            ])
+        l2[0].get_output_shape_for.assert_called_with(shp)
+        l2[1].get_output_shape_for.assert_called_with(l1[1].shape)
+
+    @pytest.fixture
+    def layer_from_shape(self):
+        from lasagne.layers.input import InputLayer
+        from lasagne.layers.base import MergeLayer
+        return MergeLayer([
+            (None, 20),
+            Mock(InputLayer((None,)), output_shape=(None,))])
+
+    def test_layer_from_shape_valid_get_output_shape(self, layer_from_shape,
+                                                     get_output_shape):
+        layer = layer_from_shape
+        input_shapes = {layer: (4, 5, 6)}
+        assert get_output_shape(layer, input_shapes) is input_shapes[layer]
+        input_shapes = {None: (4, 5, 6)}
+        layer.get_output_shape_for = Mock()
+        assert (get_output_shape(layer, input_shapes) is
+                layer.get_output_shape_for.return_value)
+        layer.get_output_shape_for.assert_called_with(
+            [input_shapes[None], layer.input_layers[1].shape])
+
+
+class TestGetAllParams:
+    def test_get_all_params(self):
+        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+
+        assert get_all_params(l3) == l2.get_params() + l3.get_params()
+        assert (get_all_params(l3, regularizable=False) ==
+                (l2.get_params(regularizable=False) +
+                 l3.get_params(regularizable=False)))
+
+        assert (get_all_params(l3, regularizable=True) ==
+                (l2.get_params(regularizable=True) +
+                 l3.get_params(regularizable=True)))
+
+    def test_get_all_params_with_unwrap_shared(self):
+        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
+        import theano.tensor as T
+        from lasagne.utils import floatX
+
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+
+        W1 = theano.shared(floatX(numpy.zeros((30, 2))))
+        W2 = theano.shared(floatX(numpy.zeros((2, 40))))
+        W_expr = T.dot(W1, W2)
+        l3 = DenseLayer(l2, 40, W=W_expr, b=None)
+
+        l2_params = get_all_params(l2)
+        assert get_all_params(l3) == l2_params + [W1, W2]
+        assert get_all_params(l3, unwrap_shared=False) == l2_params + [W_expr]
+
+
+class TestCountParams:
+    def test_get_all_params(self):
+        from lasagne.layers import (InputLayer, DenseLayer, count_params)
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+
+        num_weights = 20 * 30 + 30 * 40
+        num_biases = 30 + 40
+
+        assert count_params(l3, regularizable=True) == num_weights
+        assert count_params(l3, regularizable=False) == num_biases
+        assert count_params(l3) == num_weights + num_biases
+
+
+class TestGetAllParamValues:
+    def test_get_all_param_values(self):
+        from lasagne.layers import (InputLayer, DenseLayer,
+                                    get_all_param_values)
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+
+        pvs = get_all_param_values(l3)
+        assert len(pvs) == 4
+
+
+class TestSetAllParamValues:
+    def test_set_all_param_values(self):
+        from lasagne.layers import (InputLayer, DenseLayer,
+                                    set_all_param_values)
+        from lasagne.utils import floatX
+
+        l1 = InputLayer((10, 20))
+        l2 = DenseLayer(l1, 30)
+        l3 = DenseLayer(l2, 40)
+
+        a2 = floatX(numpy.random.normal(0, 1, (20, 30)))
+        b2 = floatX(numpy.random.normal(0, 1, (30,)))
+        a3 = floatX(numpy.random.normal(0, 1, (30, 40)))
+        b3 = floatX(numpy.random.normal(0, 1, (40,)))
+        set_all_param_values(l3, [a2, b2, a3, b3])
+        assert numpy.allclose(l3.W.get_value(), a3)
+        assert numpy.allclose(l3.b.get_value(), b3)
+        assert numpy.allclose(l2.W.get_value(), a2)
+        assert numpy.allclose(l2.b.get_value(), b2)
+
+        with pytest.raises(ValueError):
+            set_all_param_values(l3, [a3, b3, a2])
+
+        with pytest.raises(ValueError):
+            a3_bad = floatX(numpy.random.normal(0, 1, (25, 40)))
+            set_all_param_values(l3, [a2, b2, a3_bad, b3])
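
A compact sketch of the helper API exercised in this file (names match the tests; shapes are illustrative): get_output accepts a {layer: expression} mapping to substitute inputs anywhere in the network, and the parameter helpers walk the whole graph::

    import theano.tensor as T
    from lasagne.layers import (InputLayer, DenseLayer, get_output,
                                get_all_params, count_params,
                                get_all_param_values, set_all_param_values)

    l_in = InputLayer((10, 20))
    l_hid = DenseLayer(l_in, 30)
    l_out = DenseLayer(l_hid, 40)

    x = T.matrix()
    y = get_output(l_out, {l_in: x})        # feed a custom input expression
    params = get_all_params(l_out)          # [W, b, W, b] in network order
    assert count_params(l_out) == (20 * 30 + 30) + (30 * 40 + 40)
    set_all_param_values(l_out, get_all_param_values(l_out))  # round-trip
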
diff --git a/lasagne/tests/layers/test_input.py b/lasagne/tests/layers/test_input.py
new file mode 100644
index 0000000..88654e5
--- /dev/null
+++ b/lasagne/tests/layers/test_input.py
@@ -0,0 +1,41 @@
+import pytest
+import theano
+
+
+class TestInputLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.input import InputLayer
+        return InputLayer((3, 2))
+
+    def test_input_var(self, layer):
+        assert layer.input_var.ndim == 2
+
+    def test_shape(self, layer):
+        assert layer.shape == (3, 2)
+
+    def test_input_var_name(self, layer):
+        assert layer.input_var.name == "input"
+
+    def test_named_layer_input_var_name(self):
+        from lasagne.layers.input import InputLayer
+        layer = InputLayer((3, 2), name="foo")
+        assert layer.input_var.name == "foo.input"
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == []
+
+    def test_bad_shape_fails(self):
+        from lasagne.layers.input import InputLayer
+        input_var = theano.tensor.tensor4()
+
+        with pytest.raises(ValueError):
+            InputLayer((3, 2), input_var)
+
+    def test_nonpositive_input_dims_raises_value_error(self):
+        from lasagne.layers import InputLayer
+        with pytest.raises(ValueError):
+            InputLayer(shape=(None, -1, -1))
+        with pytest.raises(ValueError):
+            InputLayer(shape=(None, 0, 0))
+        InputLayer(shape=(None, 1, 1))
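
A short sketch of what these checks enforce: an explicit input_var must match the declared number of dimensions, and every fixed dimension must be positive (variable names are illustrative)::

    import theano.tensor as T
    from lasagne.layers import InputLayer

    l_ok = InputLayer((None, 3, 32, 32), input_var=T.tensor4('x'))
    # InputLayer((3, 2), T.tensor4())   -> ValueError: wrong number of dims
    # InputLayer(shape=(None, 0, 0))    -> ValueError: non-positive dims
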
diff --git a/lasagne/tests/layers/test_merge.py b/lasagne/tests/layers/test_merge.py
new file mode 100644
index 0000000..470ea5c
--- /dev/null
+++ b/lasagne/tests/layers/test_merge.py
@@ -0,0 +1,256 @@
+from mock import Mock
+import numpy
+import pytest
+import theano
+
+
+class TestAutocrop:
+    # Test the internal cropping helpers (autocrop, autocrop_array_shapes)
+    def test_autocrop_array_shapes(self):
+        from lasagne.layers.merge import autocrop_array_shapes
+        crop0 = None
+        crop1 = [None, 'lower', 'center', 'upper']
+        # Too few crop modes; should get padded with None
+        crop2 = ['lower', 'upper']
+        # Invalid crop modes
+        crop_bad = ['lower', 'upper', 'bad', 'worse']
+
+        assert autocrop_array_shapes(
+            [(1, 2, 3, 4), (5, 6, 7, 8), (5, 4, 3, 2)], crop0) == \
+            [(1, 2, 3, 4), (5, 6, 7, 8), (5, 4, 3, 2)]
+        assert autocrop_array_shapes(
+            [(1, 2, 3, 4), (5, 6, 7, 8), (5, 4, 3, 2)], crop1) == \
+            [(1, 2, 3, 2), (5, 2, 3, 2), (5, 2, 3, 2)]
+        assert autocrop_array_shapes(
+            [(1, 2, 3, 4), (5, 6, 7, 8), (5, 4, 3, 2)], crop2) == \
+            [(1, 2, 3, 4), (1, 2, 7, 8), (1, 2, 3, 2)]
+
+        with pytest.raises(ValueError):
+            autocrop_array_shapes(
+                [(1, 2, 3, 4), (5, 6, 7, 8), (5, 4, 3, 2)], crop_bad)
+
+        # Inconsistent dimensionality
+        with pytest.raises(ValueError):
+            autocrop_array_shapes(
+                [(1, 2, 3, 4), (5, 6, 7), (5, 4, 3, 2, 10)], crop1)
+
+    def test_crop_inputs(self):
+        from lasagne.layers.merge import autocrop
+        from numpy.testing import assert_array_equal
+        crop_0 = None
+        crop_1 = [None, 'lower', 'center', 'upper']
+        crop_l = ['lower', 'lower', 'lower', 'lower']
+        crop_c = ['center', 'center', 'center', 'center']
+        crop_u = ['upper', 'upper', 'upper', 'upper']
+        crop_x = ['lower', 'lower']
+        crop_bad = ['lower', 'lower', 'bad', 'worse']
+
+        x0 = numpy.random.random((2, 3, 5, 7))
+        x1 = numpy.random.random((1, 2, 3, 4))
+        x2 = numpy.random.random((6, 3, 4, 2))
+
+        def crop_test(cropping, inputs, expected):
+            inputs = [theano.shared(x) for x in inputs]
+            outs = autocrop(inputs, cropping)
+            outs = [o.eval() for o in outs]
+            assert len(outs) == len(expected)
+            for o, e in zip(outs, expected):
+                assert_array_equal(o, e)
+
+        crop_test(crop_0, [x0, x1],
+                  [x0, x1])
+        crop_test(crop_1, [x0, x1],
+                  [x0[:, :2, 1:4, 3:], x1[:, :, :, :]])
+        crop_test(crop_l, [x0, x1],
+                  [x0[:1, :2, :3, :4], x1[:, :, :, :]])
+        crop_test(crop_c, [x0, x1],
+                  [x0[:1, :2, 1:4, 1:5], x1[:, :, :, :]])
+        crop_test(crop_u, [x0, x1],
+                  [x0[1:, 1:, 2:, 3:], x1[:, :, :, :]])
+
+        crop_test(crop_0, [x0, x2],
+                  [x0, x2])
+        crop_test(crop_1, [x0, x2],
+                  [x0[:, :, :4, 5:], x2[:, :, :, :]])
+        crop_test(crop_l, [x0, x2],
+                  [x0[:, :, :4, :2], x2[:2, :, :, :]])
+        crop_test(crop_c, [x0, x2],
+                  [x0[:, :, :4, 2:4], x2[2:4, :, :, :]])
+        crop_test(crop_u, [x0, x2],
+                  [x0[:, :, 1:, 5:], x2[4:, :, :, :]])
+
+        crop_test(crop_0, [x0, x1, x2],
+                  [x0, x1, x2])
+        crop_test(crop_1, [x0, x1, x2],
+                  [x0[:, :2, 1:4, 5:], x1[:, :, :, 2:], x2[:, :2, :3, :]])
+        crop_test(crop_l, [x0, x1, x2],
+                  [x0[:1, :2, :3, :2], x1[:, :, :, :2], x2[:1, :2, :3, :]])
+        crop_test(crop_c, [x0, x1, x2],
+                  [x0[:1, :2, 1:4, 2:4], x1[:, :, :, 1:3], x2[2:3, :2, :3, :]])
+        crop_test(crop_u, [x0, x1, x2],
+                  [x0[1:, 1:, 2:, 5:], x1[:, :, :, 2:], x2[5:, 1:, 1:, :]])
+
+        crop_test(crop_x, [x0, x1, x2],
+                  [x0[:1, :2, :, :], x1[:1, :2, :, :], x2[:1, :2, :, :]])
+
+        # test that the number of outputs is correct when there are more
+        # inputs than the inputs have dimensions (six 4-d inputs here)
+        crop_test(crop_x, [x0, x1, x2, x0, x1, x2],
+                  [x0[:1, :2, :, :], x1[:1, :2, :, :], x2[:1, :2, :, :],
+                   x0[:1, :2, :, :], x1[:1, :2, :, :], x2[:1, :2, :, :]])
+
+        with pytest.raises(ValueError):
+            crop_test(crop_bad, [x0, x1, x2],
+                      [x0[:1, :2, :, :], x1[:1, :2, :, :], x2[:1, :2, :, :]])
+
+        # Inconsistent dimensionality
+        with pytest.raises(ValueError):
+            crop_test(crop_bad, [x0[:, :, :, 0], x1, x2[:, :, :, :, None]],
+                      [x0[:1, :2, :, :], x1[:1, :2, :, :], x2[:1, :2, :, :]])
+
+
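
A sketch of the per-axis cropping modes tested above: None leaves an axis untouched, while 'lower', 'center' and 'upper' keep the first, middle or last slice of the smallest extent among the inputs along that axis (illustrative shapes)::

    import numpy as np
    import theano
    from lasagne.layers.merge import autocrop

    a = theano.shared(np.arange(2 * 6).reshape(2, 6))
    b = theano.shared(np.arange(2 * 4).reshape(2, 4))
    out_a, out_b = autocrop([a, b], [None, 'center'])
    # axis 0 untouched; axis 1 cropped to width 4: a[:, 1:5] and b[:, :4]
    print(out_a.eval().shape, out_b.eval().shape)   # (2, 4) (2, 4)
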
+class TestConcatLayer:
+    def layer(self, axis):
+        from lasagne.layers.merge import ConcatLayer
+        return ConcatLayer([Mock(), Mock()], axis=axis)
+
+    @pytest.fixture
+    def crop_layer_0(self):
+        from lasagne.layers.merge import ConcatLayer
+        return ConcatLayer([Mock(), Mock()], axis=0,
+                           cropping=['lower'] * 2)
+
+    @pytest.fixture
+    def crop_layer_1(self):
+        from lasagne.layers.merge import ConcatLayer
+        return ConcatLayer([Mock(), Mock()], axis=1,
+                           cropping=['lower'] * 2)
+
+    @pytest.mark.parametrize("axis", (1, -1))
+    def test_get_output_shape_for(self, axis):
+        layer = self.layer(axis)
+        assert layer.get_output_shape_for([(3, 2), (3, 5)]) == (3, 7)
+        assert layer.get_output_shape_for([(3, 2), (3, None)]) == (3, None)
+        assert layer.get_output_shape_for([(None, 2), (3, 5)]) == (3, 7)
+        assert layer.get_output_shape_for([(None, 2), (None, 5)]) == (None, 7)
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(4, None), (3, 5)])
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(3, 2), (4, None)])
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(None, 2), (3, 5), (4, 5)])
+
+    def test_get_output_shape_for_cropped(self, crop_layer_0, crop_layer_1):
+        input_shapes = [(3, 2), (4, 5)]
+        result_0 = crop_layer_0.get_output_shape_for(input_shapes)
+        result_1 = crop_layer_1.get_output_shape_for(input_shapes)
+        assert result_0 == (7, 2)
+        assert result_1 == (3, 7)
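+        # With cropping, non-concatenation axes are cut down to the smallest
+        # input size and the concatenation axis is summed:
+        # axis=0 gives (3 + 4, min(2, 5)) == (7, 2),
+        # axis=1 gives (min(3, 4), 2 + 5) == (3, 7).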
+
+    @pytest.mark.parametrize("axis", (1, -1))
+    def test_get_output_for(self, axis):
+        layer = self.layer(axis)
+        inputs = [theano.shared(numpy.ones((3, 3))),
+                  theano.shared(numpy.ones((3, 2)))]
+        result = layer.get_output_for(inputs)
+        result_eval = result.eval()
+        desired_result = numpy.hstack([input.get_value() for input in inputs])
+        assert (result_eval == desired_result).all()
+
+    def test_get_output_for_cropped(self, crop_layer_0, crop_layer_1):
+        x0 = numpy.random.random((5, 3))
+        x1 = numpy.random.random((4, 2))
+        inputs = [theano.shared(x0),
+                  theano.shared(x1)]
+        result_0 = crop_layer_0.get_output_for(inputs).eval()
+        result_1 = crop_layer_1.get_output_for(inputs).eval()
+        desired_result_0 = numpy.concatenate([x0[:, :2], x1[:, :2]], axis=0)
+        desired_result_1 = numpy.concatenate([x0[:4, :], x1[:4, :]], axis=1)
+        assert (result_0 == desired_result_0).all()
+        assert (result_1 == desired_result_1).all()
+
+
+class TestElemwiseSumLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.merge import ElemwiseSumLayer
+        return ElemwiseSumLayer([Mock(), Mock()], coeffs=[2, -1])
+
+    @pytest.fixture
+    def crop_layer(self):
+        from lasagne.layers.merge import ElemwiseSumLayer
+        return ElemwiseSumLayer([Mock(), Mock()], coeffs=[2, -1],
+                                cropping=['lower'] * 2)
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for([(3, 2), (3, 2)]) == (3, 2)
+        assert layer.get_output_shape_for([(3, 2), (3, None)]) == (3, 2)
+        assert layer.get_output_shape_for([(None, 2), (3, 2)]) == (3, 2)
+        assert layer.get_output_shape_for([(None, 2), (None, 2)]) == (None, 2)
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(3, None), (4, 2)])
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(3, 2), (4, None)])
+        with pytest.raises(ValueError):
+            layer.get_output_shape_for([(None, 2), (3, 2), (4, 2)])
+
+    def test_get_output_for(self, layer):
+        a = numpy.array([[0, 1], [2, 3]])
+        b = numpy.array([[1, 2], [4, 5]])
+        inputs = [theano.shared(a),
+                  theano.shared(b)]
+        result = layer.get_output_for(inputs)
+        result_eval = result.eval()
+        desired_result = 2*a - b
+        assert (result_eval == desired_result).all()
+
+    def test_get_output_for_cropped(self, crop_layer):
+        from numpy.testing import assert_array_almost_equal as aeq
+        x0 = numpy.random.random((5, 3))
+        x1 = numpy.random.random((4, 2))
+        inputs = [theano.shared(x0),
+                  theano.shared(x1)]
+        result = crop_layer.get_output_for(inputs).eval()
+        desired_result = 2*x0[:4, :2] - x1[:4, :2]
+        aeq(result, desired_result)
+
+    def test_bad_coeffs_fails(self, layer):
+        from lasagne.layers.merge import ElemwiseSumLayer
+        with pytest.raises(ValueError):
+            ElemwiseSumLayer([Mock(), Mock()], coeffs=[2, 3, -1])
+
+
+class TestElemwiseMergeLayerMul:
+    @pytest.fixture
+    def layer(self):
+        import theano.tensor as T
+        from lasagne.layers.merge import ElemwiseMergeLayer
+        return ElemwiseMergeLayer([Mock(), Mock()], merge_function=T.mul)
+
+    def test_get_output_for(self, layer):
+        a = numpy.array([[0, 1], [2, 3]])
+        b = numpy.array([[1, 2], [4, 5]])
+        inputs = [theano.shared(a),
+                  theano.shared(b)]
+        result = layer.get_output_for(inputs)
+        result_eval = result.eval()
+        desired_result = a*b
+        assert (result_eval == desired_result).all()
+
+
+class TestElemwiseMergeLayerMaximum:
+    @pytest.fixture
+    def layer(self):
+        import theano.tensor as T
+        from lasagne.layers.merge import ElemwiseMergeLayer
+        return ElemwiseMergeLayer([Mock(), Mock()], merge_function=T.maximum)
+
+    def test_get_output_for(self, layer):
+        a = numpy.array([[0, 1], [2, 3]])
+        b = numpy.array([[1, 2], [4, 5]])
+        inputs = [theano.shared(a),
+                  theano.shared(b)]
+        result = layer.get_output_for(inputs)
+        result_eval = result.eval()
+        desired_result = numpy.maximum(a, b)
+        assert (result_eval == desired_result).all()
diff --git a/lasagne/tests/layers/test_noise.py b/lasagne/tests/layers/test_noise.py
new file mode 100644
index 0000000..0bc598c
--- /dev/null
+++ b/lasagne/tests/layers/test_noise.py
@@ -0,0 +1,127 @@
+from mock import Mock
+import numpy
+from numpy.random import RandomState
+import theano
+import pytest
+
+from lasagne.random import get_rng, set_rng
+
+
+class TestDropoutLayer:
+    @pytest.fixture(params=[(100, 100), (None, 100)])
+    def input_layer(self, request):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(request.param)
+
+    @pytest.fixture
+    def layer(self, input_layer):
+        from lasagne.layers.noise import DropoutLayer
+        return DropoutLayer(input_layer)
+
+    @pytest.fixture
+    def layer_no_rescale(self, input_layer):
+        from lasagne.layers.noise import DropoutLayer
+        return DropoutLayer(input_layer, rescale=False)
+
+    @pytest.fixture
+    def layer_p_02(self, input_layer):
+        from lasagne.layers.noise import DropoutLayer
+        return DropoutLayer(input_layer, p=0.2)
+
+    def test_get_output_for_non_deterministic(self, layer):
+        input = theano.shared(numpy.ones((100, 100)))
+        result = layer.get_output_for(input)
+        result_eval = result.eval()
+        assert 0.9 < result_eval.mean() < 1.1
+        assert (numpy.unique(result_eval) == [0., 2.]).all()
+
+    def test_get_output_for_deterministic(self, layer):
+        input = theano.shared(numpy.ones((100, 100)))
+        result = layer.get_output_for(input, deterministic=True)
+        result_eval = result.eval()
+        assert (result_eval == input.get_value()).all()
+
+    def test_get_output_for_no_rescale(self, layer_no_rescale):
+        input = theano.shared(numpy.ones((100, 100)))
+        result = layer_no_rescale.get_output_for(input)
+        result_eval = result.eval()
+        assert 0.4 < result_eval.mean() < 0.6
+        assert (numpy.unique(result_eval) == [0., 1.]).all()
+
+    def test_get_output_for_no_rescale_dtype(self, layer_no_rescale):
+        input = theano.shared(numpy.ones((100, 100), dtype=numpy.int32))
+        result = layer_no_rescale.get_output_for(input)
+        assert result.dtype == input.dtype
+
+    def test_get_output_for_p_02(self, layer_p_02):
+        input = theano.shared(numpy.ones((100, 100)))
+        result = layer_p_02.get_output_for(input)
+        result_eval = result.eval()
+        assert 0.9 < result_eval.mean() < 1.1
+        assert (numpy.round(numpy.unique(result_eval), 2) == [0., 1.25]).all()
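+        # With rescaling enabled, surviving units are divided by the retain
+        # probability 1 - p, so p=0.2 maps the kept ones to 1 / 0.8 == 1.25
+        # (and the default p=0.5 above maps them to 2.0).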
+
+    def test_get_output_for_p_float32(self, input_layer):
+        from lasagne.layers.noise import DropoutLayer
+        layer = DropoutLayer(input_layer, p=numpy.float32(0.5))
+        input = theano.shared(numpy.ones((100, 100), dtype=numpy.float32))
+        assert layer.get_output_for(input).dtype == input.dtype
+
+    def test_specified_rng(self, input_layer):
+        from lasagne.layers.noise import DropoutLayer
+        input = theano.shared(numpy.ones((100, 100)))
+        seed = 123456789
+        rng = get_rng()
+
+        set_rng(RandomState(seed))
+        result = DropoutLayer(input_layer).get_output_for(input)
+        result_eval1 = result.eval()
+
+        set_rng(RandomState(seed))
+        result = DropoutLayer(input_layer).get_output_for(input)
+        result_eval2 = result.eval()
+
+        set_rng(rng)  # reset to original RNG for other tests
+        assert numpy.allclose(result_eval1, result_eval2)
+
+
+class TestGaussianNoiseLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.noise import GaussianNoiseLayer
+        return GaussianNoiseLayer(Mock(output_shape=(None,)))
+
+    @pytest.fixture(params=[(100, 100), (None, 100)])
+    def input_layer(self, request):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(request.param)
+
+    def test_get_output_for_non_deterministic(self, layer):
+        input = theano.shared(numpy.ones((100, 100)))
+        result = layer.get_output_for(input, deterministic=False)
+        result_eval = result.eval()
+        assert (result_eval != input.eval()).all()
+        assert result_eval.mean() != 1.0
+        assert numpy.round(result_eval.mean()) == 1.0
+
+    def test_get_output_for_deterministic(self, layer):
+        input = theano.shared(numpy.ones((3, 3)))
+        result = layer.get_output_for(input, deterministic=True)
+        result_eval = result.eval()
+        assert (result_eval == input.eval()).all()
+
+    def test_specified_rng(self, input_layer):
+        from lasagne.layers.noise import GaussianNoiseLayer
+        input = theano.shared(numpy.ones((100, 100)))
+        seed = 123456789
+        rng = get_rng()
+
+        set_rng(RandomState(seed))
+        result = GaussianNoiseLayer(input_layer).get_output_for(input)
+        result_eval1 = result.eval()
+
+        set_rng(RandomState(seed))
+        result = GaussianNoiseLayer(input_layer).get_output_for(input)
+        result_eval2 = result.eval()
+
+        set_rng(rng)  # reset to original RNG for other tests
+        assert numpy.allclose(result_eval1, result_eval2)
diff --git a/lasagne/tests/layers/test_normalization.py b/lasagne/tests/layers/test_normalization.py
new file mode 100644
index 0000000..5f6bc4f
--- /dev/null
+++ b/lasagne/tests/layers/test_normalization.py
@@ -0,0 +1,327 @@
+# -*- coding: utf-8 -*-
+
+"""
+
+The :func:`ground_truth_normalizer`, :func:`ground_truth_normalize_row` and
+:class:`TestLocalResponseNormalization2DLayer` implementations contain code
+from `pylearn2 <http://github.com/lisa-lab/pylearn2>`_, which is covered
+by the following license:
+
+
+Copyright (c) 2011--2014, Université de Montréal
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+
+from mock import Mock
+import numpy as np
+import pytest
+import theano
+
+
+def ground_truth_normalizer(c01b, k, n, alpha, beta):
+    out = np.zeros(c01b.shape)
+
+    for r in range(out.shape[1]):
+        for c in range(out.shape[2]):
+            for x in range(out.shape[3]):
+                out[:, r, c, x] = ground_truth_normalize_row(
+                        row=c01b[:, r, c, x],
+                        k=k, n=n, alpha=alpha, beta=beta)
+    return out
+
+
+def ground_truth_normalize_row(row, k, n, alpha, beta):
+    assert row.ndim == 1
+    out = np.zeros(row.shape)
+    for i in range(row.shape[0]):
+        s = k
+        tot = 0
+        for j in range(max(0, i-n//2), min(row.shape[0], i+n//2+1)):
+            tot += 1
+            sq = row[j] ** 2.
+            assert sq > 0.
+            assert s >= k
+            assert alpha > 0.
+            s += alpha * sq
+            assert s >= k
+        assert tot <= n
+        assert s >= k
+        s = s ** beta
+        out[i] = row[i] / s
+    return out
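+
+# For reference, the row normalizer above computes, for each position i,
+#     out[i] = row[i] / (k + alpha * sum_j row[j] ** 2) ** beta
+# with j running over the window of n neighbouring positions centred on i
+# (clipped at the borders), i.e. the standard local response normalization
+# expression.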
+
+
+class TestLocalResponseNormalization2DLayer:
+
+    @pytest.fixture
+    def rng(self):
+        return np.random.RandomState([2013, 2])
+
+    @pytest.fixture
+    def input_data(self, rng):
+        channels = 15
+        rows = 3
+        cols = 4
+        batch_size = 2
+        shape = (batch_size, channels, rows, cols)
+        return rng.randn(*shape).astype(theano.config.floatX)
+
+    @pytest.fixture
+    def input_layer(self, input_data):
+        from lasagne.layers.input import InputLayer
+        shape = list(input_data.shape)
+        shape[0] = None
+        return InputLayer(shape)
+
+    @pytest.fixture
+    def layer(self, input_layer):
+
+        from lasagne.layers.normalization import\
+                LocalResponseNormalization2DLayer
+
+        layer = LocalResponseNormalization2DLayer(input_layer,
+                                                  alpha=1.5,
+                                                  k=2,
+                                                  beta=0.75,
+                                                  n=5)
+        return layer
+
+    def test_get_params(self, layer):
+        assert layer.get_params() == []
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for((1, 2, 3, 4)) == (1, 2, 3, 4)
+
+    def test_even_n_fails(self, input_layer):
+        from lasagne.layers.normalization import\
+                LocalResponseNormalization2DLayer
+
+        with pytest.raises(NotImplementedError):
+            LocalResponseNormalization2DLayer(input_layer, n=4)
+
+    def test_normalization(self, input_data, input_layer, layer):
+        from lasagne.layers import get_output
+        X = input_layer.input_var
+        lrn = theano.function([X], get_output(layer, X))
+        out = lrn(input_data)
+
+        # ground_truth_normalizer assumes c01b
+        input_data_c01b = input_data.transpose([1, 2, 3, 0])
+        ground_out = ground_truth_normalizer(input_data_c01b,
+                                             n=layer.n, k=layer.k,
+                                             alpha=layer.alpha,
+                                             beta=layer.beta)
+        ground_out = np.transpose(ground_out, [3, 0, 1, 2])
+
+        assert out.shape == ground_out.shape
+
+        assert np.allclose(out, ground_out)
+
+
+class TestBatchNormLayer:
+    @pytest.fixture
+    def BatchNormLayer(self):
+        from lasagne.layers.normalization import BatchNormLayer
+        return BatchNormLayer
+
+    @pytest.fixture
+    def init_unique(self):
+        # initializer for a tensor of unique values
+        return lambda shape: np.arange(np.prod(shape)).reshape(shape)
+
+    def test_init(self, BatchNormLayer, init_unique):
+        input_shape = (2, 3, 4)
+        # default: normalize over all but second axis
+        beta = BatchNormLayer(input_shape, beta=init_unique).beta
+        assert np.allclose(beta.get_value(), init_unique((3,)))
+        # normalize over first axis only
+        beta = BatchNormLayer(input_shape, beta=init_unique, axes=0).beta
+        assert np.allclose(beta.get_value(), init_unique((3, 4)))
+        # normalize over second and third axis
+        beta = BatchNormLayer(input_shape, beta=init_unique, axes=(1, 2)).beta
+        assert np.allclose(beta.get_value(), init_unique((2,)))
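+        # In each case beta gets one entry per position of the axes that are
+        # *not* normalized over: the default axes (0, 2) leave axis 1 -> (3,),
+        # axes=0 leaves axes 1 and 2 -> (3, 4), and axes=(1, 2) leaves
+        # axis 0 -> (2,).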
+
+    @pytest.mark.parametrize('update_averages', [None, True, False])
+    @pytest.mark.parametrize('use_averages', [None, True, False])
+    @pytest.mark.parametrize('deterministic', [True, False])
+    def test_get_output_for(self, BatchNormLayer, deterministic, use_averages,
+                            update_averages):
+        input_shape = (20, 30, 40)
+
+        # random input tensor, beta, gamma, mean, inv_std and alpha
+        input = (np.random.randn(*input_shape).astype(theano.config.floatX) +
+                 np.random.randn(1, 30, 1).astype(theano.config.floatX))
+        beta = np.random.randn(30).astype(theano.config.floatX)
+        gamma = np.random.randn(30).astype(theano.config.floatX)
+        mean = np.random.randn(30).astype(theano.config.floatX)
+        inv_std = np.random.rand(30).astype(theano.config.floatX)
+        alpha = np.random.rand()
+
+        # create layer (with default axes: normalize over all but second axis)
+        layer = BatchNormLayer(input_shape, beta=beta, gamma=gamma, mean=mean,
+                               inv_std=inv_std, alpha=alpha)
+
+        # call get_output_for()
+        kwargs = {'deterministic': deterministic}
+        if use_averages is not None:
+            kwargs['batch_norm_use_averages'] = use_averages
+        else:
+            use_averages = deterministic
+        if update_averages is not None:
+            kwargs['batch_norm_update_averages'] = update_averages
+        else:
+            update_averages = not deterministic
+        result = layer.get_output_for(theano.tensor.constant(input),
+                                      **kwargs).eval()
+
+        # compute expected results and expected updated parameters
+        input_mean = input.mean(axis=(0, 2))
+        input_inv_std = 1 / np.sqrt(input.var(axis=(0, 2)) + layer.epsilon)
+        if use_averages:
+            use_mean, use_inv_std = mean, inv_std
+        else:
+            use_mean, use_inv_std = input_mean, input_inv_std
+        bcast = (np.newaxis, slice(None), np.newaxis)
+        exp_result = (input - use_mean[bcast]) * use_inv_std[bcast]
+        exp_result = exp_result * gamma[bcast] + beta[bcast]
+        if update_averages:
+            new_mean = (1 - alpha) * mean + alpha * input_mean
+            new_inv_std = (1 - alpha) * inv_std + alpha * input_inv_std
+        else:
+            new_mean, new_inv_std = mean, inv_std
+
+        # compare expected results to actual results
+        tol = {'atol': 1e-5, 'rtol': 1e-6}
+        assert np.allclose(layer.mean.get_value(), new_mean, **tol)
+        assert np.allclose(layer.inv_std.get_value(), new_inv_std, **tol)
+        assert np.allclose(result, exp_result, **tol)
+
+    def test_undefined_shape(self, BatchNormLayer):
+        # should work:
+        BatchNormLayer((64, None, 3), axes=(1, 2))
+        # should not work:
+        with pytest.raises(ValueError) as exc:
+            BatchNormLayer((64, None, 3), axes=(0, 2))
+        assert 'needs specified input sizes' in exc.value.args[0]
+
+    def test_skip_linear_transform(self, BatchNormLayer):
+        input_shape = (20, 30, 40)
+
+        # random input tensor, beta, gamma
+        input = (np.random.randn(*input_shape).astype(theano.config.floatX) +
+                 np.random.randn(1, 30, 1).astype(theano.config.floatX))
+        beta = np.random.randn(30).astype(theano.config.floatX)
+        gamma = np.random.randn(30).astype(theano.config.floatX)
+
+        # create layers without beta or gamma
+        layer1 = BatchNormLayer(input_shape, beta=None, gamma=gamma)
+        layer2 = BatchNormLayer(input_shape, beta=beta, gamma=None)
+
+        # check that one parameter is missing
+        assert len(layer1.get_params()) == 3
+        assert len(layer2.get_params()) == 3
+
+        # call get_output_for()
+        result1 = layer1.get_output_for(theano.tensor.constant(input),
+                                        deterministic=False).eval()
+        result2 = layer2.get_output_for(theano.tensor.constant(input),
+                                        deterministic=False).eval()
+
+        # compute expected results and expected updated parameters
+        mean = input.mean(axis=(0, 2))
+        std = np.sqrt(input.var(axis=(0, 2)) + layer1.epsilon)
+        exp_result = (input - mean[None, :, None]) / std[None, :, None]
+        exp_result1 = exp_result * gamma[None, :, None]  # no beta
+        exp_result2 = exp_result + beta[None, :, None]  # no gamma
+
+        # compare expected results to actual results
+        tol = {'atol': 1e-5, 'rtol': 1e-6}
+        assert np.allclose(result1, exp_result1, **tol)
+        assert np.allclose(result2, exp_result2, **tol)
+
+
+def test_batch_norm_macro():
+    from lasagne.layers import (Layer, BatchNormLayer, batch_norm,
+                                NonlinearityLayer)
+    from lasagne.nonlinearities import identity
+    input_shape = (2, 3)
+    obj = object()
+
+    # check if it steals the nonlinearity
+    layer = Mock(Layer, output_shape=input_shape, nonlinearity=obj)
+    bnstack = batch_norm(layer)
+    assert isinstance(bnstack, NonlinearityLayer)
+    assert isinstance(bnstack.input_layer, BatchNormLayer)
+    assert layer.nonlinearity is identity
+    assert bnstack.nonlinearity is obj
+
+    # check if it removes the bias
+    layer = Mock(Layer, output_shape=input_shape, b=obj, params={obj: set()})
+    bnstack = batch_norm(layer)
+    assert isinstance(bnstack, BatchNormLayer)
+    assert layer.b is None
+    assert obj not in layer.params
+
+    # check if it can handle an unset bias
+    layer = Mock(Layer, output_shape=input_shape, b=None, params={obj: set()})
+    bnstack = batch_norm(layer)
+    assert isinstance(bnstack, BatchNormLayer)
+    assert layer.b is None
+
+    # check if it passes on kwargs
+    layer = Mock(Layer, output_shape=input_shape)
+    bnstack = batch_norm(layer, name='foo')
+    assert isinstance(bnstack, BatchNormLayer)
+    assert bnstack.name == 'foo'
+
+    # check if created layers are named with kwargs name
+    layer = Mock(Layer, output_shape=input_shape, nonlinearity=obj)
+    layer.name = 'foo'
+    bnstack = batch_norm(layer, name='foo_bnorm')
+    assert isinstance(bnstack, NonlinearityLayer)
+    assert isinstance(bnstack.input_layer, BatchNormLayer)
+    assert bnstack.name == 'foo_bnorm_nonlin'
+    assert bnstack.input_layer.name == 'foo_bnorm'
+
+    # check if created layers are named with wrapped layer name
+    layer = Mock(Layer, output_shape=input_shape, nonlinearity=obj)
+    layer.name = 'foo'
+    bnstack = batch_norm(layer)
+    assert isinstance(bnstack, NonlinearityLayer)
+    assert isinstance(bnstack.input_layer, BatchNormLayer)
+    assert bnstack.name == 'foo_bn_nonlin'
+    assert bnstack.input_layer.name == 'foo_bn'
+
+    # check if created layers remain unnamed if no names are given
+    layer = Mock(Layer, output_shape=input_shape, nonlinearity=obj)
+    bnstack = batch_norm(layer)
+    assert isinstance(bnstack, NonlinearityLayer)
+    assert isinstance(bnstack.input_layer, BatchNormLayer)
+    assert bnstack.name is None
+    assert bnstack.input_layer.name is None
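+
+
+# A sketch of how the macro exercised above is typically used (assuming the
+# standard lasagne layer API; the function name is illustrative only):
+# wrapping a layer applies batch normalisation between the layer and its
+# nonlinearity and removes the layer's bias, as the assertions check.
+def _batch_norm_usage_sketch():
+    from lasagne.layers import InputLayer, DenseLayer, batch_norm
+    l_in = InputLayer((None, 100))
+    l_hid = batch_norm(DenseLayer(l_in, num_units=50))
+    return l_hid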
diff --git a/lasagne/tests/layers/test_pool.py b/lasagne/tests/layers/test_pool.py
new file mode 100644
index 0000000..bce9a4e
--- /dev/null
+++ b/lasagne/tests/layers/test_pool.py
@@ -0,0 +1,905 @@
+from mock import Mock
+import numpy as np
+import pytest
+import theano
+
+from lasagne.utils import floatX
+
+
+def max_pool_1d(data, pool_size, stride=None):
+    stride = pool_size if stride is None else stride
+
+    idx = range(data.shape[-1])
+    used_idx = set([])
+    idx_sets = []
+
+    i = 0
+    while i < data.shape[-1]:
+        idx_set = set(range(i, i + pool_size))
+        idx_set = idx_set.intersection(idx)
+        if not idx_set.issubset(used_idx):
+            idx_sets.append(list(idx_set))
+            used_idx = used_idx.union(idx_set)
+        i += stride
+
+    data_pooled = np.array(
+        [data[..., idx_set].max(axis=-1) for idx_set in idx_sets])
+    data_pooled = np.rollaxis(data_pooled, 0, len(data_pooled.shape))
+
+    return data_pooled
+
+
+def max_pool_1d_ignoreborder(data, pool_size, stride=None, pad=0):
+    stride = pool_size if stride is None else stride
+
+    pads = [(0, 0), ] * len(data.shape)
+    pads[-1] = (pad, pad)
+    data = np.pad(data, pads, mode='constant', constant_values=(-np.inf,))
+
+    data_shifted = np.zeros((pool_size,) + data.shape)
+    data_shifted = data_shifted[..., :data.shape[-1] - pool_size + 1]
+    for i in range(pool_size):
+        data_shifted[i] = data[..., i:i + data.shape[-1] - pool_size + 1]
+    data_pooled = data_shifted.max(axis=0)
+
+    if stride:
+        data_pooled = data_pooled[..., ::stride]
+
+    return data_pooled
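+
+# For reference, the pooled length this helper produces along the last axis is
+# floor((length + 2 * pad - pool_size) / stride) + 1, which is what the shape
+# tests below expect (e.g. 128 -> 64 for pool_size=2, stride=2, pad=0).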
+
+
+def upscale_1d_shape(shape, scale_factor):
+    return (shape[0], shape[1],
+            shape[2] * scale_factor[0])
+
+
+def upscale_1d(data, scale_factor):
+    upscaled = np.zeros(upscale_1d_shape(data.shape, scale_factor))
+    for i in range(scale_factor[0]):
+        upscaled[:, :, i::scale_factor[0]] = data
+    return upscaled
+
+
+def max_pool_2d(data, pool_size, stride):
+    data_pooled = max_pool_1d(data, pool_size[1], stride[1])
+
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+    data_pooled = max_pool_1d(data_pooled, pool_size[0], stride[0])
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+
+    return data_pooled
+
+
+def max_pool_2d_ignoreborder(data, pool_size, stride, pad):
+    data_pooled = max_pool_1d_ignoreborder(
+        data, pool_size[1], stride[1], pad[1])
+
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+    data_pooled = max_pool_1d_ignoreborder(
+        data_pooled, pool_size[0], stride[0], pad[0])
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+
+    return data_pooled
+
+
+def max_pool_3d_ignoreborder(data, pool_size, stride, pad):
+    # Pool the last dimension
+    data_pooled = max_pool_1d_ignoreborder(
+        data, pool_size[2], stride[2], pad[2])
+    # Swap the second-to-last axis to the back and pool it
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+    data_pooled = max_pool_1d_ignoreborder(
+        data_pooled, pool_size[1], stride[1], pad[1])
+
+    # Swap the third-to-last axis to the back and pool it
+    data_pooled = np.swapaxes(data_pooled, -1, -3)
+    data_pooled = max_pool_1d_ignoreborder(
+        data_pooled, pool_size[0], stride[0], pad[0])
+
+    # Restore the original axis order
+    data_pooled = np.swapaxes(data_pooled, -1, -2)
+    data_pooled = np.swapaxes(data_pooled, -2, -3)
+
+    return data_pooled
+
+
+def upscale_2d_shape(shape, scale_factor):
+    return (shape[0], shape[1],
+            shape[2] * scale_factor[0], shape[3] * scale_factor[1])
+
+
+def upscale_2d(data, scale_factor):
+    upscaled = np.zeros(upscale_2d_shape(data.shape, scale_factor))
+    for j in range(scale_factor[0]):
+        for i in range(scale_factor[1]):
+            upscaled[:, :, j::scale_factor[0], i::scale_factor[1]] = data
+    return upscaled
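+
+# The nested loops above do nearest-neighbour upscaling; an equivalent
+# formulation repeats each element along both spatial axes:
+#     np.repeat(np.repeat(data, scale_factor[0], axis=2),
+#               scale_factor[1], axis=3)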
+
+
+def spatial_pool(data, pool_dims):
+
+    def ceildiv(a, b):
+        return (a + b - 1) // b
+
+    def floordiv(a, b):
+        return a // b
+
+    input_size = data.shape[2:]
+    pooled_data_list = []
+    for pool_dim in pool_dims:
+        pool_size = tuple(ceildiv(i, pool_dim) for i in input_size)
+        stride_size = tuple(floordiv(i, pool_dim) for i in input_size)
+
+        pooled_part = max_pool_2d_ignoreborder(
+                data, pool_size, stride_size, (0, 0))
+        pooled_part = pooled_part.reshape(
+                data.shape[0], data.shape[1], pool_dim ** 2)
+        pooled_data_list.append(pooled_part)
+
+    return np.concatenate(pooled_data_list, axis=2)
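+
+# Each pyramid level contributes pool_dim ** 2 pooled values per channel, so
+# the reference output has sum(d ** 2 for d in pool_dims) features on the last
+# axis; for pool_dims=[1, 2, 4] that is 1 + 4 + 16 == 21, matching the shape
+# tests below.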
+
+
+class TestFeaturePoolLayer:
+    def pool_test_sets():
+        for pool_size in [2, 3]:
+            for axis in [1, 2]:
+                yield (pool_size, axis)
+
+    def input_layer(self, output_shape):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(output_shape)
+
+    def layer(self, input_layer, pool_size, axis):
+        from lasagne.layers.pool import FeaturePoolLayer
+        return FeaturePoolLayer(
+            input_layer,
+            pool_size=pool_size,
+            axis=axis,
+        )
+
+    def test_init_raises(self):
+        input_layer = self.input_layer((2, 3, 4))
+
+        with pytest.raises(ValueError):
+            self.layer(input_layer, pool_size=2, axis=1)
+
+    @pytest.mark.parametrize(
+        "pool_size, axis", list(pool_test_sets()))
+    def test_layer(self, pool_size, axis):
+        input = floatX(np.random.randn(3, 6, 12, 23))
+        input_layer = self.input_layer(input.shape)
+        input_theano = theano.shared(input)
+
+        layer = self.layer(input_layer, pool_size, axis)
+        layer_result = layer.get_output_for(input_theano).eval()
+
+        numpy_result = np.swapaxes(input, axis, -1)
+        numpy_result = max_pool_1d(numpy_result, pool_size)
+        numpy_result = np.swapaxes(numpy_result, -1, axis)
+
+        assert np.all(numpy_result.shape == layer.output_shape)
+        assert np.all(numpy_result.shape == layer_result.shape)
+        assert np.allclose(numpy_result, layer_result)
+
+
+class TestMaxPool1DLayer:
+    def pool_test_sets():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                yield (pool_size, stride)
+
+    def pool_test_sets_ignoreborder():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                for pad in range(pool_size):
+                    yield (pool_size, stride, pad)
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_size, stride=None, pad=0):
+        from lasagne.layers.pool import MaxPool1DLayer
+        return MaxPool1DLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+            ignore_border=False,
+        )
+
+    def layer_ignoreborder(self, input_layer, pool_size, stride=None, pad=0):
+        from lasagne.layers.pool import MaxPool1DLayer
+        return MaxPool1DLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+            pad=pad,
+            ignore_border=True,
+        )
+
+    @pytest.mark.parametrize(
+        "pool_size, stride", list(pool_test_sets()))
+    def test_get_output_and_shape_for(self, pool_size, stride):
+        input = floatX(np.random.randn(8, 16, 23))
+        input_layer = self.input_layer(input.shape)
+        input_theano = theano.shared(input)
+
+        layer = self.layer(input_layer, pool_size, stride)
+        layer_output_shape = layer.get_output_shape_for(input.shape)
+        layer_output = layer.get_output_for(input_theano)
+        layer_result = layer_output.eval()
+
+        numpy_result = max_pool_1d(input, pool_size, stride)
+
+        assert numpy_result.shape == layer_output_shape
+        assert np.allclose(numpy_result, layer_result)
+
+    @pytest.mark.parametrize(
+        "pool_size, stride, pad", list(pool_test_sets_ignoreborder()))
+    def test_get_output_for_ignoreborder(self, pool_size, stride, pad):
+        input = floatX(np.random.randn(8, 16, 23))
+        input_layer = self.input_layer(input.shape)
+        input_theano = theano.shared(input)
+        layer_output = self.layer_ignoreborder(
+            input_layer, pool_size, stride, pad).get_output_for(input_theano)
+
+        layer_result = layer_output.eval()
+        numpy_result = max_pool_1d_ignoreborder(input, pool_size, stride, pad)
+
+        assert np.all(numpy_result.shape == layer_result.shape)
+        assert np.allclose(numpy_result, layer_result)
+
+    @pytest.mark.parametrize(
+        "input_shape", [(32, 64, 128), (None, 64, 128), (32, None, 128),
+                        (32, 64, None)])
+    def test_get_output_shape_for(self, input_shape):
+        input_layer = self.input_layer(input_shape)
+        layer = self.layer_ignoreborder(input_layer, pool_size=2)
+        assert layer.get_output_shape_for((None, 64, 128)) == (None, 64, 64)
+        assert layer.get_output_shape_for((32, 64, None)) == (32, 64, None)
+        assert layer.get_output_shape_for((32, 64, 128)) == (32, 64, 64)
+
+    def test_fail_on_mismatching_dimensionality(self):
+        from lasagne.layers.pool import MaxPool1DLayer
+        with pytest.raises(ValueError) as exc:
+            MaxPool1DLayer((10, 20), 3, 2)
+        assert "Expected 3 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            MaxPool1DLayer((10, 20, 30, 40), 3, 2)
+        assert "Expected 3 input dimensions" in exc.value.args[0]
+
+
+class TestMaxPool2DLayer:
+    def pool_test_sets():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                yield (pool_size, stride)
+
+    def pool_test_sets_ignoreborder():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                for pad in range(pool_size):
+                    yield (pool_size, stride, pad)
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_size, stride=None,
+              pad=(0, 0), ignore_border=False):
+        from lasagne.layers.pool import MaxPool2DLayer
+        return MaxPool2DLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+            pad=pad,
+            ignore_border=ignore_border,
+        )
+
+    @pytest.mark.parametrize(
+        "pool_size, stride", list(pool_test_sets()))
+    def test_get_output_for(self, pool_size, stride):
+        try:
+            input = floatX(np.random.randn(8, 16, 17, 13))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+            result = self.layer(
+                input_layer,
+                (pool_size, pool_size),
+                (stride, stride),
+                ignore_border=False,
+            ).get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = max_pool_2d(
+                input, (pool_size, pool_size), (stride, stride))
+
+            assert np.all(numpy_result.shape == result_eval.shape)
+            assert np.allclose(result_eval, numpy_result)
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "pool_size, stride, pad", list(pool_test_sets_ignoreborder()))
+    def test_get_output_for_ignoreborder(self, pool_size,
+                                         stride, pad):
+        try:
+            input = floatX(np.random.randn(8, 16, 17, 13))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+
+            result = self.layer(
+                input_layer,
+                pool_size,
+                stride,
+                pad,
+                ignore_border=True,
+            ).get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = max_pool_2d_ignoreborder(
+                input, (pool_size, pool_size), (stride, stride), (pad, pad))
+
+            assert np.all(numpy_result.shape == result_eval.shape)
+            assert np.allclose(result_eval, numpy_result)
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24, 24), (32, 64, 12, 12)),
+         ((None, 64, 24, 24), (None, 64, 12, 12)),
+         ((32, None, 24, 24), (32, None, 12, 12)),
+         ((32, 64, None, 24), (32, 64, None, 12)),
+         ((32, 64, 24, None), (32, 64, 12, None)),
+         ((32, 64, None, None), (32, 64, None, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        try:
+            input_layer = self.input_layer(input_shape)
+            layer = self.layer(input_layer,
+                               pool_size=(2, 2), stride=None)
+            assert layer.get_output_shape_for(
+                input_shape) == output_shape
+        except NotImplementedError:
+            pytest.skip()
+
+    def test_fail_on_mismatching_dimensionality(self):
+        from lasagne.layers.pool import MaxPool2DLayer
+        with pytest.raises(ValueError) as exc:
+            MaxPool2DLayer((10, 20, 30), 3, 2)
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            MaxPool2DLayer((10, 20, 30, 40, 50), 3, 2)
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+
+
+class TestMaxPool2DCCLayer:
+    def pool_test_sets():
+        for pool_size in [2, 3]:
+            for stride in range(1, pool_size+1):
+                yield (pool_size, stride)
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_size, stride):
+        try:
+            from lasagne.layers.cuda_convnet import MaxPool2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+        return MaxPool2DCCLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+        )
+
+    @pytest.mark.parametrize(
+        "pool_size, stride", list(pool_test_sets()))
+    def test_get_output_for(self, pool_size, stride):
+        try:
+            input = floatX(np.random.randn(8, 16, 16, 16))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+            result = self.layer(
+                input_layer,
+                (pool_size, pool_size),
+                (stride, stride),
+            ).get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = max_pool_2d(
+                input, (pool_size, pool_size), (stride, stride))
+
+            assert np.all(numpy_result.shape == result_eval.shape)
+            assert np.allclose(result_eval, numpy_result)
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24, 24), (32, 64, 12, 12)),
+         ((None, 64, 24, 24), (None, 64, 12, 12)),
+         ((32, None, 24, 24), (32, None, 12, 12)),
+         ((32, 64, None, 24), (32, 64, None, 12)),
+         ((32, 64, 24, None), (32, 64, 12, None)),
+         ((32, 64, None, None), (32, 64, None, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        try:
+            input_layer = self.input_layer(input_shape)
+            layer = self.layer(input_layer,
+                               pool_size=(2, 2), stride=None)
+            assert layer.get_output_shape_for(
+                input_shape) == output_shape
+        except NotImplementedError:
+            pytest.skip()
+
+    def test_not_implemented(self):
+        try:
+            from lasagne.layers.cuda_convnet import MaxPool2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+
+        input_layer = self.input_layer((128, 4, 12, 12))
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DCCLayer(input_layer, pool_size=2, pad=2)
+        assert "MaxPool2DCCLayer does not support padding" in exc.value.args[0]
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DCCLayer(input_layer, pool_size=(2, 3))
+        assert ("MaxPool2DCCLayer only supports square pooling regions" in
+                exc.value.args[0])
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DCCLayer(input_layer, pool_size=2, stride=(1, 2))
+        assert (("MaxPool2DCCLayer only supports using the same stride in "
+                 "both directions") in exc.value.args[0])
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DCCLayer(input_layer, pool_size=2, stride=3)
+        assert ("MaxPool2DCCLayer only supports stride <= pool_size" in
+                exc.value.args[0])
+
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DCCLayer(input_layer, pool_size=2,
+                                     ignore_border=True)
+        assert ("MaxPool2DCCLayer does not support ignore_border=True" in
+                exc.value.args[0])
+
+    def test_dimshuffle_false(self):
+        try:
+            from lasagne.layers.cuda_convnet import MaxPool2DCCLayer
+        except ImportError:
+            pytest.skip("cuda_convnet not available")
+        from lasagne.layers.input import InputLayer
+
+        input_layer = InputLayer((4, 12, 12, 16))  # c01b order
+        layer = MaxPool2DCCLayer(input_layer, pool_size=2, dimshuffle=False)
+        assert layer.output_shape == (4, 6, 6, 16)
+
+        input = floatX(np.random.randn(4, 12, 12, 16))
+        output = max_pool_2d(input.transpose(3, 0, 1, 2), (2, 2), (2, 2))
+        output = output.transpose(1, 2, 3, 0)
+        actual = layer.get_output_for(input).eval()
+        assert np.allclose(output, actual)
+
+
+class TestMaxPool2DNNLayer:
+    def pool_test_sets_ignoreborder():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                for pad in range(pool_size):
+                    yield (pool_size, stride, pad)
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_size, stride, pad):
+        try:
+            from lasagne.layers.dnn import MaxPool2DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+
+        return MaxPool2DDNNLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+            pad=pad,
+        )
+
+    @pytest.mark.parametrize(
+        "pool_size, stride, pad", list(pool_test_sets_ignoreborder()))
+    def test_get_output_for_ignoreborder(self, pool_size,
+                                         stride, pad):
+        try:
+            input = floatX(np.random.randn(8, 16, 17, 13))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+
+            result = self.layer(
+                input_layer,
+                pool_size,
+                stride,
+                pad,
+            ).get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = max_pool_2d_ignoreborder(
+                input, (pool_size, pool_size), (stride, stride), (pad, pad))
+
+            assert np.all(numpy_result.shape == result_eval.shape)
+            assert np.allclose(result_eval, numpy_result)
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24, 24), (32, 64, 12, 12)),
+         ((None, 64, 24, 24), (None, 64, 12, 12)),
+         ((32, None, 24, 24), (32, None, 12, 12)),
+         ((32, 64, None, 24), (32, 64, None, 12)),
+         ((32, 64, 24, None), (32, 64, 12, None)),
+         ((32, 64, None, None), (32, 64, None, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        try:
+            input_layer = self.input_layer(input_shape)
+            layer = self.layer(input_layer,
+                               pool_size=(2, 2), stride=None, pad=(0, 0))
+            assert layer.get_output_shape_for(
+                input_shape) == output_shape
+        except NotImplementedError:
+            raise
+
+    def test_not_implemented(self):
+        try:
+            from lasagne.layers.dnn import MaxPool2DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool2DDNNLayer((1, 2, 3, 4), pool_size=2,
+                                      ignore_border=False)
+        assert ("Pool2DDNNLayer does not support ignore_border=False" in
+                exc.value.args[0])
+
+    def test_fail_on_mismatching_dimensionality(self):
+        try:
+            from lasagne.layers.dnn import MaxPool2DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+        with pytest.raises(ValueError) as exc:
+            MaxPool2DDNNLayer((10, 20, 30), 3, 2)
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            MaxPool2DDNNLayer((10, 20, 30, 40, 50), 3, 2)
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+
+
+class TestMaxPool3DNNLayer:
+    def pool_test_sets_ignoreborder():
+        for pool_size in [2, 3]:
+            for stride in [1, 2, 3, 4]:
+                for pad in range(pool_size):
+                    yield (pool_size, stride, pad)
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_size, stride, pad):
+        try:
+            from lasagne.layers.dnn import MaxPool3DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+
+        return MaxPool3DDNNLayer(
+            input_layer,
+            pool_size=pool_size,
+            stride=stride,
+            pad=pad,
+        )
+
+    @pytest.mark.parametrize(
+        "pool_size, stride, pad", list(pool_test_sets_ignoreborder()))
+    def test_get_output_for_ignoreborder(self, pool_size,
+                                         stride, pad):
+        try:
+            input = floatX(np.random.randn(5, 8, 16, 17, 13))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+
+            result = self.layer(
+                input_layer,
+                pool_size,
+                stride,
+                pad,
+            ).get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = max_pool_3d_ignoreborder(
+                input, [pool_size]*3, [stride]*3, [pad]*3)
+
+            assert np.all(numpy_result.shape == result_eval.shape)
+            assert np.allclose(result_eval, numpy_result)
+        except NotImplementedError:
+            pytest.skip()
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 32, 64, 24, 24), (32, 32, 32, 12, 12)),
+         ((None, 32, 48, 24, 24), (None, 32, 24, 12, 12)),
+         ((32, None, 32, 24, 24), (32, None, 16, 12, 12)),
+         ((32, 64, None, 24, 24), (32, 64, None, 12, 12)),
+         ((32, 64, 32, None, 24), (32, 64, 16, None, 12)),
+         ((32, 64, 32, 24, None), (32, 64, 16, 12, None)),
+         ((32, 64, 12, None, None), (32, 64, 6, None, None)),
+         ((32, 64, None, None, None), (32, 64, None, None, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        try:
+            input_layer = self.input_layer(input_shape)
+            layer = self.layer(input_layer,
+                               pool_size=(2, 2, 2), stride=None, pad=(0, 0, 0))
+            assert layer.get_output_shape_for(
+                input_shape) == output_shape
+        except NotImplementedError:
+            raise
+
+    def test_not_implemented(self):
+        try:
+            from lasagne.layers.dnn import MaxPool3DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+        with pytest.raises(NotImplementedError) as exc:
+            layer = MaxPool3DDNNLayer((1, 2, 3, 4, 5), pool_size=2,
+                                      ignore_border=False)
+        assert ("Pool3DDNNLayer does not support ignore_border=False" in
+                exc.value.args[0])
+
+    def test_fail_on_mismatching_dimensionality(self):
+        try:
+            from lasagne.layers.dnn import MaxPool3DDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+        with pytest.raises(ValueError) as exc:
+            MaxPool3DDNNLayer((10, 20, 30, 40), 3, 2)
+        assert "Expected 5 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            MaxPool3DDNNLayer((10, 20, 30, 40, 50, 60), 3, 2)
+        assert "Expected 5 input dimensions" in exc.value.args[0]
+
+
+class TestUpscale1DLayer:
+    def scale_factor_test_sets():
+        for scale_factor in [2, 3]:
+            yield scale_factor
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, scale_factor):
+        from lasagne.layers.pool import Upscale1DLayer
+        return Upscale1DLayer(
+            input_layer,
+            scale_factor=scale_factor,
+        )
+
+    def test_invalid_scale_factor(self):
+        from lasagne.layers.pool import Upscale1DLayer
+        inlayer = self.input_layer((128, 3, 32))
+        with pytest.raises(ValueError):
+            Upscale1DLayer(inlayer, scale_factor=0)
+        with pytest.raises(ValueError):
+            Upscale1DLayer(inlayer, scale_factor=-1)
+        with pytest.raises(ValueError):
+            Upscale1DLayer(inlayer, scale_factor=(0))
+
+    @pytest.mark.parametrize(
+        "scale_factor", list(scale_factor_test_sets()))
+    def test_get_output_for(self, scale_factor):
+        input = floatX(np.random.randn(8, 16, 17))
+        input_layer = self.input_layer(input.shape)
+        input_theano = theano.shared(input)
+        result = self.layer(
+            input_layer,
+            (scale_factor),
+        ).get_output_for(input_theano)
+
+        result_eval = result.eval()
+        numpy_result = upscale_1d(input, (scale_factor, scale_factor))
+
+        assert np.all(numpy_result.shape == result_eval.shape)
+        assert np.allclose(result_eval, numpy_result)
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24), (32, 64, 48)),
+         ((None, 64, 24), (None, 64, 48)),
+         ((32, None, 24), (32, None, 48)),
+         ((32, 64, None), (32, 64, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        input_layer = self.input_layer(input_shape)
+        layer = self.layer(input_layer,
+                           scale_factor=(2))
+        assert layer.get_output_shape_for(
+            input_shape) == output_shape
+
+
+class TestUpscale2DLayer:
+    def scale_factor_test_sets():
+        for scale_factor in [2, 3]:
+            yield scale_factor
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, scale_factor):
+        from lasagne.layers.pool import Upscale2DLayer
+        return Upscale2DLayer(
+            input_layer,
+            scale_factor=scale_factor,
+        )
+
+    def test_invalid_scale_factor(self):
+        from lasagne.layers.pool import Upscale2DLayer
+        inlayer = self.input_layer((128, 3, 32, 32))
+        with pytest.raises(ValueError):
+            Upscale2DLayer(inlayer, scale_factor=0)
+        with pytest.raises(ValueError):
+            Upscale2DLayer(inlayer, scale_factor=-1)
+        with pytest.raises(ValueError):
+            Upscale2DLayer(inlayer, scale_factor=(0, 2))
+        with pytest.raises(ValueError):
+            Upscale2DLayer(inlayer, scale_factor=(2, 0))
+
+    @pytest.mark.parametrize(
+        "scale_factor", list(scale_factor_test_sets()))
+    def test_get_output_for(self, scale_factor):
+        input = floatX(np.random.randn(8, 16, 17, 13))
+        input_layer = self.input_layer(input.shape)
+        input_theano = theano.shared(input)
+        result = self.layer(
+            input_layer,
+            (scale_factor, scale_factor),
+        ).get_output_for(input_theano)
+
+        result_eval = result.eval()
+        numpy_result = upscale_2d(input, (scale_factor, scale_factor))
+
+        assert np.all(numpy_result.shape == result_eval.shape)
+        assert np.allclose(result_eval, numpy_result)
+
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24, 24), (32, 64, 48, 48)),
+         ((None, 64, 24, 24), (None, 64, 48, 48)),
+         ((32, None, 24, 24), (32, None, 48, 48)),
+         ((32, 64, None, 24), (32, 64, None, 48)),
+         ((32, 64, 24, None), (32, 64, 48, None)),
+         ((32, 64, None, None), (32, 64, None, None))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        input_layer = self.input_layer(input_shape)
+        layer = self.layer(input_layer,
+                           scale_factor=(2, 2))
+        assert layer.get_output_shape_for(
+            input_shape) == output_shape
+
+
+class TestFeatureWTALayer(object):
+    @pytest.fixture
+    def FeatureWTALayer(self):
+        from lasagne.layers.pool import FeatureWTALayer
+        return FeatureWTALayer
+
+    @pytest.fixture
+    def input_layer(self):
+        from lasagne.layers.input import InputLayer
+        return InputLayer((2, 4, 8))
+
+    @pytest.fixture
+    def layer(self, FeatureWTALayer, input_layer):
+        return FeatureWTALayer(input_layer, pool_size=2)
+
+    def test_init_raises(self, FeatureWTALayer, input_layer):
+        with pytest.raises(ValueError):
+            FeatureWTALayer(input_layer, pool_size=3)
+
+    def test_get_output_for(self, layer):
+        input = theano.shared(np.random.uniform(-1, 1, (2, 4, 8)))
+        result = layer.get_output_for(input).eval()
+
+        reshaped = input.get_value().reshape((2, 2, 2, 8))
+        np_result = reshaped * (reshaped == reshaped.max(2, keepdims=True))
+        np_result = np_result.reshape((2, 4, 8))
+
+        assert np.allclose(result, np_result)
+
+
+class TestGlobalPoolLayer(object):
+    @pytest.fixture
+    def GlobalPoolLayer(self):
+        from lasagne.layers.pool import GlobalPoolLayer
+        return GlobalPoolLayer
+
+    @pytest.fixture
+    def layer(self, GlobalPoolLayer):
+        return GlobalPoolLayer(Mock(output_shape=(None,)))
+
+    def test_get_output_shape_for(self, layer):
+        assert layer.get_output_shape_for((2, 3, 4, 5)) == (2, 3)
+
+    def test_get_output_for(self, layer):
+        input = theano.shared(np.random.uniform(-1, 1, (2, 3, 4, 5)))
+        result = layer.get_output_for(input).eval()
+
+        np_result = input.get_value().reshape((2, 3, -1)).mean(-1)
+
+        assert np.allclose(result, np_result)
+
+
+class TestSpatialPyramidPoolingDNNLayer:
+    def pool_dims_test_sets():
+        for pyramid_level in [2, 3, 4]:
+            pool_dims = list(range(1, pyramid_level))
+            yield pool_dims
+
+    def input_layer(self, output_shape):
+        return Mock(output_shape=output_shape)
+
+    def layer(self, input_layer, pool_dims):
+        try:
+            from lasagne.layers.dnn import SpatialPyramidPoolingDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+
+        return SpatialPyramidPoolingDNNLayer(input_layer, pool_dims=pool_dims)
+
+    @pytest.mark.parametrize(
+        "pool_dims", list(pool_dims_test_sets()))
+    def test_get_output_for(self, pool_dims):
+        try:
+            input = floatX(np.random.randn(8, 16, 17, 13))
+            input_layer = self.input_layer(input.shape)
+            input_theano = theano.shared(input)
+            layer = self.layer(input_layer, pool_dims)
+
+            result = layer.get_output_for(input_theano)
+
+            result_eval = result.eval()
+            numpy_result = spatial_pool(input, pool_dims)
+
+            assert result_eval.shape == numpy_result.shape
+            assert np.allclose(result_eval, numpy_result)
+            assert result_eval.shape == layer.output_shape
+        except NotImplementedError:
+            pytest.skip()
+
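+    # With pool_dims=[1, 2, 4], each channel is pooled on 1x1, 2x2 and 4x4
+    # grids, giving 1 + 4 + 16 = 21 pooled values per channel.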
+    @pytest.mark.parametrize(
+        "input_shape,output_shape",
+        [((32, 64, 24, 24), (32, 64, 21)),
+         ((None, 64, 23, 25), (None, 64, 21)),
+         ((32, None, 22, 26), (32, None, 21)),
+         ((None, None, None, None), (None, None, 21))],
+    )
+    def test_get_output_shape_for(self, input_shape, output_shape):
+        input_layer = self.input_layer(input_shape)
+        layer = self.layer(input_layer, pool_dims=[1, 2, 4])
+        assert layer.get_output_shape_for(input_shape) == output_shape
+
+    def test_fail_on_mismatching_dimensionality(self):
+        try:
+            from lasagne.layers.dnn import SpatialPyramidPoolingDNNLayer
+        except ImportError:
+            pytest.skip("cuDNN not available")
+        with pytest.raises(ValueError) as exc:
+            SpatialPyramidPoolingDNNLayer((10, 20, 30))
+        assert "Expected 4 input dimensions" in exc.value.args[0]
+        with pytest.raises(ValueError) as exc:
+            SpatialPyramidPoolingDNNLayer((10, 20, 30, 40, 50))
+        assert "Expected 4 input dimensions" in exc.value.args[0]
diff --git a/lasagne/tests/layers/test_recurrent.py b/lasagne/tests/layers/test_recurrent.py
new file mode 100644
index 0000000..9464c04
--- /dev/null
+++ b/lasagne/tests/layers/test_recurrent.py
@@ -0,0 +1,1101 @@
+import pytest
+
+from lasagne.layers import RecurrentLayer, LSTMLayer, CustomRecurrentLayer
+from lasagne.layers import InputLayer, DenseLayer, GRULayer, Gate, Layer
+from lasagne.layers import helper
+import theano
+import theano.tensor as T
+import numpy as np
+import lasagne
+from mock import Mock
+
+
+def test_recurrent_return_shape():
+    num_batch, seq_len, n_features1, n_features2 = 5, 3, 10, 11
+    num_units = 6
+    x = T.tensor4()
+    in_shp = (num_batch, seq_len, n_features1, n_features2)
+    l_inp = InputLayer(in_shp)
+    l_rec = RecurrentLayer(l_inp, num_units=num_units)
+
+    x_in = np.random.random(in_shp).astype('float32')
+    output = helper.get_output(l_rec, x)
+    output_val = output.eval({x: x_in})
+
+    assert helper.get_output_shape(l_rec, x_in.shape) == output_val.shape
+    assert output_val.shape == (num_batch, seq_len, num_units)
+
+
+def test_recurrent_grad():
+    num_batch, seq_len, n_features = 5, 3, 10
+    num_units = 6
+    l_inp = InputLayer((num_batch, seq_len, n_features))
+    l_rec = RecurrentLayer(l_inp,
+                           num_units=num_units)
+    output = helper.get_output(l_rec)
+    g = T.grad(T.mean(output), lasagne.layers.get_all_params(l_rec))
+    assert isinstance(g, (list, tuple))
+
+
+def test_recurrent_nparams():
+    l_inp = InputLayer((2, 2, 3))
+    l_rec = RecurrentLayer(l_inp, 5, learn_init=False, nonlinearity=None)
+
+    # b, W_hid_to_hid and W_in_to_hid
+    assert len(lasagne.layers.get_all_params(l_rec, trainable=True)) == 3
+
+    # b + hid_init
+    assert len(lasagne.layers.get_all_params(l_rec, regularizable=False)) == 2
+
+
+def test_recurrent_nparams_learn_init():
+    l_inp = InputLayer((2, 2, 3))
+    l_rec = RecurrentLayer(l_inp, 5, learn_init=True)
+
+    # b, W_hid_to_hid and W_in_to_hid + hid_init
+    assert len(lasagne.layers.get_all_params(l_rec, trainable=True)) == 4
+
+    # b + hid_init
+    assert len(lasagne.layers.get_all_params(l_rec, regularizable=False)) == 2
+
+
+def test_recurrent_hid_init_layer():
+    # test that you can set hid_init to be a layer
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_rec = RecurrentLayer(l_inp, 5, hid_init=l_inp_h)
+
+    x = T.tensor3()
+    h = T.matrix()
+
+    output = lasagne.layers.get_output(l_rec, {l_inp: x, l_inp_h: h})
+
+
+def test_recurrent_nparams_hid_init_layer():
+    # test that you can see layers through hid_init
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_h_de = DenseLayer(l_inp_h, 7)
+    l_rec = RecurrentLayer(l_inp, 7, hid_init=l_inp_h_de)
+
+    # directly check the layers can be seen through hid_init
+    assert lasagne.layers.get_all_layers(l_rec) == [l_inp, l_inp_h, l_inp_h_de,
+                                                    l_rec]
+
+    # b, W_hid_to_hid and W_in_to_hid + W + b
+    assert len(lasagne.layers.get_all_params(l_rec, trainable=True)) == 5
+
+    # b (recurrent) + b (dense)
+    assert len(lasagne.layers.get_all_params(l_rec, regularizable=False)) == 2
+
+
+def test_recurrent_hid_init_mask():
+    # test that you can set hid_init to be a layer when a mask is provided
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_msk = InputLayer((2, 2))
+    l_rec = RecurrentLayer(l_inp, 5, hid_init=l_inp_h, mask_input=l_inp_msk)
+
+    x = T.tensor3()
+    h = T.matrix()
+    msk = T.matrix()
+
+    inputs = {l_inp: x, l_inp_h: h, l_inp_msk: msk}
+    output = lasagne.layers.get_output(l_rec, inputs)
+
+
+def test_recurrent_hid_init_layer_eval():
+    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
+    # a network with a `Layer` as input to `hid_init` to a network with a
+    # `np.array` as input to `hid_init`
+    n_units = 7
+    n_test_cases = 2
+    in_shp = (n_test_cases, 2, 3)
+    in_h_shp = (1, n_units)
+
+    # dummy inputs
+    X_test = np.ones(in_shp, dtype=theano.config.floatX)
+    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
+    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
+
+    # network with `Layer` initializer for hid_init
+    l_inp = InputLayer(in_shp)
+    l_inp_h = InputLayer(in_h_shp)
+    l_rec_inp_layer = RecurrentLayer(l_inp, n_units, hid_init=l_inp_h,
+                                     nonlinearity=None)
+
+    # network with `np.array` initializer for hid_init
+    l_rec_nparray = RecurrentLayer(l_inp, n_units, hid_init=Xh_test,
+                                   nonlinearity=None)
+
+    # copy network parameters from l_rec_inp_layer to l_rec_nparray
+    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
+    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
+    for k, v in l_rn_param.items():
+        if k in l_il_param:
+            v.set_value(l_il_param[k].get_value())
+
+    # build the theano functions
+    X = T.tensor3()
+    Xh = T.matrix()
+    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
+                                                 {l_inp: X, l_inp_h: Xh})
+    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})
+
+    # test both nets with dummy input
+    output_val_inp_layer = output_inp_layer.eval({X: X_test,
+                                                  Xh: Xh_test_batch})
+    output_val_nparray = output_nparray.eval({X: X_test})
+
+    # check output given `Layer` is the same as with `np.array`
+    assert np.allclose(output_val_inp_layer, output_val_nparray)
+
+
+def test_recurrent_incoming_tuple():
+    input_shape = (2, 3, 4)
+    l_rec = lasagne.layers.RecurrentLayer(input_shape, 5)
+    assert l_rec.input_shapes[0] == input_shape
+
+
+def test_recurrent_name():
+    l_in = lasagne.layers.InputLayer((2, 3, 4))
+    layer_name = 'l_rec'
+    l_rec = lasagne.layers.RecurrentLayer(l_in, 4, name=layer_name)
+    assert l_rec.b.name == layer_name + '.input_to_hidden.b'
+    assert l_rec.W_in_to_hid.name == layer_name + '.input_to_hidden.W'
+    assert l_rec.W_hid_to_hid.name == layer_name + '.hidden_to_hidden.W'
+
+
+def test_custom_recurrent_arbitrary_shape():
+    # Check that the custom recurrent layer can handle more than 1 feature dim
+    n_batch, n_steps, n_channels, width, height = (2, 3, 4, 5, 6)
+    n_out_filters = 7
+    filter_shape = (3, 3)
+    l_in = lasagne.layers.InputLayer(
+        (n_batch, n_steps, n_channels, width, height))
+    l_in_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((None, n_channels, width, height)),
+        n_out_filters, filter_shape, pad='same')
+    l_hid_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((None, n_out_filters, width, height)),
+        n_out_filters, filter_shape, pad='same')
+    l_rec = lasagne.layers.CustomRecurrentLayer(
+        l_in, l_in_to_hid, l_hid_to_hid)
+    assert l_rec.output_shape == (n_batch, n_steps, n_out_filters, width,
+                                  height)
+    out = theano.function([l_in.input_var], lasagne.layers.get_output(l_rec))
+    out_shape = out(np.zeros((n_batch, n_steps, n_channels, width, height),
+                             dtype=theano.config.floatX)).shape
+    assert out_shape == (n_batch, n_steps, n_out_filters, width, height)
+
+
+def test_recurrent_init_shape_error():
+    # Check that the custom recurrent layer throws errors for invalid shapes
+    n_batch, n_steps, n_channels, width, height = (2, 3, 4, 5, 6)
+    n_out_filters = 7
+    filter_shape = (3, 3)
+    l_in = lasagne.layers.InputLayer(
+        (n_batch, n_steps, n_channels, width, height))
+    l_hid_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((n_batch, n_out_filters, width, height)),
+        n_out_filters, filter_shape, pad='same')
+
+    # When precompute_input == True, input_to_hidden.shape[0] must be None
+    # or n_batch*n_steps
+    l_in_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((n_batch, n_channels, width, height)),
+        n_out_filters, filter_shape, pad='same')
+    with pytest.raises(ValueError):
+        l_rec = lasagne.layers.CustomRecurrentLayer(
+            l_in, l_in_to_hid, l_hid_to_hid, precompute_input=True)
+
+    # When precompute_input = False, input_to_hidden.shape[0] must be None
+    # or hidden_to_hidden.shape[0]
+    l_in_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((n_batch + 1, n_channels, width, height)),
+        n_out_filters, filter_shape, pad='same')
+    with pytest.raises(ValueError):
+        l_rec = lasagne.layers.CustomRecurrentLayer(
+            l_in, l_in_to_hid, l_hid_to_hid, precompute_input=False)
+
+    # In any case, input_to_hidden and hidden_to_hidden's output shapes after
+    # the first dimension must match
+    l_in_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((None, n_channels, width + 1, height)),
+        n_out_filters, filter_shape, pad='same')
+    with pytest.raises(ValueError):
+        l_rec = lasagne.layers.CustomRecurrentLayer(
+            l_in, l_in_to_hid, l_hid_to_hid)
+
+    # And, the output shape of input_to_hidden must match the input shape
+    # of hidden_to_hidden past the first dimension.  By not using padding,
+    # the output of l_in_to_hid will be cropped, which will make the
+    # shape inappropriate.
+    l_in_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((None, n_channels, width, height)),
+        n_out_filters, filter_shape)
+    l_hid_to_hid = lasagne.layers.Conv2DLayer(
+        lasagne.layers.InputLayer((n_batch, n_out_filters, width, height)),
+        n_out_filters, filter_shape)
+    with pytest.raises(ValueError):
+        l_rec = lasagne.layers.CustomRecurrentLayer(
+            l_in, l_in_to_hid, l_hid_to_hid)
+
+
+def test_recurrent_grad_clipping():
+    num_units = 5
+    batch_size = 3
+    seq_len = 2
+    n_inputs = 4
+    in_shp = (batch_size, seq_len, n_inputs)
+    l_inp = InputLayer(in_shp)
+    x = T.tensor3()
+    l_rec = RecurrentLayer(l_inp, num_units, grad_clipping=1.0)
+    output = lasagne.layers.get_output(l_rec, x)
+
+
+def test_recurrent_bck():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+
+    x_in = np.ones(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_rec_fwd = RecurrentLayer(l_inp, num_units=num_units, backwards=False)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_bck = RecurrentLayer(l_inp, num_units=num_units, backwards=True)
+    l_out_fwd = helper.get_output(l_rec_fwd, x)
+    l_out_bck = helper.get_output(l_rec_bck, x)
+
+    output_fwd = l_out_fwd.eval({x: x_in})
+    output_bck = l_out_bck.eval({x: x_in})
+
+    # with a constant input, reversing the backwards layer's output along the
+    # time axis should recover the forward layer's output
+    np.testing.assert_almost_equal(output_fwd, output_bck[:, ::-1])
+
+
+def test_recurrent_variable_input_size():
+    # check that a seq_len and batch_size of None work
+    num_batch, n_features1 = 6, 5
+    num_units = 13
+    x = T.tensor3()
+
+    in_shp = (None, None, n_features1)
+    l_inp = InputLayer(in_shp)
+    x_in1 = np.ones((num_batch+1, 10, n_features1)).astype('float32')
+    x_in2 = np.ones((num_batch, 15, n_features1)).astype('float32')
+    l_rec = RecurrentLayer(l_inp, num_units=num_units, backwards=False)
+    output = helper.get_output(l_rec, x)
+    output_val1 = output.eval({x: x_in1})
+    output_val2 = output.eval({x: x_in2})
+
+
+def test_recurrent_unroll_scan_fwd():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones(in_shp[:2]).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_rec_scan = RecurrentLayer(l_inp, num_units=num_units, backwards=False,
+                                unroll_scan=False, mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_unroll = RecurrentLayer(l_inp, num_units=num_units, backwards=False,
+                                  unroll_scan=True, mask_input=l_mask_inp)
+    output_scan = helper.get_output(l_rec_scan)
+    output_unrolled = helper.get_output(l_rec_unroll)
+
+    output_scan_val = output_scan.eval(
+        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
+    output_unrolled_val = output_unrolled.eval(
+        {l_inp.input_var: x_in, l_mask_inp.input_var: mask_in})
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_recurrent_unroll_scan_bck():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    x_in = np.random.random(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_rec_scan = RecurrentLayer(l_inp, num_units=num_units, backwards=True,
+                                unroll_scan=False)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_unroll = RecurrentLayer(l_inp, num_units=num_units, backwards=True,
+                                  unroll_scan=True)
+    output_scan = helper.get_output(l_rec_scan, x)
+    output_unrolled = helper.get_output(l_rec_unroll, x)
+    output_scan_val = output_scan.eval({x: x_in})
+    output_unrolled_val = output_unrolled.eval({x: x_in})
+
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_recurrent_precompute():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones((num_batch, seq_len), dtype='float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_rec_precompute = RecurrentLayer(l_inp, num_units=num_units,
+                                      precompute_input=True,
+                                      mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_no_precompute = RecurrentLayer(l_inp, num_units=num_units,
+                                         precompute_input=False,
+                                         mask_input=l_mask_inp)
+    output_precompute = helper.get_output(
+        l_rec_precompute).eval({l_inp.input_var: x_in,
+                                l_mask_inp.input_var: mask_in})
+    output_no_precompute = helper.get_output(
+        l_rec_no_precompute).eval({l_inp.input_var: x_in,
+                                   l_mask_inp.input_var: mask_in})
+
+    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
+
+
+def test_recurrent_return_final():
+    num_batch, seq_len, n_features = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features)
+    x_in = np.random.random(in_shp).astype('float32')
+
+    l_inp = InputLayer(in_shp)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_final = RecurrentLayer(l_inp, num_units, only_return_final=True)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_all = RecurrentLayer(l_inp, num_units, only_return_final=False)
+
+    output_final = helper.get_output(l_rec_final).eval({l_inp.input_var: x_in})
+    output_all = helper.get_output(l_rec_all).eval({l_inp.input_var: x_in})
+
+    assert output_final.shape == (output_all.shape[0], output_all.shape[2])
+    assert output_final.shape == lasagne.layers.get_output_shape(l_rec_final)
+    assert np.allclose(output_final, output_all[:, -1])
+
+
+def test_lstm_return_shape():
+    num_batch, seq_len, n_features1, n_features2 = 5, 3, 10, 11
+    num_units = 6
+    x = T.tensor4()
+    in_shp = (num_batch, seq_len, n_features1, n_features2)
+    l_inp = InputLayer(in_shp)
+
+    x_in = np.random.random(in_shp).astype('float32')
+
+    l_lstm = LSTMLayer(l_inp, num_units=num_units)
+    output = helper.get_output(l_lstm, x)
+    output_val = output.eval({x: x_in})
+    assert helper.get_output_shape(l_lstm, x_in.shape) == output_val.shape
+    assert output_val.shape == (num_batch, seq_len, num_units)
+
+
+def test_lstm_grad():
+    num_batch, seq_len, n_features = 5, 3, 10
+    num_units = 6
+    l_inp = InputLayer((num_batch, seq_len, n_features))
+    l_lstm = LSTMLayer(l_inp, num_units=num_units)
+    output = helper.get_output(l_lstm)
+    g = T.grad(T.mean(output), lasagne.layers.get_all_params(l_lstm))
+    assert isinstance(g, (list, tuple))
+
+
+def test_lstm_nparams_no_peepholes():
+    l_inp = InputLayer((2, 2, 3))
+    l_lstm = LSTMLayer(l_inp, 5, peepholes=False, learn_init=False)
+
+    # 3*n_gates
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
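+    # with the 4 gate-like parts (ingate, forgetgate, cell, outgate) this
+    # gives 4 * 3 = 12 trainable parameters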
+    assert len(lasagne.layers.get_all_params(l_lstm, trainable=True)) == 12
+
+    # bias params + init params
+    assert len(lasagne.layers.get_all_params(l_lstm, regularizable=False)) == 6
+
+
+def test_lstm_nparams_peepholes():
+    l_inp = InputLayer((2, 2, 3))
+    l_lstm = LSTMLayer(l_inp, 5, peepholes=True, learn_init=False)
+
+    # 3*n_gates + peepholes(3).
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
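+    # with 4 gate-like parts that is 4 * 3 = 12, plus the peephole weights
+    # W_cell_to_ingate, W_cell_to_forgetgate and W_cell_to_outgate = 15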
+    assert len(lasagne.layers.get_all_params(l_lstm, trainable=True)) == 15
+
+    # bias params(4) + init params(2)
+    assert len(lasagne.layers.get_all_params(l_lstm, regularizable=False)) == 6
+
+
+def test_lstm_nparams_learn_init():
+    l_inp = InputLayer((2, 2, 3))
+    l_lstm = LSTMLayer(l_inp, 5, peepholes=False, learn_init=True)
+
+    # 3*n_gates + inits(2).
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
+    assert len(lasagne.layers.get_all_params(l_lstm, trainable=True)) == 14
+
+    # bias params(4) + init params(2)
+    assert len(lasagne.layers.get_all_params(l_lstm, regularizable=False)) == 6
+
+
+def test_lstm_hid_init_layer():
+    # test that you can set hid_init to be a layer
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_cell_h = InputLayer((2, 5))
+    l_lstm = LSTMLayer(l_inp, 5, hid_init=l_inp_h, cell_init=l_cell_h)
+
+    x = T.tensor3()
+    h = T.matrix()
+
+    output = lasagne.layers.get_output(l_lstm, {l_inp: x, l_inp_h: h})
+
+
+def test_lstm_nparams_hid_init_layer():
+    # test that you can see layers through hid_init
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_h_de = DenseLayer(l_inp_h, 7)
+    l_inp_cell = InputLayer((2, 5))
+    l_inp_cell_de = DenseLayer(l_inp_cell, 7)
+    l_lstm = LSTMLayer(l_inp, 7, hid_init=l_inp_h_de, cell_init=l_inp_cell_de)
+
+    # directly check the layers can be seen through hid_init
+    layers_to_find = [l_inp, l_inp_h, l_inp_h_de, l_inp_cell, l_inp_cell_de,
+                      l_lstm]
+    assert lasagne.layers.get_all_layers(l_lstm) == layers_to_find
+
+    # 3*n_gates + peepholes(3) + 4
+    # the 3 is because we have hid_to_gate, in_to_gate and bias for each gate
+    # 4 is for the W and b parameters in the two DenseLayer layers
+    assert len(lasagne.layers.get_all_params(l_lstm, trainable=True)) == 19
+
+    # LSTM bias params(4) + Dense bias params(1) * 2
+    assert len(lasagne.layers.get_all_params(l_lstm, regularizable=False)) == 6
+
+
+def test_lstm_hid_init_mask():
+    # test that you can set hid_init to be a layer when a mask is provided
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_msk = InputLayer((2, 2))
+    l_cell_h = InputLayer((2, 5))
+    l_lstm = LSTMLayer(l_inp, 5, hid_init=l_inp_h, mask_input=l_inp_msk,
+                       cell_init=l_cell_h)
+
+    x = T.tensor3()
+    h = T.matrix()
+    msk = T.matrix()
+
+    inputs = {l_inp: x, l_inp_h: h, l_inp_msk: msk}
+    output = lasagne.layers.get_output(l_lstm, inputs)
+
+
+def test_lstm_hid_init_layer_eval():
+    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
+    # a network with a `Layer` as input to `hid_init` to a network with a
+    # `np.array` as input to `hid_init`
+    n_units = 7
+    n_test_cases = 2
+    in_shp = (n_test_cases, 2, 3)
+    in_h_shp = (1, n_units)
+    in_cell_shp = (1, n_units)
+
+    # dummy inputs
+    X_test = np.ones(in_shp, dtype=theano.config.floatX)
+    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
+    Xc_test = np.ones(in_cell_shp, dtype=theano.config.floatX)
+    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
+    Xc_test_batch = np.tile(Xc_test, (n_test_cases, 1))
+
+    # network with `Layer` initializer for hid_init
+    l_inp = InputLayer(in_shp)
+    l_inp_h = InputLayer(in_h_shp)
+    l_inp_cell = InputLayer(in_cell_shp)
+    l_rec_inp_layer = LSTMLayer(l_inp, n_units, hid_init=l_inp_h,
+                                cell_init=l_inp_cell, nonlinearity=None)
+
+    # network with `np.array` initializer for hid_init
+    l_rec_nparray = LSTMLayer(l_inp, n_units, hid_init=Xh_test,
+                              cell_init=Xc_test, nonlinearity=None)
+
+    # copy network parameters from l_rec_inp_layer to l_rec_nparray
+    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
+    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
+    for k, v in l_rn_param.items():
+        if k in l_il_param:
+            v.set_value(l_il_param[k].get_value())
+
+    # build the theano functions
+    X = T.tensor3()
+    Xh = T.matrix()
+    Xc = T.matrix()
+    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
+                                                 {l_inp: X, l_inp_h:
+                                                  Xh, l_inp_cell: Xc})
+    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})
+
+    # test both nets with dummy input
+    output_val_inp_layer = output_inp_layer.eval({X: X_test, Xh: Xh_test_batch,
+                                                  Xc: Xc_test_batch})
+    output_val_nparray = output_nparray.eval({X: X_test})
+
+    # check output given `Layer` is the same as with `np.array`
+    assert np.allclose(output_val_inp_layer, output_val_nparray)
+
+
+def test_lstm_grad_clipping():
+    # test that the grad_clipping parameter can be set
+    x = T.tensor3()
+    l_rec = LSTMLayer(InputLayer((2, 2, 3)), 5, grad_clipping=1)
+    output = lasagne.layers.get_output(l_rec, x)
+
+
+def test_lstm_bck():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+
+    x_in = np.ones(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_fwd = LSTMLayer(l_inp, num_units=num_units, backwards=False)
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_bck = LSTMLayer(l_inp, num_units=num_units, backwards=True)
+    output_fwd = helper.get_output(l_lstm_fwd, x)
+    output_bck = helper.get_output(l_lstm_bck, x)
+
+    output_fwd_val = output_fwd.eval({x: x_in})
+    output_bck_val = output_bck.eval({x: x_in})
+
+    # with a constant input, reversing the backwards layer's output along the
+    # time axis should recover the forward layer's output
+    np.testing.assert_almost_equal(output_fwd_val, output_bck_val[:, ::-1])
+
+
+def test_lstm_precompute():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones((num_batch, seq_len), dtype='float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_precompute = LSTMLayer(
+        l_inp, num_units=num_units, precompute_input=True,
+        mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_no_precompute = LSTMLayer(
+        l_inp, num_units=num_units, precompute_input=False,
+        mask_input=l_mask_inp)
+    output_precompute = helper.get_output(
+        l_lstm_precompute).eval({l_inp.input_var: x_in,
+                                 l_mask_inp.input_var: mask_in})
+    output_no_precompute = helper.get_output(
+        l_lstm_no_precompute).eval({l_inp.input_var: x_in,
+                                    l_mask_inp.input_var: mask_in})
+
+    # precomputing the input dot product should not change the output
+    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
+
+
+def test_lstm_variable_input_size():
+    # check that a seq_len and batch_size of None work
+    num_batch, n_features1 = 6, 5
+    num_units = 13
+    x = T.tensor3()
+
+    in_shp = (None, None, n_features1)
+    l_inp = InputLayer(in_shp)
+    x_in1 = np.ones((num_batch+1, 3+1, n_features1)).astype('float32')
+    x_in2 = np.ones((num_batch, 3, n_features1)).astype('float32')
+    l_rec = LSTMLayer(l_inp, num_units=num_units, backwards=False)
+    output = helper.get_output(l_rec, x)
+    output_val1 = output.eval({x: x_in1})
+    output_val2 = output.eval({x: x_in2})
+
+
+def test_lstm_unroll_scan_fwd():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones(in_shp[:2]).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_scan = LSTMLayer(l_inp, num_units=num_units, backwards=False,
+                            unroll_scan=False, mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_unrolled = LSTMLayer(l_inp, num_units=num_units, backwards=False,
+                                unroll_scan=True, mask_input=l_mask_inp)
+    output_scan = helper.get_output(l_lstm_scan)
+    output_unrolled = helper.get_output(l_lstm_unrolled)
+
+    output_scan_val = output_scan.eval({l_inp.input_var: x_in,
+                                        l_mask_inp.input_var: mask_in})
+    output_unrolled_val = output_unrolled.eval({l_inp.input_var: x_in,
+                                                l_mask_inp.input_var: mask_in})
+
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_lstm_unroll_scan_bck():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+
+    x_in = np.random.random(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_scan = LSTMLayer(l_inp, num_units=num_units, backwards=True,
+                            unroll_scan=False)
+    lasagne.random.get_rng().seed(1234)
+    l_lstm_unrolled = LSTMLayer(l_inp, num_units=num_units, backwards=True,
+                                unroll_scan=True)
+    output_scan = helper.get_output(l_lstm_scan, x)
+    output_scan_unrolled = helper.get_output(l_lstm_unrolled, x)
+
+    output_scan_val = output_scan.eval({x: x_in})
+    output_unrolled_val = output_scan_unrolled.eval({x: x_in})
+
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_lstm_passthrough():
+    # Tests that the LSTM can simply pass through its input
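+    # ingate and outgate are fixed at 1, forgetgate at 0, the cell input uses
+    # an identity W_in, and every nonlinearity is linear (None), so
+    # c_t = x_t and h_t = outgate * c_t = x_t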
+    l_in = InputLayer((4, 5, 6))
+    zero = lasagne.init.Constant(0.)
+    one = lasagne.init.Constant(1.)
+    pass_gate = Gate(zero, zero, zero, one, None)
+    no_gate = Gate(zero, zero, zero, zero, None)
+    in_pass_gate = Gate(
+        np.eye(6).astype(theano.config.floatX), zero, zero, zero, None)
+    l_rec = LSTMLayer(
+        l_in, 6, pass_gate, no_gate, in_pass_gate, pass_gate, None)
+    out = lasagne.layers.get_output(l_rec)
+    inp = np.arange(4*5*6).reshape(4, 5, 6).astype(theano.config.floatX)
+    np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
+
+
+def test_lstm_return_final():
+    num_batch, seq_len, n_features = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features)
+    x_in = np.random.random(in_shp).astype('float32')
+
+    l_inp = InputLayer(in_shp)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_final = LSTMLayer(l_inp, num_units, only_return_final=True)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_all = LSTMLayer(l_inp, num_units, only_return_final=False)
+
+    output_final = helper.get_output(l_rec_final).eval({l_inp.input_var: x_in})
+    output_all = helper.get_output(l_rec_all).eval({l_inp.input_var: x_in})
+
+    assert output_final.shape == (output_all.shape[0], output_all.shape[2])
+    assert output_final.shape == lasagne.layers.get_output_shape(l_rec_final)
+    assert np.allclose(output_final, output_all[:, -1])
+
+
+def test_gru_return_shape():
+    num_batch, seq_len, n_features1, n_features2 = 5, 3, 10, 11
+    num_units = 6
+    x = T.tensor4()
+    in_shp = (num_batch, seq_len, n_features1, n_features2)
+    l_inp = InputLayer(in_shp)
+    l_rec = GRULayer(l_inp, num_units=num_units)
+
+    x_in = np.random.random(in_shp).astype('float32')
+    output = helper.get_output(l_rec, x)
+    output_val = output.eval({x: x_in})
+
+    assert helper.get_output_shape(l_rec, x_in.shape) == output_val.shape
+    assert output_val.shape == (num_batch, seq_len, num_units)
+
+
+def test_gru_grad():
+    num_batch, seq_len, n_features = 5, 3, 10
+    num_units = 6
+    l_inp = InputLayer((num_batch, seq_len, n_features))
+    l_gru = GRULayer(l_inp,
+                     num_units=num_units)
+    output = helper.get_output(l_gru)
+    g = T.grad(T.mean(output), lasagne.layers.get_all_params(l_gru))
+    assert isinstance(g, (list, tuple))
+
+
+def test_gru_nparams_learn_init_false():
+    l_inp = InputLayer((2, 2, 3))
+    l_gru = GRULayer(l_inp, 5, learn_init=False)
+
+    # 3*n_gates
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
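+    # with the 3 gate-like parts (resetgate, updategate, hidden_update) this
+    # gives 3 * 3 = 9 trainable parameters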
+    assert len(lasagne.layers.get_all_params(l_gru, trainable=True)) == 9
+
+    # bias params(3) + hid_init
+    assert len(lasagne.layers.get_all_params(l_gru, regularizable=False)) == 4
+
+
+def test_gru_nparams_learn_init_true():
+    l_inp = InputLayer((2, 2, 3))
+    l_gru = GRULayer(l_inp, 5, learn_init=True)
+
+    # 3*n_gates + hid_init
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
+    assert len(lasagne.layers.get_all_params(l_gru, trainable=True)) == 10
+
+    # bias params(3) + init params(1)
+    assert len(lasagne.layers.get_all_params(l_gru, regularizable=False)) == 4
+
+
+def test_gru_hid_init_layer():
+    # test that you can set hid_init to be a layer
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_gru = GRULayer(l_inp, 5, hid_init=l_inp_h)
+
+    x = T.tensor3()
+    h = T.matrix()
+
+    output = lasagne.layers.get_output(l_gru, {l_inp: x, l_inp_h: h})
+
+
+def test_gru_nparams_hid_init_layer():
+    # test that you can see layers through hid_init
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_h_de = DenseLayer(l_inp_h, 7)
+    l_gru = GRULayer(l_inp, 7, hid_init=l_inp_h_de)
+
+    # directly check the layers can be seen through hid_init
+    assert lasagne.layers.get_all_layers(l_gru) == [l_inp, l_inp_h, l_inp_h_de,
+                                                    l_gru]
+
+    # 3*n_gates + 2
+    # the 3 is because we have  hid_to_gate, in_to_gate and bias for each gate
+    # 2 is for the W and b parameters in the DenseLayer
+    assert len(lasagne.layers.get_all_params(l_gru, trainable=True)) == 11
+
+    # GRU bias params(3) + Dense bias params(1)
+    assert len(lasagne.layers.get_all_params(l_gru, regularizable=False)) == 4
+
+
+def test_gru_hid_init_layer_eval():
+    # Test `hid_init` as a `Layer` with some dummy input. Compare the output of
+    # a network with a `Layer` as input to `hid_init` to a network with a
+    # `np.array` as input to `hid_init`
+    n_units = 7
+    n_test_cases = 2
+    in_shp = (n_test_cases, 2, 3)
+    in_h_shp = (1, n_units)
+
+    # dummy inputs
+    X_test = np.ones(in_shp, dtype=theano.config.floatX)
+    Xh_test = np.ones(in_h_shp, dtype=theano.config.floatX)
+    Xh_test_batch = np.tile(Xh_test, (n_test_cases, 1))
+
+    # network with `Layer` initializer for hid_init
+    l_inp = InputLayer(in_shp)
+    l_inp_h = InputLayer(in_h_shp)
+    l_rec_inp_layer = GRULayer(l_inp, n_units, hid_init=l_inp_h)
+
+    # network with `np.array` initializer for hid_init
+    l_rec_nparray = GRULayer(l_inp, n_units, hid_init=Xh_test)
+
+    # copy network parameters from l_rec_inp_layer to l_rec_nparray
+    l_il_param = dict([(p.name, p) for p in l_rec_inp_layer.get_params()])
+    l_rn_param = dict([(p.name, p) for p in l_rec_nparray.get_params()])
+    for k, v in l_rn_param.items():
+        if k in l_il_param:
+            v.set_value(l_il_param[k].get_value())
+
+    # build the theano functions
+    X = T.tensor3()
+    Xh = T.matrix()
+    output_inp_layer = lasagne.layers.get_output(l_rec_inp_layer,
+                                                 {l_inp: X, l_inp_h: Xh})
+    output_nparray = lasagne.layers.get_output(l_rec_nparray, {l_inp: X})
+
+    # test both nets with dummy input
+    output_val_inp_layer = output_inp_layer.eval({X: X_test,
+                                                  Xh: Xh_test_batch})
+    output_val_nparray = output_nparray.eval({X: X_test})
+
+    # check output given `Layer` is the same as with `np.array`
+    assert np.allclose(output_val_inp_layer, output_val_nparray)
+
+
+def test_gru_hid_init_mask():
+    # test that you can set hid_init to be a layer when a mask is provided
+    l_inp = InputLayer((2, 2, 3))
+    l_inp_h = InputLayer((2, 5))
+    l_inp_msk = InputLayer((2, 2))
+    l_gru = GRULayer(l_inp, 5, hid_init=l_inp_h, mask_input=l_inp_msk)
+
+    x = T.tensor3()
+    h = T.matrix()
+    msk = T.matrix()
+
+    inputs = {l_inp: x, l_inp_h: h, l_inp_msk: msk}
+    output = lasagne.layers.get_output(l_gru, inputs)
+
+
+def test_gru_grad_clipping():
+    # test that the grad_clipping parameter can be set
+    x = T.tensor3()
+    l_rec = GRULayer(InputLayer((2, 2, 3)), 5, grad_clipping=1)
+    output = lasagne.layers.get_output(l_rec, x)
+
+
+def test_gru_bck():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+
+    x_in = np.ones(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_gru_fwd = GRULayer(l_inp, num_units=num_units, backwards=False)
+    lasagne.random.get_rng().seed(1234)
+    l_gru_bck = GRULayer(l_inp, num_units=num_units, backwards=True)
+    output_fwd = helper.get_output(l_gru_fwd, x)
+    output_bck = helper.get_output(l_gru_bck, x)
+
+    output_fwd_val = output_fwd.eval({x: x_in})
+    output_bck_val = output_bck.eval({x: x_in})
+
+    # with a constant input, reversing the backwards layer's output along the
+    # time axis should recover the forward layer's output
+    np.testing.assert_almost_equal(output_fwd_val, output_bck_val[:, ::-1])
+
+
+def test_gru_variable_input_size():
+    # check that a seq_len and batch_size of None work
+    num_batch, n_features1 = 6, 5
+    num_units = 13
+    x = T.tensor3()
+
+    in_shp = (None, None, n_features1)
+    l_inp = InputLayer(in_shp)
+    x_in1 = np.ones((num_batch+1, 10, n_features1)).astype('float32')
+    x_in2 = np.ones((num_batch, 15, n_features1)).astype('float32')
+    l_rec = GRULayer(l_inp, num_units=num_units, backwards=False)
+    output = helper.get_output(l_rec, x)
+
+    output.eval({x: x_in1})
+    output.eval({x: x_in2})
+
+
+def test_gru_unroll_scan_fwd():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones(in_shp[:2]).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_gru_scan = GRULayer(l_inp, num_units=num_units, backwards=False,
+                          unroll_scan=False, mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_gru_unrolled = GRULayer(l_inp, num_units=num_units, backwards=False,
+                              unroll_scan=True, mask_input=l_mask_inp)
+    output_scan = helper.get_output(l_gru_scan)
+    output_unrolled = helper.get_output(l_gru_unrolled)
+
+    output_scan_val = output_scan.eval({l_inp.input_var: x_in,
+                                        l_mask_inp.input_var: mask_in})
+    output_unrolled_val = output_unrolled.eval({l_inp.input_var: x_in,
+                                                l_mask_inp.input_var: mask_in})
+
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_gru_unroll_scan_bck():
+    num_batch, seq_len, n_features1 = 2, 5, 4
+    num_units = 2
+    x = T.tensor3()
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    x_in = np.random.random(in_shp).astype('float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_gru_scan = GRULayer(l_inp, num_units=num_units, backwards=True,
+                          unroll_scan=False)
+    lasagne.random.get_rng().seed(1234)
+    l_gru_unrolled = GRULayer(l_inp, num_units=num_units, backwards=True,
+                              unroll_scan=True)
+    output_scan = helper.get_output(l_gru_scan, x)
+    output_unrolled = helper.get_output(l_gru_unrolled, x)
+
+    output_scan_val = output_scan.eval({x: x_in})
+    output_unrolled_val = output_unrolled.eval({x: x_in})
+
+    np.testing.assert_almost_equal(output_scan_val, output_unrolled_val)
+
+
+def test_gru_precompute():
+    num_batch, seq_len, n_features1 = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features1)
+    l_inp = InputLayer(in_shp)
+    l_mask_inp = InputLayer(in_shp[:2])
+
+    x_in = np.random.random(in_shp).astype('float32')
+    mask_in = np.ones((num_batch, seq_len), dtype='float32')
+
+    # need to set random seed.
+    lasagne.random.get_rng().seed(1234)
+    l_gru_precompute = GRULayer(l_inp, num_units=num_units,
+                                precompute_input=True, mask_input=l_mask_inp)
+    lasagne.random.get_rng().seed(1234)
+    l_gru_no_precompute = GRULayer(l_inp, num_units=num_units,
+                                   precompute_input=False,
+                                   mask_input=l_mask_inp)
+    output_precompute = helper.get_output(
+        l_gru_precompute).eval({l_inp.input_var: x_in,
+                                l_mask_inp.input_var: mask_in})
+    output_no_precompute = helper.get_output(
+        l_gru_no_precompute).eval({l_inp.input_var: x_in,
+                                   l_mask_inp.input_var: mask_in})
+
+    # precomputing the input dot product should not change the output
+    np.testing.assert_almost_equal(output_precompute, output_no_precompute)
+
+
+def test_gru_passthrough():
+    # Tests that the GRU can simply pass through its input
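+    # updategate is fixed at 1 and resetgate at 0, and hidden_update applies
+    # an identity W_in with a linear nonlinearity, so h_t = hidden_update = x_t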
+    l_in = InputLayer((4, 5, 6))
+    zero = lasagne.init.Constant(0.)
+    one = lasagne.init.Constant(1.)
+    pass_gate = Gate(zero, zero, None, one, None)
+    no_gate = Gate(zero, zero, None, zero, None)
+    in_pass_gate = Gate(
+        np.eye(6).astype(theano.config.floatX), zero, None, zero, None)
+    l_rec = GRULayer(l_in, 6, no_gate, pass_gate, in_pass_gate)
+    out = lasagne.layers.get_output(l_rec)
+    inp = np.arange(4*5*6).reshape(4, 5, 6).astype(theano.config.floatX)
+    np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
+
+
+def test_gru_return_final():
+    num_batch, seq_len, n_features = 2, 3, 4
+    num_units = 2
+    in_shp = (num_batch, seq_len, n_features)
+    x_in = np.random.random(in_shp).astype('float32')
+
+    l_inp = InputLayer(in_shp)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_final = GRULayer(l_inp, num_units, only_return_final=True)
+    lasagne.random.get_rng().seed(1234)
+    l_rec_all = GRULayer(l_inp, num_units, only_return_final=False)
+
+    output_final = helper.get_output(l_rec_final).eval({l_inp.input_var: x_in})
+    output_all = helper.get_output(l_rec_all).eval({l_inp.input_var: x_in})
+
+    assert output_final.shape == (output_all.shape[0], output_all.shape[2])
+    assert output_final.shape == lasagne.layers.get_output_shape(l_rec_final)
+    assert np.allclose(output_final, output_all[:, -1])
+
+
+def test_gradient_steps_error():
+    # Check that an error is raised if gradient_steps is not -1 and
+    # unroll_scan is True
+    l_in = InputLayer((2, 2, 3))
+    with pytest.raises(ValueError):
+        RecurrentLayer(l_in, 5, gradient_steps=3, unroll_scan=True)
+
+    with pytest.raises(ValueError):
+        LSTMLayer(l_in, 5, gradient_steps=3, unroll_scan=True)
+
+    with pytest.raises(ValueError):
+        GRULayer(l_in, 5, gradient_steps=3, unroll_scan=True)
+
+
+def test_unroll_none_input_error():
+    # Test that a ValueError is raised if unroll scan is True and the input
+    # sequence length is specified as None.
+    l_in = InputLayer((2, None, 3))
+    with pytest.raises(ValueError):
+        RecurrentLayer(l_in, 5, unroll_scan=True)
+
+    with pytest.raises(ValueError):
+        LSTMLayer(l_in, 5, unroll_scan=True)
+
+    with pytest.raises(ValueError):
+        GRULayer(l_in, 5, unroll_scan=True)
+
+
+def test_CustomRecurrentLayer_child_kwargs():
+    in_shape = (2, 3, 4)
+    n_hid = 5
+    # Construct mock for input-to-hidden layer
+    in_to_hid = Mock(
+        Layer,
+        output_shape=(in_shape[0]*in_shape[1], n_hid),
+        input_shape=(in_shape[0]*in_shape[1], in_shape[2]),
+        input_layer=InputLayer((in_shape[0]*in_shape[1], in_shape[2])),
+        get_output_kwargs=['foo'])
+    # These two functions get called, need to return dummy values for them
+    in_to_hid.get_output_for.return_value = T.matrix()
+    in_to_hid.get_params.return_value = []
+    # As above, for hidden-to-hidden layer
+    hid_to_hid = Mock(
+        Layer,
+        output_shape=(in_shape[0], n_hid),
+        input_shape=(in_shape[0], n_hid),
+        input_layer=InputLayer((in_shape[0], n_hid)),
+        get_output_kwargs=[])
+    hid_to_hid.get_output_for.return_value = T.matrix()
+    hid_to_hid.get_params.return_value = []
+    # Construct a CustomRecurrentLayer using these Mocks
+    l_rec = lasagne.layers.CustomRecurrentLayer(
+        InputLayer(in_shape), in_to_hid, hid_to_hid)
+    # Call get_output with a kwarg, should be passed to in_to_hid and
+    # hid_to_hid
+    helper.get_output(l_rec, foo='bar')
+    # Retrieve the arguments used to call in_to_hid.get_output_for
+    args, kwargs = in_to_hid.get_output_for.call_args
+    # Should be one argument - the Theano expression
+    assert len(args) == 1
+    # One keyword argument - should be 'foo' -> 'bar'
+    assert kwargs == {'foo': 'bar'}
+    # Same as with in_to_hid
+    args, kwargs = hid_to_hid.get_output_for.call_args
+    assert len(args) == 1
+    assert kwargs == {'foo': 'bar'}
diff --git a/lasagne/tests/layers/test_shape.py b/lasagne/tests/layers/test_shape.py
new file mode 100644
index 0000000..2e1de90
--- /dev/null
+++ b/lasagne/tests/layers/test_shape.py
@@ -0,0 +1,291 @@
+import numpy as np
+import pytest
+import theano
+
+from mock import Mock
+
+
+class TestFlattenLayer:
+    @pytest.fixture
+    def layer(self):
+        from lasagne.layers.shape import FlattenLayer
+        return FlattenLayer(Mock(output_shape=(None,)))
+
+    @pytest.fixture
+    def layer_outdim3(self):
+        from lasagne.layers.shape import FlattenLayer
+        return FlattenLayer(Mock(output_shape=(None,)), outdim=3)
+
+    @pytest.fixture
+    def layer_outdim1(self):
+        from lasagne.layers.shape import FlattenLayer
+        return FlattenLayer(Mock(output_shape=(None,)), outdim=1)
+
+    def test_get_output_shape_for(self, layer):
+        input_shape = (2, 3, 4, 5)
+        assert layer.get_output_shape_for(input_shape) == (2, 3 * 4 * 5)
+
+    def test_get_output_shape_for_contain_none(self, layer):
+        input_shape = (2, 3, None, 5)
+        assert layer.get_output_shape_for(input_shape) == (2, None)
+
+    def test_get_output_for(self, layer):
+        input = np.random.random((2, 3, 4, 5))
+        result = layer.get_output_for(theano.shared(input)).eval()
+        assert (result == input.reshape((input.shape[0], -1))).all()
+
+    def test_get_output_shape_for_outdim3(self, layer_outdim3):
+        input_shape = (2, 3, 4, 5)
+        assert layer_outdim3.get_output_shape_for(input_shape) == (2, 3, 4 * 5)
+
+    def test_get_output_for_outdim3(self, layer_outdim3):
+        input = np.random.random((2, 3, 4, 5))
+        result = layer_outdim3.get_output_for(theano.shared(input)).eval()
+        assert (result == input.reshape(
+            (input.shape[0], input.shape[1], -1))).all()
+
+    def test_get_output_shape_for_outdim1(self, layer_outdim1):
+        input_shape = (2, 3, 4, 5)
+        assert layer_outdim1.get_output_shape_for(input_shape) == (
+            2 * 3 * 4 * 5, )
+
+    def test_get_output_for_outdim1(self, layer_outdim1):
+        input = np.random.random((2, 3, 4, 5))
+        result = layer_outdim1.get_output_for(theano.shared(input)).eval()
+        assert (result == input.reshape(-1)).all()
+
+    def test_dim0_raises(self):
+        from lasagne.layers.shape import FlattenLayer
+        with pytest.raises(ValueError):
+            FlattenLayer((2, 3, 4), outdim=0)
+
+
+class TestPadLayer:
+    @pytest.fixture
+    def layerclass(self):
+        from lasagne.layers.shape import PadLayer
+        return PadLayer
+
+    @pytest.mark.parametrize(
+        "width, input_shape, output_shape",
+        [(3, (2, 3, 4, 5), (2, 3, 10, 11)),
+         ((2, 3), (2, 3, 4, 5), (2, 3, 8, 11)),
+         (((1, 2), (3, 4)), (2, 3, 4, 5), (2, 3, 7, 12)),
+         (3, (2, 3, None, 5), (2, 3, None, 11)),
+         ((2, 3), (2, 3, 4, None), (2, 3, 8, None)),
+         (((1, 2), (3, 4)), (None, 3, None, None), (None, 3, None, None)),
+         ])
+    def test_get_output_shape_for(self, layerclass,
+                                  width, input_shape, output_shape):
+        layer = layerclass(Mock(output_shape=(None,)), width=width)
+        assert layer.get_output_shape_for(input_shape) == output_shape
+
+    def test_get_output_for(self, layerclass):
+        layer = layerclass(Mock(output_shape=(None,)), width=2)
+        input = np.zeros((1, 2, 10))
+        trimmed = theano.shared(input[:, :, 2:-2])
+        result = layer.get_output_for(trimmed).eval()
+
+        assert (result == input).all()
+
+
+class TestReshapeLayer:
+    @pytest.fixture
+    def layerclass(self):
+        from lasagne.layers.shape import ReshapeLayer
+        return ReshapeLayer
+
+    @pytest.fixture
+    def two_unknown(self):
+        from lasagne.layers.input import InputLayer
+        shape = (16, 3, None, None, 10)
+        return (InputLayer(shape),
+                theano.shared(np.ones((16, 3, 5, 7, 10))))
+
+    def test_no_reference(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, (16, 3, 5, 7, 2, 5))
+        assert layer.output_shape == (16, 3, 5, 7, 2, 5)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 5, 7, 2, 5)
+
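+    # In a reshape spec, [i] copies the size of input dimension i at run time
+    # and -1 infers one remaining dimension from the total number of elements.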
+    def test_reference_both(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, (-1, [1], [2], [3], 2, 5))
+        assert layer.output_shape == (16, 3, None, None, 2, 5)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 5, 7, 2, 5)
+
+    def test_reference_one(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, (-1, [1], [2], 7, 2, 5))
+        assert layer.output_shape == (None, 3, None, 7, 2, 5)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 5, 7, 2, 5)
+
+    def test_reference_twice(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, (-1, [1], [2], [3], 2, [2]))
+        assert layer.output_shape == (None, 3, None, None, 2, None)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 5, 7, 2, 5)
+
+    def test_merge_with_unknown(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, ([0], [1], [2], -1))
+        assert layer.output_shape == (16, 3, None, None)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 5, 70)
+
+    def test_merge_two_unknowns(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        layer = layerclass(inputlayer, ([0], [1], -1, [4]))
+        assert layer.output_shape == (16, 3, None, 10)
+        result = layer.get_output_for(inputdata).eval()
+        assert result.shape == (16, 3, 35, 10)
+
+    def test_size_mismatch(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        with pytest.raises(ValueError) as excinfo:
+            layerclass(inputlayer, (17, 3, [2], [3], -1))
+        assert 'match' in str(excinfo.value)
+
+    def test_invalid_spec(self, layerclass, two_unknown):
+        inputlayer, inputdata = two_unknown
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, (-16, 3, 5, 7, 10))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, (-1, 3, 5, 7, -1))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, ([-1], 3, 5, 7, 10))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, ([0, 1], 3, 5, 7, 10))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, (None, 3, 5, 7, 10))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, (16, 3, 5, 7, [5]))
+        with pytest.raises(ValueError):
+            layerclass(inputlayer, (16, 3, theano.tensor.vector(), 7, 10))
+
+    def test_symbolic_shape(self):
+        from lasagne.layers import InputLayer, ReshapeLayer, get_output
+        x = theano.tensor.tensor3()
+        batch_size, seq_len, num_features = x.shape
+        l_inp = InputLayer((None, None, None))
+        l_rshp2 = ReshapeLayer(l_inp, (batch_size*seq_len, [2]))
+
+        # we cannot infer any of the output shapes because they are symbolic.
+        output_shape = l_rshp2.get_output_shape_for(
+            (batch_size, seq_len, num_features))
+        assert output_shape == (None, None)
+
+        output = get_output(l_rshp2, x)
+        out1 = output.eval({x: np.ones((3, 5, 6), dtype='float32')})
+        out2 = output.eval({x: np.ones((4, 5, 7), dtype='float32')})
+
+        assert out1.shape == (3*5, 6)
+        assert out2.shape == (4*5, 7)
+
+
+class TestDimshuffleLayer:
+    @pytest.fixture
+    def input_shape(self):
+        return (2, 3, 1, 5, 7)
+
+    @pytest.fixture
+    def input_var(self):
+        InputTensorType = theano.tensor.TensorType(
+            'float64', broadcastable=(False, False, True, False, False),
+            name='DimShuffleTestTensor')
+        return InputTensorType(name='x')
+
+    @pytest.fixture
+    def input_layer(self, input_shape, input_var):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(input_shape, input_var)
+
+    @pytest.fixture
+    def input_shape_with_None(self):
+        return (2, 3, None, 5, 7)
+
+    @pytest.fixture
+    def input_layer_with_None(self, input_shape_with_None, input_var):
+        from lasagne.layers.input import InputLayer
+        return InputLayer(input_shape_with_None, input_var)
+
+    @pytest.fixture
+    def input_data(self, input_shape):
+        return np.ones(input_shape)
+
+    def test_rearrange(self, input_data, input_var, input_layer):
+        from lasagne.layers.shape import DimshuffleLayer
+        ds = DimshuffleLayer(input_layer, [4, 3, 2, 1, 0])
+        assert ds.output_shape == (7, 5, 1, 3, 2)
+        assert ds.get_output_for(input_var).eval(
+            {input_var: input_data}).shape == (7, 5, 1, 3, 2)
+
+    def test_broadcast(self, input_data, input_var, input_layer):
+        from lasagne.layers.shape import DimshuffleLayer
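+        # 'x' in the pattern appends a new broadcastable axis of size 1.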
+        ds = DimshuffleLayer(input_layer, [0, 1, 2, 3, 4, 'x'])
+        assert ds.output_shape == (2, 3, 1, 5, 7, 1)
+        assert ds.get_output_for(input_var).eval(
+            {input_var: input_data}).shape == (2, 3, 1, 5, 7, 1)
+
+    def test_collapse(self, input_data, input_var, input_layer):
+        from lasagne.layers.shape import DimshuffleLayer
+        ds_ok = DimshuffleLayer(input_layer, [0, 1, 3, 4])
+        assert ds_ok.output_shape == (2, 3, 5, 7)
+        assert ds_ok.get_output_for(input_var).eval(
+            {input_var: input_data}).shape == (2, 3, 5, 7)
+        with pytest.raises(ValueError):
+            DimshuffleLayer(input_layer, [0, 1, 2, 4])
+
+    def test_collapse_None(self, input_data, input_var, input_layer_with_None):
+        from lasagne.layers.shape import DimshuffleLayer
+        ds_ok = DimshuffleLayer(input_layer_with_None, [0, 1, 3, 4])
+        assert ds_ok.output_shape == (2, 3, 5, 7)
+        assert ds_ok.get_output_for(input_var).eval(
+            {input_var: input_data}).shape == (2, 3, 5, 7)
+        with pytest.raises(ValueError):
+            DimshuffleLayer(input_layer_with_None, [0, 1, 2, 4])
+
+    def test_invalid_pattern(self, input_data, input_var, input_layer):
+        from lasagne.layers.shape import DimshuffleLayer
+        with pytest.raises(ValueError):
+            DimshuffleLayer(input_layer, ['q'])
+        with pytest.raises(ValueError):
+            DimshuffleLayer(input_layer, [0, 0, 1, 3, 4])
+        with pytest.raises(ValueError):
+            # There is no dimension 42
+            DimshuffleLayer(input_layer, [0, 1, 2, 4, 42])
+
+
+def test_slice_layer():
+    from lasagne.layers import SliceLayer, InputLayer, get_output_shape,\
+        get_output
+    from numpy.testing import assert_array_almost_equal as aeq
+    in_shp = (3, 5, 2)
+    l_inp = InputLayer(in_shp)
+    l_slice_ax0 = SliceLayer(l_inp, axis=0, indices=0)
+    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
+    l_slice_ax2 = SliceLayer(l_inp, axis=-1, indices=-1)
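+    # An integer index drops the sliced axis, while a slice() keeps it.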
+
+    x = np.arange(np.prod(in_shp)).reshape(in_shp).astype('float32')
+    x1 = x[0]
+    x2 = x[:, 3:5]
+    x3 = x[:, :, -1]
+
+    assert get_output_shape(l_slice_ax0) == x1.shape
+    assert get_output_shape(l_slice_ax1) == x2.shape
+    assert get_output_shape(l_slice_ax2) == x3.shape
+
+    aeq(get_output(l_slice_ax0, x).eval(), x1)
+    aeq(get_output(l_slice_ax1, x).eval(), x2)
+    aeq(get_output(l_slice_ax2, x).eval(), x3)
+
+    # test slicing None dimension
+    in_shp = (2, None, 2)
+    l_inp = InputLayer(in_shp)
+    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
+    assert get_output_shape(l_slice_ax1) == (2, None, 2)
+    aeq(get_output(l_slice_ax1, x).eval(), x2)
diff --git a/lasagne/tests/layers/test_special.py b/lasagne/tests/layers/test_special.py
new file mode 100644
index 0000000..c3befaa
--- /dev/null
+++ b/lasagne/tests/layers/test_special.py
@@ -0,0 +1,793 @@
+from mock import Mock
+import numpy as np
+import pytest
+import theano
+from lasagne.layers import InputLayer, standardize, get_output, get_all_params
+
+
+class TestExpressionLayer:
+    @pytest.fixture
+    def ExpressionLayer(self):
+        from lasagne.layers.special import ExpressionLayer
+        return ExpressionLayer
+
+    @pytest.fixture
+    def input_layer(self):
+        from lasagne.layers import InputLayer
+        return InputLayer((2, 3, 4, 5))
+
+    @pytest.fixture
+    def input_layer_nones(self):
+        from lasagne.layers import InputLayer
+        return InputLayer((1, None, None, 5))
+
+    def np_result(self, func, input_layer):
+        X = np.random.uniform(-1, 1, input_layer.output_shape)
+        return X, func(X)
+
+    @pytest.mark.parametrize('func',
+                             [lambda X: X**2,
+                              lambda X: X.mean(-1),
+                              lambda X: X.sum(),
+                              ])
+    def test_tuple_shape(self, func, input_layer, ExpressionLayer):
+        from lasagne.layers.helper import get_output
+
+        X, expected = self.np_result(func, input_layer)
+        layer = ExpressionLayer(input_layer, func, output_shape=expected.shape)
+        assert layer.get_output_shape_for(X.shape) == expected.shape
+
+        output = get_output(layer, X).eval()
+        assert np.allclose(output, expected)
+
+    @pytest.mark.parametrize('func',
+                             [lambda X: X**2,
+                              lambda X: X.mean(-1),
+                              lambda X: X.sum(),
+                              ])
+    def test_callable_shape(self, func, input_layer, ExpressionLayer):
+        from lasagne.layers.helper import get_output
+
+        X, expected = self.np_result(func, input_layer)
+
+        def get_shape(input_shape):
+            return func(np.empty(shape=input_shape)).shape
+
+        layer = ExpressionLayer(input_layer, func, output_shape=get_shape)
+        assert layer.get_output_shape_for(X.shape) == expected.shape
+
+        output = get_output(layer, X).eval()
+        assert np.allclose(output, expected)
+
+    @pytest.mark.parametrize('func',
+                             [lambda X: X**2,
+                              lambda X: X.mean(-1),
+                              lambda X: X.sum(),
+                              ])
+    def test_none_shape(self, func, input_layer, ExpressionLayer):
+        from lasagne.layers.helper import get_output
+
+        X, expected = self.np_result(func, input_layer)
+
+        layer = ExpressionLayer(input_layer, func, output_shape=None)
+        if X.shape == expected.shape:
+            assert layer.get_output_shape_for(X.shape) == expected.shape
+
+        output = get_output(layer, X).eval()
+        assert np.allclose(output, expected)
+
+    @pytest.mark.parametrize('func',
+                             [lambda X: X**2,
+                              lambda X: X.mean(-1),
+                              lambda X: X.sum(),
+                              ])
+    def test_auto_shape(self, func, input_layer, ExpressionLayer):
+        from lasagne.layers.helper import get_output
+
+        X, expected = self.np_result(func, input_layer)
+
+        layer = ExpressionLayer(input_layer, func, output_shape='auto')
+        assert layer.get_output_shape_for(X.shape) == expected.shape
+
+        output = get_output(layer, X).eval()
+        assert np.allclose(output, expected)
+
+    @pytest.mark.parametrize('func',
+                             [lambda X: X**2,
+                              lambda X: X.mean(-1),
+                              lambda X: X.sum(),
+                              ])
+    def test_nones_shape(self, func, input_layer_nones, ExpressionLayer):
+        input_shape = input_layer_nones.output_shape
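+        # stand in a size of 0 for unknown (None) dimensions so numpy can
+        # build a concrete array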
+        np_shape = tuple(0 if s is None else s for s in input_shape)
+        X = np.random.uniform(-1, 1, np_shape)
+        expected = func(X)
+        expected_shape = tuple(s if s else None for s in expected.shape)
+
+        layer = ExpressionLayer(input_layer_nones,
+                                func,
+                                output_shape=expected_shape)
+        assert layer.get_output_shape_for(input_shape) == expected_shape
+
+        def get_shape(input_shape):
+            return expected_shape
+        layer = ExpressionLayer(input_layer_nones,
+                                func,
+                                output_shape=get_shape)
+        assert layer.get_output_shape_for(input_shape) == expected_shape
+
+        layer = ExpressionLayer(input_layer_nones,
+                                func,
+                                output_shape='auto')
+        assert layer.get_output_shape_for(input_shape) == expected_shape
+
+
+class TestNonlinearityLayer:
+    @pytest.fixture
+    def NonlinearityLayer(self):
+        from lasagne.layers.special import NonlinearityLayer
+        return NonlinearityLayer
+
+    @pytest.fixture
+    def layer_vars(self, NonlinearityLayer, dummy_input_layer):
+        nonlinearity = Mock()
+
+        layer = NonlinearityLayer(
+            dummy_input_layer,
+            nonlinearity=nonlinearity,
+            )
+
+        return {
+            'nonlinearity': nonlinearity,
+            'layer': layer,
+            }
+
+    @pytest.fixture
+    def layer(self, layer_vars):
+        return layer_vars['layer']
+
+    def test_init_none_nonlinearity(self, NonlinearityLayer,
+                                    dummy_input_layer):
+        import lasagne.nonlinearities
+        layer = NonlinearityLayer(
+            dummy_input_layer,
+            nonlinearity=None,
+            )
+        assert layer.nonlinearity == lasagne.nonlinearities.identity
+
+    def test_get_output_for(self, layer_vars):
+        layer = layer_vars['layer']
+        nonlinearity = layer_vars['nonlinearity']
+
+        input = theano.tensor.matrix()
+        result = layer.get_output_for(input)
+        nonlinearity.assert_called_with(input)
+        assert result is nonlinearity.return_value
+
+
+class TestBiasLayer:
+    @pytest.fixture
+    def BiasLayer(self):
+        from lasagne.layers.special import BiasLayer
+        return BiasLayer
+
+    @pytest.fixture
+    def init_b(self):
+        # initializer for a tensor of unique values
+        return lambda shape: np.arange(np.prod(shape)).reshape(shape)
+
+    def test_bias_init(self, BiasLayer, init_b):
+        input_shape = (2, 3, 4)
+        # default: share biases over all but second axis
+        b = BiasLayer(input_shape, b=init_b).b
+        assert np.allclose(b.get_value(), init_b((3,)))
+        # share over first axis only
+        b = BiasLayer(input_shape, b=init_b, shared_axes=0).b
+        assert np.allclose(b.get_value(), init_b((3, 4)))
+        # share over second and third axis
+        b = BiasLayer(input_shape, b=init_b, shared_axes=(1, 2)).b
+        assert np.allclose(b.get_value(), init_b((2,)))
+        # no bias
+        b = BiasLayer(input_shape, b=None).b
+        assert b is None
+
+    def test_get_output_for(self, BiasLayer, init_b):
+        input_shape = (2, 3, 4)
+        # random input tensor
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+        # default: share biases over all but second axis
+        layer = BiasLayer(input_shape, b=init_b)
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input + init_b((1, 3, 1)))
+        # share over first axis only
+        layer = BiasLayer(input_shape, b=init_b, shared_axes=0)
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input + init_b((1, 3, 4)))
+        # share over second and third axis
+        layer = BiasLayer(input_shape, b=init_b, shared_axes=(1, 2))
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input + init_b((2, 1, 1)))
+        # no bias
+        layer = BiasLayer(input_shape, b=None)
+        assert layer.get_output_for(input) is input
+
+    def test_undefined_shape(self, BiasLayer):
+        # should work:
+        BiasLayer((64, None, 3), shared_axes=(1, 2))
+        # should not work:
+        with pytest.raises(ValueError) as exc:
+            BiasLayer((64, None, 3), shared_axes=(0, 2))
+        assert 'needs specified input sizes' in exc.value.args[0]
+
+
+class TestScaleLayer:
+    @pytest.fixture
+    def ScaleLayer(self):
+        from lasagne.layers.special import ScaleLayer
+        return ScaleLayer
+
+    @pytest.fixture
+    def init_scales(self):
+        # initializer for a tensor of unique values
+        return lambda shape: np.arange(np.prod(shape)).reshape(shape)
+
+    def test_scales_init(self, ScaleLayer, init_scales):
+        input_shape = (2, 3, 4)
+        # default: share scales over all but second axis
+        b = ScaleLayer(input_shape, scales=init_scales).scales
+        assert np.allclose(b.get_value(), init_scales((3,)))
+        # share over first axis only
+        b = ScaleLayer(input_shape, scales=init_scales, shared_axes=0).scales
+        assert np.allclose(b.get_value(), init_scales((3, 4)))
+        # share over second and third axis
+        b = ScaleLayer(
+            input_shape, scales=init_scales, shared_axes=(1, 2)).scales
+        assert np.allclose(b.get_value(), init_scales((2,)))
+
+    def test_get_output_for(self, ScaleLayer, init_scales):
+        input_shape = (2, 3, 4)
+        # random input tensor
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+        # default: share scales over all but second axis
+        layer = ScaleLayer(input_shape, scales=init_scales)
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input * init_scales((1, 3, 1)))
+        # share over first axis only
+        layer = ScaleLayer(input_shape, scales=init_scales, shared_axes=0)
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input * init_scales((1, 3, 4)))
+        # share over second and third axis
+        layer = ScaleLayer(input_shape, scales=init_scales, shared_axes=(1, 2))
+        assert np.allclose(layer.get_output_for(input).eval(),
+                           input * init_scales((2, 1, 1)))
+
+    def test_undefined_shape(self, ScaleLayer):
+        # should work:
+        ScaleLayer((64, None, 3), shared_axes=(1, 2))
+        # should not work:
+        with pytest.raises(ValueError) as exc:
+            ScaleLayer((64, None, 3), shared_axes=(0, 2))
+        assert 'needs specified input sizes' in exc.value.args[0]
+
+
+def test_standardize():
+    # Simple example
+    X = np.random.standard_normal((1000, 20)).astype(theano.config.floatX)
+    l_in = InputLayer((None, 20))
+    l_std = standardize(
+        l_in, X.min(axis=0), (X.max(axis=0) - X.min(axis=0)), shared_axes=0)
+    out = get_output(l_std).eval({l_in.input_var: X})
+    assert np.allclose(out.max(axis=0), 1.)
+    assert np.allclose(out.min(axis=0), 0.)
+    assert len(get_all_params(l_std)) == 2
+    # More complicated example
+    X = np.random.standard_normal(
+        (50, 3, 100, 10)).astype(theano.config.floatX)
+    mean = X.mean(axis=(0, 2))
+    std = X.std(axis=(0, 2))
+    l_in = InputLayer((None, 3, None, 10))
+    l_std = standardize(l_in, mean, std, shared_axes=(0, 2))
+    out = get_output(l_std).eval({l_in.input_var: X})
+    assert np.allclose(out.mean(axis=(0, 2)), 0., atol=1e-5)
+    assert np.allclose(out.std((0, 2)), 1., atol=1e-5)
+
+
+class TestInverseLayer:
+    @pytest.fixture
+    def invlayer_vars(self):
+        from lasagne.layers.dense import DenseLayer
+        from lasagne.layers.input import InputLayer
+        from lasagne.layers.special import InverseLayer
+        from lasagne.nonlinearities import identity
+
+        l_in = InputLayer(shape=(10, 12))
+
+        layer = DenseLayer(
+            l_in,
+            num_units=3,
+            b=None,
+            nonlinearity=identity,
+        )
+
+        invlayer = InverseLayer(
+            incoming=layer,
+            layer=layer
+        )
+
+        return {
+            'layer': layer,
+            'invlayer': invlayer,
+        }
+
+    def test_init(self, invlayer_vars):
+        layer = invlayer_vars['layer']
+        invlayer = invlayer_vars['invlayer']
+        # Check that the output shape of the invlayer is the same
+        # as the input shape of the layer
+        assert layer.input_shape == invlayer.output_shape
+
+    def test_get_output_shape_for(self, invlayer_vars):
+        invlayer = invlayer_vars['invlayer']
+        assert invlayer.get_output_shape_for(
+            [(34, 55, 89, 144), (5, 8, 13, 21), (1, 1, 2, 3)]) == (1, 1, 2, 3)
+
+    def test_get_output_for(self, invlayer_vars):
+        from lasagne.layers.helper import get_output
+        invlayer = invlayer_vars['invlayer']
+        layer = invlayer_vars['layer']
+        W = layer.W.get_value()
+        input = theano.shared(
+            np.random.rand(*layer.input_shape))
+        results = get_output(invlayer, inputs=input)
+
+        # Check that the output of the invlayer is the output of the
+        # dot product of the output of the dense layer and the
+        # transposed weights
+        assert np.allclose(
+            results.eval(), np.dot(np.dot(input.get_value(), W), W.T))
+
+
+class TestTransformLayer():
+
+    def test_transform_affine_errors(self):
+        import lasagne
+        with pytest.raises(ValueError):
+            l_in_a = lasagne.layers.InputLayer((None, 3, 28, 28))
+            l_loc_a = lasagne.layers.DenseLayer(l_in_a, num_units=5)
+            l_trans = lasagne.layers.TransformerLayer(l_in_a, l_loc_a)
+        with pytest.raises(ValueError):
+            l_in_b = lasagne.layers.InputLayer((3, 28, 28))
+            l_loc_b = lasagne.layers.DenseLayer(l_in_b, num_units=6)
+            l_trans = lasagne.layers.TransformerLayer(l_in_b, l_loc_b)
+
+    def test_transform_affine_downsample(self):
+        import lasagne
+        downsample = (0.7, 2.3)
+        x = np.random.random((10, 3, 28, 28)).astype('float32')
+        x_sym = theano.tensor.tensor4()
+
+        # create transformer with fixed input size
+        l_in = lasagne.layers.InputLayer((None, 3, 28, 28))
+        l_loc = lasagne.layers.DenseLayer(l_in, num_units=6)
+        l_trans = lasagne.layers.TransformerLayer(
+                l_in, l_loc, downsample_factor=downsample)
+
+        # check that shape propagation works
+        assert l_trans.output_shape[0] is None
+        assert l_trans.output_shape[1:] == (3, int(28 / .7), int(28 / 2.3))
+
+        # check that data propagation works
+        output = lasagne.layers.get_output(l_trans, x_sym)
+        x_out = output.eval({x_sym: x})
+        assert x_out.shape[0] == x.shape[0]
+        assert x_out.shape[1:] == l_trans.output_shape[1:]
+
+        # create transformer with variable input size
+        l_in = lasagne.layers.InputLayer((None, 3, None, 28))
+        l_loc = lasagne.layers.DenseLayer(
+                lasagne.layers.ReshapeLayer(l_in, ([0], 3*28*28)),
+                num_units=6, W=l_loc.W, b=l_loc.b)
+        l_trans = lasagne.layers.TransformerLayer(
+                l_in, l_loc, downsample_factor=downsample)
+
+        # check that shape propagation works
+        assert l_trans.output_shape[0] is None
+        assert l_trans.output_shape[1] == 3
+        assert l_trans.output_shape[2] is None
+        assert l_trans.output_shape[3] == int(28 / 2.3)
+
+        # check that data propagation works
+        output = lasagne.layers.get_output(l_trans, x_sym)
+        x_out2 = output.eval({x_sym: x})
+        assert x_out2.shape == x_out.shape
+        np.testing.assert_allclose(x_out2, x_out, rtol=1e-5, atol=1e-5)
+
+    def test_transform_affine_identity(self):
+        from lasagne.layers import InputLayer, TransformerLayer
+        from lasagne.utils import floatX
+        from theano.tensor import constant
+        batchsize = 10
+        l_in = InputLayer((batchsize, 3, 28, 28))
+        l_loc = InputLayer((batchsize, 6))
+        layer = TransformerLayer(l_in, l_loc)
+        inputs = floatX(np.arange(np.prod(l_in.shape)).reshape(l_in.shape))
+        thetas = floatX(np.tile([1, 0, 0, 0, 1, 0], (batchsize, 1)))
+        outputs = layer.get_output_for([constant(inputs),
+                                        constant(thetas)]).eval()
+        np.testing.assert_allclose(inputs, outputs, rtol=1e-6)
+
+
+class TestTPSTransformLayer():
+
+    def test_transform_thin_plate_spline_errors(self):
+        import lasagne
+
+        # Check that number of inputs matches 2*num_control_points
+        with pytest.raises(ValueError):
+            num_control_points = 16
+            l_in_a = lasagne.layers.InputLayer((None, 3, 28, 28))
+            l_loc_a = lasagne.layers.DenseLayer(l_in_a,
+                                                num_units=3*num_control_points)
+            l_trans = lasagne.layers.TPSTransformerLayer(
+                    l_in_a, l_loc_a, control_points=num_control_points)
+
+        # Check that error is raised when precompute_grid is set to True
+        # with unknown input size
+        with pytest.raises(ValueError):
+            l_in = lasagne.layers.InputLayer((None, 3, None, 28))
+            l_loc = lasagne.layers.DenseLayer(
+                    lasagne.layers.ReshapeLayer(l_in, ([0], 3*28*28)),
+                    num_units=32)
+            l_trans = lasagne.layers.TPSTransformerLayer(l_in, l_loc,
+                                                         precompute_grid=True)
+
+        # Check that input is right size
+        with pytest.raises(ValueError):
+            l_in_b = lasagne.layers.InputLayer((3, 28, 28))
+            l_loc_b = lasagne.layers.DenseLayer(l_in_b, num_units=6)
+            l_trans = lasagne.layers.TPSTransformerLayer(l_in_b, l_loc_b)
+
+        # Check that number of control points is a perfect square
+        with pytest.raises(ValueError):
+            num_control_points = 17
+            l_in_a = lasagne.layers.InputLayer((None, 3, 28, 28))
+            l_loc_a = lasagne.layers.DenseLayer(l_in_a,
+                                                num_units=2*num_control_points)
+            l_trans = lasagne.layers.TPSTransformerLayer(
+                    l_in_a, l_loc_a, control_points=num_control_points)
+
+        # Check that the input shape is correct
+        with pytest.raises(ValueError):
+            num_control_points = 16
+            l_in_b = lasagne.layers.InputLayer((3, 28, 28))
+            l_loc_b = lasagne.layers.DenseLayer(
+                    l_in_b, num_units=2*num_control_points
+            )
+            l_trans = lasagne.layers.TPSTransformerLayer(l_in_b, l_loc_b)
+
+    def test_transform_thin_plate_spline_variable_input(self):
+        import lasagne
+        from lasagne.utils import floatX
+        from theano.tensor import constant
+
+        x = np.random.random((10, 3, 28, 28)).astype('float32')
+        x_sym = theano.tensor.tensor4()
+
+        l_in = lasagne.layers.InputLayer((None, 3, None, 28))
+        l_loc = lasagne.layers.DenseLayer(
+                lasagne.layers.ReshapeLayer(l_in, ([0], 3*28*28)),
+                num_units=32)
+        l_trans = lasagne.layers.TPSTransformerLayer(
+                l_in, l_loc, precompute_grid='auto')
+
+        # check that shape propagation works
+        assert l_trans.output_shape[0] is None
+        assert l_trans.output_shape[1] == 3
+        assert l_trans.output_shape[2] is None
+        assert l_trans.output_shape[3] == 28
+
+        # check that data propagation works
+        dest_offset = np.zeros(shape=(10, 32))
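+        # a zero offset leaves the control points in place, so the transformer
+        # should reproduce its input (up to interpolation error)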
+        inputs = floatX(np.arange(np.prod(x.shape)).reshape(x.shape))
+        outputs = l_trans.get_output_for([constant(inputs),
+                                          constant(dest_offset)]).eval()
+        np.testing.assert_allclose(inputs, outputs, atol=5e-4)
+
+    def test_transform_thin_plate_spline_downsample(self):
+        import lasagne
+        downsample = (0.7, 2.3)
+        x = np.random.random((10, 3, 28, 28)).astype('float32')
+        x_sym = theano.tensor.tensor4()
+
+        # create transformer with fixed input size
+        l_in = lasagne.layers.InputLayer((None, 3, 28, 28))
+        l_loc = lasagne.layers.DenseLayer(l_in, num_units=32)
+        l_trans = lasagne.layers.TPSTransformerLayer(
+                l_in, l_loc, downsample_factor=downsample,
+                precompute_grid=False
+        )
+
+        # check that shape propagation works
+        assert l_trans.output_shape[0] is None
+        assert l_trans.output_shape[1:] == (3, int(28 / .7), int(28 / 2.3))
+
+        # check that data propagation works
+        output = lasagne.layers.get_output(l_trans, x_sym)
+        x_out = output.eval({x_sym: x})
+        assert x_out.shape[0] == x.shape[0]
+        assert x_out.shape[1:] == l_trans.output_shape[1:]
+
+        # create transformer with variable input size
+        l_in = lasagne.layers.InputLayer((None, 3, None, 28))
+        l_loc = lasagne.layers.DenseLayer(
+                lasagne.layers.ReshapeLayer(l_in, ([0], 3*28*28)),
+                num_units=32, W=l_loc.W, b=l_loc.b)
+        l_trans = lasagne.layers.TPSTransformerLayer(
+                l_in, l_loc, downsample_factor=downsample,
+                precompute_grid=False
+        )
+
+        # check that shape propagation works
+        assert l_trans.output_shape[0] is None
+        assert l_trans.output_shape[1] == 3
+        assert l_trans.output_shape[2] is None
+        assert l_trans.output_shape[3] == int(28 / 2.3)
+
+        # check that data propagation works
+        output = lasagne.layers.get_output(l_trans, x_sym)
+        x_out2 = output.eval({x_sym: x})
+        assert x_out2.shape == x_out.shape
+        np.testing.assert_allclose(x_out2, x_out, rtol=1e-5, atol=1e-5)
+
+    def test_transform_thin_plate_spline_identity(self):
+        from lasagne.layers import InputLayer, TPSTransformerLayer
+        from lasagne.utils import floatX
+        from theano.tensor import constant
+        batchsize = 5
+        num_control_points = 16
+        dest_offset = np.zeros(shape=(batchsize, 2*num_control_points))
+        l_in = InputLayer((batchsize, 3, 28, 28))
+        l_loc = InputLayer((batchsize, 2*num_control_points))
+        layer = TPSTransformerLayer(
+                l_in, l_loc, control_points=num_control_points
+        )
+        inputs = floatX(np.arange(np.prod(l_in.shape)).reshape(l_in.shape))
+        outputs = layer.get_output_for([constant(inputs),
+                                        constant(dest_offset)]).eval()
+        np.testing.assert_allclose(inputs, outputs, atol=5e-4)
+
+    def test_transform_thin_plate_spline_shift(self):
+        from lasagne.layers import InputLayer, TPSTransformerLayer
+        from theano.tensor import constant
+        batchsize = 5
+        num_control_points = 16
+        dest_offset = np.ones(shape=(batchsize, 2*num_control_points))
+        l_in = InputLayer((batchsize, 3, 28, 28))
+        l_loc = InputLayer((batchsize, 2*num_control_points))
+        layer = TPSTransformerLayer(
+                l_in, l_loc, control_points=num_control_points
+        )
+        image = np.zeros(shape=(28, 28))
+        image[[0, -1], :] = 1
+        image[:, [0, -1]] = 1
+        inputs = np.tile(image, (batchsize, 3, 1, 1))
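+        # expected output: the border frame shifted by the constant offset;
+        # the fractional values at the seam presumably come from bilinear
+        # interpolation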
+        shifted_input = np.ones(shape=(28, 28))
+        shifted_input[:13, :13] = 0
+        shifted_input[13, :13] = 0.50000271
+        shifted_input[:13, 13] = 0.50000271
+        shifted_input[13, 13] = 0.75000271
+        shifted_input = np.tile(shifted_input, (batchsize, 3, 1, 1))
+        outputs = layer.get_output_for([constant(inputs),
+                                        constant(dest_offset)]).eval()
+        np.testing.assert_allclose(shifted_input,
+                                   outputs, atol=1e-5)
+
+
+class TestParametricRectifierLayer:
+    @pytest.fixture
+    def ParametricRectifierLayer(self):
+        from lasagne.layers.special import ParametricRectifierLayer
+        return ParametricRectifierLayer
+
+    @pytest.fixture
+    def init_alpha(self):
+        # initializer for a tensor of unique values
+        return lambda shape: (np.arange(np.prod(shape)).reshape(shape)) \
+            / np.prod(shape)
+
+    def test_alpha_init(self, ParametricRectifierLayer, init_alpha):
+        input_shape = (None, 3, 28, 28)
+        # default: alphas only over 2nd axis
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha)
+        alpha = layer.alpha
+        assert layer.shared_axes == (0, 2, 3)
+        assert alpha.get_value().shape == (3, )
+        assert np.allclose(alpha.get_value(), init_alpha((3, )))
+
+        # scalar alpha
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha,
+                                         shared_axes='all')
+        alpha = layer.alpha
+        assert layer.shared_axes == (0, 1, 2, 3)
+        assert alpha.get_value().shape == ()
+        assert np.allclose(alpha.get_value(), init_alpha((1,)))
+
+        # alphas shared over the 1st axis
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha,
+                                         shared_axes=0)
+        alpha = layer.alpha
+        assert layer.shared_axes == (0,)
+        assert alpha.get_value().shape == (3, 28, 28)
+        assert np.allclose(alpha.get_value(), init_alpha((3, 28, 28)))
+
+        # alphas shared over the 1st and 4th axes
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha,
+                                         shared_axes=(0, 3))
+        alpha = layer.alpha
+        assert layer.shared_axes == (0, 3)
+        assert alpha.get_value().shape == (3, 28)
+        assert np.allclose(alpha.get_value(), init_alpha((3, 28)))
+
+    def test_undefined_shape(self, ParametricRectifierLayer):
+        with pytest.raises(ValueError):
+            ParametricRectifierLayer((None, 3, 28, 28), shared_axes=(1, 2, 3))
+
+    def test_get_output_for(self, ParametricRectifierLayer, init_alpha):
+        input_shape = (3, 3, 28, 28)
+        # random input tensor
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+
+        # default: alphas shared over all axes but the 2nd
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha)
+        alpha_v = layer.alpha.get_value()
+        expected = np.maximum(input, 0) + np.minimum(input, 0) * \
+            alpha_v[None, :, None, None]
+        assert np.allclose(layer.get_output_for(input).eval(), expected)
+
+        # scalar alpha
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha,
+                                         shared_axes='all')
+        alpha_v = layer.alpha.get_value()
+        expected = np.maximum(input, 0) + np.minimum(input, 0) * alpha_v
+        assert np.allclose(layer.get_output_for(input).eval(), expected)
+
+        # alphas shared over the 1st axis
+        layer = ParametricRectifierLayer(input_shape, alpha=init_alpha,
+                                         shared_axes=0)
+        alpha_v = layer.alpha.get_value()
+        expected = np.maximum(input, 0) + np.minimum(input, 0) * \
+            alpha_v[None, :, :, :]
+        assert np.allclose(layer.get_output_for(input).eval(), expected)
+
+        # alphas shared over the 1st and 4th axes
+        layer = ParametricRectifierLayer(input_shape, shared_axes=(0, 3),
+                                         alpha=init_alpha)
+        alpha_v = layer.alpha.get_value()
+        expected = np.maximum(input, 0) + np.minimum(input, 0) * \
+            alpha_v[None, :, :, None]
+        assert np.allclose(layer.get_output_for(input).eval(), expected)
+
+    def test_prelu(self, init_alpha):
+        import lasagne
+        input_shape = (3, 28)
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+
+        l_in = lasagne.layers.input.InputLayer(input_shape)
+        l_dense = lasagne.layers.dense.DenseLayer(l_in, num_units=100)
+        l_prelu = lasagne.layers.prelu(l_dense, alpha=init_alpha)
+        output = lasagne.layers.get_output(l_prelu, input)
+
+        assert l_dense.nonlinearity == lasagne.nonlinearities.identity
+
+        W = l_dense.W.get_value()
+        b = l_dense.b.get_value()
+        alpha_v = l_prelu.alpha.get_value()
+        expected = np.dot(input, W) + b
+        expected = np.maximum(expected, 0) + \
+            np.minimum(expected, 0) * alpha_v
+        assert np.allclose(output.eval(), expected)
+
+
+class TestRandomizedRectifierLayer:
+    @pytest.fixture
+    def RandomizedRectifierLayer(self):
+        from lasagne.layers.special import RandomizedRectifierLayer
+        return RandomizedRectifierLayer
+
+    def test_high_low(self, RandomizedRectifierLayer):
+        with pytest.raises(ValueError):
+            RandomizedRectifierLayer((None, 3, 28, 28), lower=0.9, upper=0.1)
+
+    def test_nomod_positive(self, RandomizedRectifierLayer):
+        input = np.ones((3, 3, 28, 28)).astype(theano.config.floatX)
+        layer = RandomizedRectifierLayer(input.shape)
+        out = layer.get_output_for(input).eval()
+        assert np.allclose(out, 1.0)
+
+    def test_low_eq_high(self, RandomizedRectifierLayer):
+        input = np.ones((3, 3, 28, 28)).astype(theano.config.floatX) * -1
+        layer = RandomizedRectifierLayer(input.shape, lower=0.5, upper=0.5)
+        out = layer.get_output_for(theano.tensor.constant(input)).eval()
+        assert np.allclose(out, -0.5)
+
+    def test_deterministic(self, RandomizedRectifierLayer):
+        input = np.ones((3, 3, 28, 28)).astype(theano.config.floatX) * -1
+        layer = RandomizedRectifierLayer(input.shape, lower=0.4, upper=0.6)
+        out = layer.get_output_for(theano.tensor.constant(input),
+                                   deterministic=True).eval()
+        assert np.allclose(out, -0.5)
+
+    def test_dim_None(self, RandomizedRectifierLayer):
+        import lasagne
+        l_in = lasagne.layers.input.InputLayer((None, 3, 28, 28))
+        layer = RandomizedRectifierLayer(l_in)
+        input = np.ones((3, 3, 28, 28)).astype(theano.config.floatX)
+        out = layer.get_output_for(input).eval()
+        assert np.allclose(out, 1.0)
+
+    def assert_between(self, layer, input, output):
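+        # for negative inputs the output is input * slope; recover the
+        # sampled slopes and check that they lie in [lower, upper] and vary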
+        slopes = output / input
+        slopes = slopes[input < 0]
+        assert slopes.min() >= layer.lower
+        assert slopes.max() <= layer.upper
+        assert slopes.var() > 0
+
+    def test_get_output_for(self, RandomizedRectifierLayer):
+        input_shape = (3, 3, 28, 28)
+
+        # ensure the sampled slopes stay within [lower, upper]
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+        layer = RandomizedRectifierLayer(input_shape, shared_axes=0)
+        self.assert_between(layer, input, layer.get_output_for(input).eval())
+
+        # from here on, we want to check parameter sharing
+        # this is easier to check if the input is all ones
+        input = np.ones(input_shape).astype(theano.config.floatX) * -1
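+        # a constant (zero-variance) output along an axis then means the
+        # sampled slope is shared along that axis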
+
+        # default: parameters shared along all but 2nd axis
+        layer = RandomizedRectifierLayer(input_shape)
+        out = layer.get_output_for(input).eval()
+        assert [
+                np.allclose(out.var(axis=a), 0)
+                for a in range(4)
+               ] == [True, False, True, True]
+
+        # share across all axes (single slope)
+        layer = RandomizedRectifierLayer(input_shape, shared_axes='all')
+        out = layer.get_output_for(input).eval()
+        assert [
+                np.allclose(out.var(axis=a), 0)
+                for a in range(4)
+               ] == [True, True, True, True]
+
+        # share across 1st axis
+        layer = RandomizedRectifierLayer(input_shape, shared_axes=0)
+        out = layer.get_output_for(input).eval()
+        assert [
+                np.allclose(out.var(axis=a), 0)
+                for a in range(4)
+               ] == [True, False, False, False]
+
+        # share across 1st and 4th axes
+        layer = RandomizedRectifierLayer(input_shape, shared_axes=(0, 3))
+        out = layer.get_output_for(input).eval()
+        assert [
+                np.allclose(out.var(axis=a), 0)
+                for a in range(4)
+               ] == [True, False, False, True]
+
+    def test_rrelu(self):
+        import lasagne
+        input_shape = (3, 28)
+        input = np.random.randn(*input_shape).astype(theano.config.floatX)
+
+        l_in = lasagne.layers.input.InputLayer(input_shape)
+        l_dense = lasagne.layers.dense.DenseLayer(l_in, num_units=100)
+        l_rrelu = lasagne.layers.rrelu(l_dense)
+        output = lasagne.layers.get_output(l_rrelu, input)
+
+        assert l_dense.nonlinearity == lasagne.nonlinearities.identity
+
+        W = l_dense.W.get_value()
+        b = l_dense.b.get_value()
+        self.assert_between(l_rrelu, np.dot(input, W) + b, output.eval())
diff --git a/lasagne/tests/test_examples.py b/lasagne/tests/test_examples.py
new file mode 100644
index 0000000..3d64c44
--- /dev/null
+++ b/lasagne/tests/test_examples.py
@@ -0,0 +1,38 @@
+from glob import glob
+from importlib import import_module
+from os.path import basename
+from os.path import dirname
+from os.path import join
+from os.path import splitext
+import sys
+
+import pytest
+
+
+EXAMPLES_DIR = join(dirname(dirname(dirname(__file__))), 'examples')
+
+
+def _example_modules():
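+    # collect the module names of all example scripts in examples/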
+    paths = glob(join(EXAMPLES_DIR, "*py"))
+    return [splitext(basename(path))[0] for path in paths]
+
+
+@pytest.fixture
+def example(request):
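+    # temporarily put the examples directory on sys.path so the example
+    # modules can be imported by name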
+    sys.path.insert(0, EXAMPLES_DIR)
+    request.addfinalizer(lambda: sys.path.remove(EXAMPLES_DIR))
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("module_name", _example_modules())
+def test_example(example, module_name):
+    try:
+        main = getattr(import_module(module_name), 'main')
+    except ImportError as e:
+        skip_exceptions = ["requires a GPU", "pylearn2", "dnn not available"]
+        if any([text in str(e) for text in skip_exceptions]):
+            pytest.skip(str(e))
+        else:
+            raise
+
+    main(num_epochs=1)  # run the example for one iteration
diff --git a/lasagne/tests/test_init.py b/lasagne/tests/test_init.py
new file mode 100644
index 0000000..2933014
--- /dev/null
+++ b/lasagne/tests/test_init.py
@@ -0,0 +1,351 @@
+import pytest
+
+
+def test_initializer_sample():
+    from lasagne.init import Initializer
+
+    with pytest.raises(NotImplementedError):
+        Initializer().sample((100, 100))
+
+
+def test_shape():
+    from lasagne.init import Initializer
+
+    # Assert that all `Initializer` subclasses return the shape that
+    # we've asked for in `sample`:
+    for klass in Initializer.__subclasses__():
+        if len(klass.__subclasses__()):
+            # check HeNormal, HeUniform, GlorotNormal, GlorotUniform
+            for sub_klass in klass.__subclasses__():
+                assert sub_klass().sample((12, 23)).shape == (12, 23)
+        else:
+            assert klass().sample((12, 23)).shape == (12, 23)
+
+
+def test_specified_rng():
+    from lasagne.random import get_rng, set_rng
+    from lasagne.init import (Normal, Uniform, GlorotNormal,
+                              GlorotUniform, Sparse, Orthogonal)
+
+    from numpy.random import RandomState
+    from numpy import allclose
+
+    seed = 123456789
+    rng = get_rng()
+
+    for init_class in [Normal, Uniform, GlorotNormal,
+                       GlorotUniform, Sparse, Orthogonal]:
+        set_rng(RandomState(seed))
+        sample1 = init_class().sample((100, 100))
+        set_rng(RandomState(seed))
+        sample2 = init_class().sample((100, 100))
+        set_rng(rng)  # reset to original RNG for other tests
+        assert allclose(sample1, sample2),\
+            ("random initialization was inconsistent for {}"
+             .format(init_class.__name__))
+
+
+def test_normal():
+    from lasagne.init import Normal
+
+    sample = Normal().sample((100, 200))
+    assert -0.001 < sample.mean() < 0.001
+    assert 0.009 < sample.std() < 0.011
+
+
+def test_uniform_range_as_number():
+    from lasagne.init import Uniform
+
+    sample = Uniform(1.0).sample((300, 400))
+    assert sample.shape == (300, 400)
+    assert -1.0 <= sample.min() < -0.9
+    assert 0.9 < sample.max() <= 1.0
+
+
+def test_uniform_range_as_range():
+    from lasagne.init import Uniform
+
+    sample = Uniform((0.0, 1.0)).sample((300, 400))
+    assert sample.shape == (300, 400)
+    assert 0.0 <= sample.min() < 0.1
+    assert 0.9 < sample.max() <= 1.0
+
+
+def test_uniform_mean_std():
+    from lasagne.init import Uniform
+    sample = Uniform(std=1.0, mean=5.0).sample((300, 400))
+    assert 4.9 < sample.mean() < 5.1
+    assert 0.9 < sample.std() < 1.1
+
+
+def test_glorot_normal():
+    from lasagne.init import GlorotNormal
+
+    sample = GlorotNormal().sample((100, 100))
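+    # Glorot: std = gain * sqrt(2 / (fan_in + fan_out)) = sqrt(2 / 200) = 0.1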
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_glorot_1d_not_supported():
+    from lasagne.init import GlorotNormal
+
+    with pytest.raises(RuntimeError):
+        GlorotNormal().sample((100,))
+
+
+def test_glorot_normal_receptive_field():
+    from lasagne.init import GlorotNormal
+
+    sample = GlorotNormal().sample((50, 50, 2))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_glorot_normal_gain():
+    from lasagne.init import GlorotNormal
+
+    sample = GlorotNormal(gain=10.0).sample((100, 100))
+    assert -0.1 < sample.mean() < 0.1
+    assert 0.9 < sample.std() < 1.1
+
+    sample = GlorotNormal(gain='relu').sample((100, 100))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.132 < sample.std() < 0.152
+
+
+def test_glorot_normal_c01b():
+    from lasagne.init import GlorotNormal
+
+    sample = GlorotNormal(c01b=True).sample((25, 2, 2, 25))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_glorot_normal_c01b_4d_only():
+    from lasagne.init import GlorotNormal
+
+    with pytest.raises(RuntimeError):
+        GlorotNormal(c01b=True).sample((100,))
+
+    with pytest.raises(RuntimeError):
+        GlorotNormal(c01b=True).sample((100, 100))
+
+    with pytest.raises(RuntimeError):
+        GlorotNormal(c01b=True).sample((100, 100, 100))
+
+
+def test_glorot_uniform():
+    from lasagne.init import GlorotUniform
+
+    sample = GlorotUniform().sample((150, 450))
+    assert -0.1 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.1
+
+
+def test_glorot_uniform_receptive_field():
+    from lasagne.init import GlorotUniform
+
+    sample = GlorotUniform().sample((150, 150, 2))
+    assert -0.10 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.10
+
+
+def test_glorot_uniform_gain():
+    from lasagne.init import GlorotUniform
+
+    sample = GlorotUniform(gain=10.0).sample((150, 450))
+    assert -1.0 <= sample.min() < -0.9
+    assert 0.9 < sample.max() <= 1.0
+
+    sample = GlorotUniform(gain='relu').sample((100, 100))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.132 < sample.std() < 0.152
+
+
+def test_glorot_uniform_c01b():
+    from lasagne.init import GlorotUniform
+
+    sample = GlorotUniform(c01b=True).sample((75, 2, 2, 75))
+    assert -0.1 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.1
+
+
+def test_glorot_uniform_c01b_4d_only():
+    from lasagne.init import GlorotUniform
+
+    with pytest.raises(RuntimeError):
+        GlorotUniform(c01b=True).sample((100,))
+
+    with pytest.raises(RuntimeError):
+        GlorotUniform(c01b=True).sample((100, 100))
+
+    with pytest.raises(RuntimeError):
+        GlorotUniform(c01b=True).sample((100, 100, 100))
+
+
+def test_he_normal():
+    from lasagne.init import HeNormal
+
+    sample = HeNormal().sample((100, 100))
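+    # He: std = gain * sqrt(1 / fan_in) = sqrt(1 / 100) = 0.1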
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_he_1d_not_supported():
+    from lasagne.init import HeNormal
+
+    with pytest.raises(RuntimeError):
+        HeNormal().sample((100,))
+
+
+def test_he_normal_receptive_field():
+    from lasagne.init import HeNormal
+
+    sample = HeNormal().sample((50, 50, 2))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_he_normal_gain():
+    from lasagne.init import HeNormal
+
+    sample = HeNormal(gain=10.0).sample((100, 100))
+    assert -0.1 < sample.mean() < 0.1
+    assert 0.9 < sample.std() < 1.1
+
+    sample = HeNormal(gain='relu').sample((200, 50))
+    assert -0.1 < sample.mean() < 0.1
+    assert 0.07 < sample.std() < 0.12
+
+
+def test_he_normal_c01b():
+    from lasagne.init import HeNormal
+
+    sample = HeNormal(c01b=True).sample((25, 2, 2, 25))
+    assert -0.01 < sample.mean() < 0.01
+    assert 0.09 < sample.std() < 0.11
+
+
+def test_he_normal_c01b_4d_only():
+    from lasagne.init import HeNormal
+
+    with pytest.raises(RuntimeError):
+        HeNormal(c01b=True).sample((100,))
+
+    with pytest.raises(RuntimeError):
+        HeNormal(c01b=True).sample((100, 100))
+
+    with pytest.raises(RuntimeError):
+        HeNormal(c01b=True).sample((100, 100, 100))
+
+
+def test_he_uniform():
+    from lasagne.init import HeUniform
+
+    sample = HeUniform().sample((300, 200))
+    assert -0.1 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.1
+
+
+def test_he_uniform_receptive_field():
+    from lasagne.init import HeUniform
+
+    sample = HeUniform().sample((150, 150, 2))
+    assert -0.10 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.10
+
+
+def test_he_uniform_gain():
+    from lasagne.init import HeUniform
+
+    sample = HeUniform(gain=10.0).sample((300, 200))
+    assert -1.0 <= sample.min() < -0.9
+    assert 0.9 < sample.max() <= 1.0
+
+    sample = HeUniform(gain='relu').sample((100, 100))
+    assert -0.1 < sample.mean() < 0.1
+    assert 0.1 < sample.std() < 0.2
+
+
+def test_he_uniform_c01b():
+    from lasagne.init import HeUniform
+
+    sample = HeUniform(c01b=True).sample((75, 2, 2, 75))
+    assert -0.1 <= sample.min() < -0.09
+    assert 0.09 < sample.max() <= 0.1
+
+
+def test_he_uniform_c01b_4d_only():
+    from lasagne.init import HeUniform
+
+    with pytest.raises(RuntimeError):
+        HeUniform(c01b=True).sample((100,))
+
+    with pytest.raises(RuntimeError):
+        HeUniform(c01b=True).sample((100, 100))
+
+    with pytest.raises(RuntimeError):
+        HeUniform(c01b=True).sample((100, 100, 100))
+
+
+def test_constant():
+    from lasagne.init import Constant
+
+    sample = Constant(1.0).sample((10, 20))
+    assert (sample == 1.0).all()
+
+
+def test_sparse():
+    from lasagne.init import Sparse
+
+    sample = Sparse(sparsity=0.1).sample((10, 20))
+    assert (sample != 0.0).sum() == (10 * 20) * 0.1
+
+
+def test_sparse_1d_not_supported():
+    from lasagne.init import Sparse
+
+    with pytest.raises(RuntimeError):
+        Sparse().sample((100,))
+
+
+def test_orthogonal():
+    import numpy as np
+    from lasagne.init import Orthogonal
+
+    sample = Orthogonal().sample((100, 200))
+    assert np.allclose(np.dot(sample, sample.T), np.eye(100), atol=1e-6)
+
+    sample = Orthogonal().sample((200, 100))
+    assert np.allclose(np.dot(sample.T, sample), np.eye(100), atol=1e-6)
+
+
+def test_orthogonal_gain():
+    import numpy as np
+    from lasagne.init import Orthogonal
+
+    gain = 2
+    sample = Orthogonal(gain).sample((100, 200))
+    assert np.allclose(np.dot(sample, sample.T), gain * gain * np.eye(100),
+                       atol=1e-6)
+
+    gain = np.sqrt(2)
+    sample = Orthogonal('relu').sample((100, 200))
+    assert np.allclose(np.dot(sample, sample.T), gain * gain * np.eye(100),
+                       atol=1e-6)
+
+
+def test_orthogonal_multi():
+    import numpy as np
+    from lasagne.init import Orthogonal
+
+    sample = Orthogonal().sample((100, 50, 80))
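+    # for more than two dimensions, the trailing dimensions are flattened
+    # and orthogonality is checked on the resulting (100, 50*80) matrix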
+    sample = sample.reshape(100, 50*80)
+    assert np.allclose(np.dot(sample, sample.T), np.eye(100), atol=1e-6)
+
+
+def test_orthogonal_1d_not_supported():
+    from lasagne.init import Orthogonal
+
+    with pytest.raises(RuntimeError):
+        Orthogonal().sample((100,))
diff --git a/lasagne/tests/test_nonlinearities.py b/lasagne/tests/test_nonlinearities.py
new file mode 100644
index 0000000..3d906c1
--- /dev/null
+++ b/lasagne/tests/test_nonlinearities.py
@@ -0,0 +1,69 @@
+import pytest
+import numpy as np
+import theano.tensor as T
+
+
+class TestNonlinearities(object):
+    def linear(self, x):
+        return x
+
+    def rectify(self, x):
+        return x * (x > 0)
+
+    def leaky_rectify(self, x):
+        return x * (x > 0) + 0.01 * x * (x < 0)
+
+    def leaky_rectify_0(self, x):
+        return self.rectify(x)
+
+    def elu(self, x, alpha=1):
+        return np.where(x > 0, x, alpha * (np.exp(x) - 1))
+
+    def softplus(self, x):
+        return np.log1p(np.exp(x))
+
+    def sigmoid(self, x):
+        return 1 / (1 + np.exp(-x))
+
+    def tanh(self, x):
+        return np.tanh(x)
+
+    def scaled_tanh(self, x):
+        return np.tanh(x)
+
+    def scaled_tanh_p(self, x):
+        return 2.27 * np.tanh(0.5 * x)
+
+    def softmax(self, x):
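+        # transpose so the per-row sums broadcast across the class axis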
+        return (np.exp(x).T / np.exp(x).sum(-1)).T
+
+    @pytest.mark.parametrize('nonlinearity',
+                             ['linear', 'rectify',
+                              'leaky_rectify', 'elu', 'sigmoid',
+                              'tanh', 'scaled_tanh',
+                              'softmax', 'leaky_rectify_0',
+                              'scaled_tanh_p', 'softplus'])
+    def test_nonlinearity(self, nonlinearity):
+        import lasagne.nonlinearities
+
+        if nonlinearity == 'leaky_rectify_0':
+            from lasagne.nonlinearities import LeakyRectify
+            theano_nonlinearity = LeakyRectify(leakiness=0)
+        elif nonlinearity == 'scaled_tanh':
+            from lasagne.nonlinearities import ScaledTanH
+            theano_nonlinearity = ScaledTanH()
+        elif nonlinearity == 'scaled_tanh_p':
+            from lasagne.nonlinearities import ScaledTanH
+            theano_nonlinearity = ScaledTanH(scale_in=0.5, scale_out=2.27)
+        else:
+            theano_nonlinearity = getattr(lasagne.nonlinearities,
+                                          nonlinearity)
+        np_nonlinearity = getattr(self, nonlinearity)
+
+        X = T.matrix()
+        X0 = lasagne.utils.floatX(np.random.uniform(-3, 3, (10, 10)))
+
+        theano_result = theano_nonlinearity(X).eval({X: X0})
+        np_result = np_nonlinearity(X0)
+
+        assert np.allclose(theano_result, np_result)
diff --git a/lasagne/tests/test_objectives.py b/lasagne/tests/test_objectives.py
new file mode 100644
index 0000000..63717e2
--- /dev/null
+++ b/lasagne/tests/test_objectives.py
@@ -0,0 +1,236 @@
+import numpy as np
+import theano
+import pytest
+
+
+def test_binary_crossentropy():
+    # symbolic version
+    from lasagne.objectives import binary_crossentropy
+    p, t = theano.tensor.matrices('pt')
+    c = binary_crossentropy(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, 20).astype(floatX)
+    targets = np.random.rand(10, 20).astype(floatX)
+    crossent = (- targets * np.log(predictions) -
+                (1-targets) * np.log(1-predictions))
+    # compare
+    assert np.allclose(crossent, c.eval({p: predictions, t: targets}))
+
+
+def test_categorical_crossentropy():
+    # symbolic version
+    from lasagne.objectives import categorical_crossentropy
+    p, t = theano.tensor.matrices('pt')
+    c = categorical_crossentropy(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, 20).astype(floatX)
+    predictions /= predictions.sum(axis=1, keepdims=True)
+    targets = np.random.rand(10, 20).astype(floatX)
+    targets /= targets.sum(axis=1, keepdims=True)
+    crossent = -(targets * np.log(predictions)).sum(axis=-1)
+    # compare
+    assert np.allclose(crossent, c.eval({p: predictions, t: targets}))
+
+
+def test_categorical_crossentropy_onehot():
+    # symbolic version
+    from lasagne.objectives import categorical_crossentropy
+    p = theano.tensor.matrix('p')
+    t = theano.tensor.ivector('t')  # correct class per item
+    c = categorical_crossentropy(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, 20).astype(floatX)
+    predictions /= predictions.sum(axis=1, keepdims=True)
+    targets = np.random.randint(20, size=10).astype(np.uint8)
+    crossent = -np.log(predictions[np.arange(10), targets])
+    # compare
+    assert np.allclose(crossent, c.eval({p: predictions, t: targets}))
+
+
+def test_squared_error():
+    # symbolic version
+    from lasagne.objectives import squared_error
+    a, b = theano.tensor.matrices('ab')
+    c = squared_error(a, b)
+    # numeric version
+    floatX = theano.config.floatX
+    x = np.random.randn(10, 20).astype(floatX)
+    y = np.random.randn(10, 20).astype(floatX)
+    z = (x - y)**2
+    # compare
+    assert np.allclose(z, c.eval({a: x, b: y}))
+
+
+def test_aggregate_mean():
+    from lasagne.objectives import aggregate
+    x = theano.tensor.matrix('x')
+    assert theano.gof.graph.is_same_graph(aggregate(x), x.mean())
+    assert theano.gof.graph.is_same_graph(aggregate(x, mode='mean'), x.mean())
+
+
+def test_aggregate_sum():
+    from lasagne.objectives import aggregate
+    x = theano.tensor.matrix('x')
+    assert theano.gof.graph.is_same_graph(aggregate(x, mode='sum'), x.sum())
+
+
+def test_aggregate_weighted_mean():
+    from lasagne.objectives import aggregate
+    x = theano.tensor.matrix('x')
+    w = theano.tensor.matrix('w')
+    assert theano.gof.graph.is_same_graph(aggregate(x, w), (x * w).mean())
+    assert theano.gof.graph.is_same_graph(aggregate(x, w, mode='mean'),
+                                          (x * w).mean())
+
+
+def test_aggregate_weighted_sum():
+    from lasagne.objectives import aggregate
+    x = theano.tensor.matrix('x')
+    w = theano.tensor.matrix('w')
+    assert theano.gof.graph.is_same_graph(aggregate(x, w, mode='sum'),
+                                          (x * w).sum())
+
+
+def test_aggregate_weighted_normalized_sum():
+    from lasagne.objectives import aggregate
+    x = theano.tensor.matrix('x')
+    w = theano.tensor.matrix('w')
+    assert theano.gof.graph.is_same_graph(aggregate(x, w, 'normalized_sum'),
+                                          (x * w).sum() / w.sum())
+
+
+def test_aggregate_invalid():
+    from lasagne.objectives import aggregate
+    with pytest.raises(ValueError) as exc:
+        aggregate(theano.tensor.matrix(), mode='asdf')
+    assert 'mode must be' in exc.value.args[0]
+    with pytest.raises(ValueError) as exc:
+        aggregate(theano.tensor.matrix(), mode='normalized_sum')
+    assert 'require weights' in exc.value.args[0]
+
+
+def test_binary_hinge_loss():
+    from lasagne.objectives import binary_hinge_loss
+    from lasagne.nonlinearities import rectify
+    p = theano.tensor.vector('p')
+    t = theano.tensor.ivector('t')
+    c = binary_hinge_loss(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10).astype(floatX)
+    targets = np.random.random_integers(0, 1, (10,)).astype("int8")
+    hinge = rectify(1 - predictions * (2 * targets - 1))
+    # compare
+    assert np.allclose(hinge, c.eval({p: predictions, t: targets}))
+
+
+def test_binary_hinge_loss_not_binary_targets():
+    from lasagne.objectives import binary_hinge_loss
+    from lasagne.nonlinearities import rectify
+    p = theano.tensor.vector('p')
+    t = theano.tensor.ivector('t')
+    c = binary_hinge_loss(p, t, binary=False)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, ).astype(floatX)
+    targets = np.random.random_integers(0, 1, (10, )).astype("int8")
+    targets = 2 * targets - 1
+    hinge = rectify(1 - predictions * targets)
+    # compare
+    assert np.allclose(hinge, c.eval({p: predictions, t: targets}))
+
+
+def test_multiclass_hinge_loss():
+    from lasagne.objectives import multiclass_hinge_loss
+    from lasagne.nonlinearities import rectify
+    p = theano.tensor.matrix('p')
+    t = theano.tensor.ivector('t')
+    c = multiclass_hinge_loss(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, 20).astype(floatX)
+    targets = np.random.random_integers(0, 19, (10,)).astype("int8")
+    one_hot = np.zeros((10, 20))
+    one_hot[np.arange(10), targets] = 1
+    correct = predictions[one_hot > 0]
+    rest = predictions[one_hot < 1].reshape((10, 19))
+    rest = np.max(rest, axis=1)
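+    # hinge = max(0, 1 + highest wrong-class score - correct-class score)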
+    hinge = rectify(1 + rest - correct)
+    # compare
+    assert np.allclose(hinge, c.eval({p: predictions, t: targets}))
+
+
+def test_multiclass_hinge_loss_invalid():
+    from lasagne.objectives import multiclass_hinge_loss
+    with pytest.raises(TypeError) as exc:
+        multiclass_hinge_loss(theano.tensor.vector(),
+                              theano.tensor.matrix())
+    assert 'rank mismatch' in exc.value.args[0]
+
+
+def test_binary_accuracy():
+    from lasagne.objectives import binary_accuracy
+    p = theano.tensor.vector('p')
+    t = theano.tensor.ivector('t')
+    c = binary_accuracy(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, ).astype(floatX) > 0.5
+    targets = np.random.random_integers(0, 1, (10,)).astype("int8")
+    accuracy = predictions == targets
+    # compare
+    assert np.allclose(accuracy, c.eval({p: predictions, t: targets}))
+
+
+def test_categorical_accuracy():
+    from lasagne.objectives import categorical_accuracy
+    p = theano.tensor.matrix('p')
+    t = theano.tensor.ivector('t')
+    c = categorical_accuracy(p, t)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(100, 5).astype(floatX)
+    cls_predictions = np.argmax(predictions, axis=1)
+    targets = np.random.random_integers(0, 4, (100,)).astype("int8")
+    accuracy = cls_predictions == targets
+    # compare
+    assert np.allclose(accuracy, c.eval({p: predictions, t: targets}))
+    one_hot = np.zeros((100, 5)).astype("int8")
+    one_hot[np.arange(100), targets] = 1
+    t = theano.tensor.imatrix('t')
+    c = categorical_accuracy(p, t)
+    assert np.allclose(accuracy, c.eval({p: predictions, t: one_hot}))
+
+
+def test_categorical_accuracy_top_k():
+    from lasagne.objectives import categorical_accuracy
+    p = theano.tensor.matrix('p')
+    t = theano.tensor.ivector('t')
+    top_k = 4
+    c = categorical_accuracy(p, t, top_k=top_k)
+    # numeric version
+    floatX = theano.config.floatX
+    predictions = np.random.rand(10, 20).astype(floatX)
+    cls_predictions = np.argsort(predictions, axis=1).astype("int8")
+    # (targets include the top-1 to top-9 predictions plus one low-ranked one)
+    targets = cls_predictions[np.arange(10), -np.random.permutation(10)]
+    top_predictions = cls_predictions[:, -top_k:]
+    accuracy = np.any(top_predictions == targets[:, np.newaxis], axis=1)
+    # compare
+    assert np.allclose(accuracy, c.eval({p: predictions, t: targets}))
+    one_hot = np.zeros((10, 20)).astype("int8")
+    one_hot[np.arange(10), targets] = 1
+    t = theano.tensor.imatrix('t')
+    c = categorical_accuracy(p, t, top_k=top_k)
+    assert np.allclose(accuracy, c.eval({p: predictions, t: one_hot}))
+
+
+def test_categorical_accuracy_invalid():
+    from lasagne.objectives import categorical_accuracy
+    with pytest.raises(TypeError) as exc:
+        categorical_accuracy(theano.tensor.vector(),
+                             theano.tensor.matrix())
+    assert 'rank mismatch' in exc.value.args[0]
diff --git a/lasagne/tests/test_regularization.py b/lasagne/tests/test_regularization.py
new file mode 100644
index 0000000..ce98870
--- /dev/null
+++ b/lasagne/tests/test_regularization.py
@@ -0,0 +1,99 @@
+import pytest
+import numpy as np
+import theano.tensor as T
+import lasagne
+
+from collections import OrderedDict
+from theano.scan_module.scan_utils import equal_computations
+from mock import Mock
+
+
+class TestRegularizationPenalties(object):
+    def l1(self, x):
+        return np.abs(x).sum()
+
+    def l2(self, x):
+        return (x**2).sum()
+
+    @pytest.mark.parametrize('penalty',
+                             ['l1', 'l2'])
+    def test_penalty(self, penalty):
+        np_penalty = getattr(self, penalty)
+        theano_penalty = getattr(lasagne.regularization, penalty)
+
+        X = T.matrix()
+        X0 = lasagne.utils.floatX(np.random.uniform(-3, 3, (10, 10)))
+
+        theano_result = theano_penalty(X).eval({X: X0})
+        np_result = np_penalty(X0)
+
+        assert np.allclose(theano_result, np_result)
+
+
+class TestRegularizationHelpers(object):
+    @pytest.fixture
+    def layers(self):
+        l_1 = lasagne.layers.InputLayer((10,))
+        l_2 = lasagne.layers.DenseLayer(l_1, num_units=20)
+        l_3 = lasagne.layers.DenseLayer(l_2, num_units=30)
+        return l_1, l_2, l_3
+
+    def test_apply_penalty(self):
+        from lasagne.regularization import apply_penalty, l2
+        A = T.vector()
+        B = T.matrix()
+
+        assert apply_penalty([], l2) == 0
+
+        assert equal_computations([apply_penalty(A, l2)],
+                                  [l2(A)])
+
+        assert equal_computations([apply_penalty([A, B], l2)],
+                                  [sum([l2(A), l2(B)])])
+
+    def test_regularize_layer_params_single_layer(self, layers):
+        from lasagne.regularization import regularize_layer_params
+        l_1, l_2, l_3 = layers
+
+        penalty = Mock(return_value=0)
+        loss = regularize_layer_params(l_2, penalty)
+
+        assert penalty.call_count == 1
+        penalty.assert_any_call(l_2.W)
+
+    def test_regularize_layer_params_multiple_layers(self, layers):
+        from lasagne.regularization import regularize_layer_params
+        l_1, l_2, l_3 = layers
+
+        penalty = Mock(return_value=0)
+        loss = regularize_layer_params([l_1, l_2, l_3], penalty)
+
+        assert penalty.call_count == 2
+        penalty.assert_any_call(l_2.W)
+        penalty.assert_any_call(l_3.W)
+
+    def test_regularize_network_params(self, layers):
+        from lasagne.regularization import regularize_network_params
+        l_1, l_2, l_3 = layers
+
+        penalty = Mock(return_value=0)
+        loss = regularize_network_params(l_3, penalty)
+
+        assert penalty.call_count == 2
+        penalty.assert_any_call(l_2.W)
+        penalty.assert_any_call(l_3.W)
+
+    def test_regularize_layer_params_weighted(self, layers):
+        from lasagne.regularization import regularize_layer_params_weighted
+        from lasagne.regularization import apply_penalty, l2
+        l_1, l_2, l_3 = layers
+
+        layers = OrderedDict()
+        layers[l_2] = 0.1
+        layers[l_3] = 0.5
+
+        loss = regularize_layer_params_weighted(layers,
+                                                lasagne.regularization.l2)
+        assert equal_computations([loss],
+                                  [sum([0.1 * apply_penalty([l_2.W], l2),
+                                        0.5 * apply_penalty([l_3.W], l2)])])
diff --git a/lasagne/tests/test_theano_extensions.py b/lasagne/tests/test_theano_extensions.py
new file mode 100644
index 0000000..57a9c6c
--- /dev/null
+++ b/lasagne/tests/test_theano_extensions.py
@@ -0,0 +1,155 @@
+import pytest
+import numpy as np
+import theano.tensor as T
+import lasagne
+
+
+def conv1d(input, kernel, stride=1):
+    output = []
+    for b in input:
+        temp = []
+        for c in kernel:
+            temp.append(
+                np.convolve(b[0, :], c[0, :], mode='valid'))
+        output.append(temp)
+    return np.array(output)[:, :, ::stride]
+
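+# A rough shape sketch for the reference helper above (assuming the
+# (1, 1, 10) input and (2, 1, 6) kernel used in test_conv below):
+# np.convolve(..., mode='valid') gives length 10 - 6 + 1 = 5 per filter,
+# so the stacked result is (1, 2, 5); slicing with [:, :, ::2] keeps
+# indices 0, 2 and 4, i.e. (1, 2, 3), matching (10 - 6) // 2 + 1 = 3 for
+# a stride of 2.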
+
+@pytest.mark.parametrize('impl', ['conv1d_sc', 'conv1d_mc0',
+                                  'conv1d_mc1', 'conv1d_unstrided',
+                                  'conv1d_sd', 'conv1d_md'])
+@pytest.mark.parametrize('filter_flip', [True, False])
+@pytest.mark.parametrize('stride', [1, 2])
+def test_conv(impl, stride, filter_flip):
+    import lasagne.theano_extensions.conv
+    conv = getattr(lasagne.theano_extensions.conv, impl)
+
+    X = T.tensor3()
+    W = T.tensor3()
+    input = lasagne.utils.floatX(np.ones((1, 1, 10)))
+    kernel = lasagne.utils.floatX(np.random.uniform(-1, 1, (2, 1, 6)))
+
+    conv_theano = conv(X, W, input.shape, kernel.shape, subsample=(stride,),
+                       filter_flip=filter_flip).eval({X: input, W: kernel})
+
+    conv_np = conv1d(input, kernel, stride)
+
+    assert np.allclose(conv_theano, conv_np)
+
+
+@pytest.mark.parametrize('impl', ['conv1d_sc', 'conv1d_mc0', 'conv1d_mc1'])
+def test_conv_nones(impl):
+    import lasagne.theano_extensions.conv
+    conv = getattr(lasagne.theano_extensions.conv, impl)
+
+    X = T.tensor3()
+    W = T.tensor3()
+    input = lasagne.utils.floatX(np.ones((1, 1, 12)))
+    kernel = lasagne.utils.floatX(np.random.uniform(-1, 1, (2, 1, 3)))
+
+    conv_theano = conv(X, W, None, None).eval({
+        X: input, W: kernel
+        })
+
+    conv_np = conv1d(input, kernel)
+
+    assert np.allclose(conv_theano, conv_np)
+
+
+@pytest.mark.parametrize('impl', ['conv1d_mc0', 'conv1d_mc1'])
+@pytest.mark.parametrize('pad', [1, (2,)])
+def test_conv_pad(impl, pad):
+    import lasagne.theano_extensions.conv
+    conv = getattr(lasagne.theano_extensions.conv, impl)
+
+    X = T.tensor3()
+    W = T.tensor3()
+    input = lasagne.utils.floatX(np.ones((1, 1, 12)))
+    kernel = lasagne.utils.floatX(np.random.uniform(-1, 1, (2, 1, 3)))
+
+    conv_theano = conv(X, W, input.shape, kernel.shape, border_mode=pad).eval({
+        X: input, W: kernel
+        })
+
+    pad = pad[0] if isinstance(pad, tuple) else pad
+    input = np.pad(input, [(0, 0), (0, 0), (pad, pad)], mode='constant')
+    conv_np = conv1d(input, kernel)
+
+    assert np.allclose(conv_theano, conv_np)
+
+
+@pytest.mark.parametrize('impl', ['conv1d_sc', 'conv1d_mc0',
+                                  'conv1d_mc1', 'conv1d_unstrided',
+                                  'conv1d_sd', 'conv1d_md'])
+def test_conv_invalid_border_mode(impl):
+    import lasagne.theano_extensions.conv
+    conv = getattr(lasagne.theano_extensions.conv, impl)
+
+    X = T.tensor3()
+    W = T.tensor3()
+
+    with pytest.raises(Exception):
+        conv(X, W, (1, 1, 10), (2, 1, 3), border_mode=None)
+
+
+@pytest.mark.parametrize('impl', ['conv1d_unstrided', 'conv1d_sd',
+                                  'conv1d_md'])
+def test_conv_stride(impl):
+    import lasagne.theano_extensions.conv
+    conv = getattr(lasagne.theano_extensions.conv, impl)
+
+    X = T.tensor3()
+    W = T.tensor3()
+
+    with pytest.raises(Exception):
+        conv(X, W, (1, 1, 10), (2, 1, 3), subsample=(2,))
+
+
+@pytest.mark.parametrize('val', [0, 7])
+@pytest.mark.parametrize('batch_ndim', [1, 2])
+def test_pad(batch_ndim, val, width=3):
+    from lasagne.theano_extensions.padding import pad
+
+    X = T.tensor4()
+    X0 = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
+    X_pad_theano = pad(X, width, val, batch_ndim).eval({X: X0})
+
+    pads = tuple((width, width) if i >= batch_ndim else (0, 0)
+                 for i, _ in enumerate(X0.shape))
+    X_pad_np = np.pad(X0, pads, mode='constant', constant_values=val)
+
+    assert (X_pad_theano == X_pad_np).all()
+
+
+@pytest.mark.parametrize('batch_ndim', [1, 2])
+def test_pad_width_per_axis(batch_ndim, val=0):
+    from lasagne.theano_extensions.padding import pad
+
+    width = (1, 2, 3, 4)
+
+    X = T.tensor4()
+    X0 = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
+    X_pad_theano = pad(X, width[batch_ndim:], val, batch_ndim).eval({X: X0})
+
+    pads = tuple((w, w) if i >= batch_ndim else (0, 0)
+                 for i, w in enumerate(width))
+    X_pad_np = np.pad(X0, pads, mode='constant', constant_values=val)
+
+    assert (X_pad_theano == X_pad_np).all()
+
+
+@pytest.mark.parametrize('batch_ndim', [1, 2])
+def test_pad_width_per_border(batch_ndim, val=0):
+    from lasagne.theano_extensions.padding import pad
+
+    width = [(1, 2), (3, 4), (1, 2), (3, 4)]
+
+    X = T.tensor4()
+    X0 = lasagne.utils.floatX(np.ones((2, 3, 4, 5)))
+    X_pad_theano = pad(X, width[batch_ndim:], val, batch_ndim).eval({X: X0})
+
+    pads = tuple(w if i >= batch_ndim else (0, 0)
+                 for i, w in enumerate(width))
+    X_pad_np = np.pad(X0, pads, mode='constant', constant_values=val)
+
+    assert (X_pad_theano == X_pad_np).all()
diff --git a/lasagne/tests/test_updates.py b/lasagne/tests/test_updates.py
new file mode 100644
index 0000000..c82cd2c
--- /dev/null
+++ b/lasagne/tests/test_updates.py
@@ -0,0 +1,227 @@
+import pytest
+import numpy as np
+import theano
+import theano.tensor as T
+import lasagne
+
+PCT_TOLERANCE = 1E-5
+
+
+class TestUpdateFunctions(object):
+    # These tests compare results on a toy problem to values
+    # calculated by the torch.optim package, using this script:
+    # https://gist.github.com/ebenolson/931e879ed38f257253d2
+    torch_values = {'sgd': [0.81707280688755,
+                            0.6648326359915,
+                            0.5386151140949],
+                    'momentum': [0.6848486952183,
+                                 0.44803321781003,
+                                 0.27431190123502],
+                    'nesterov_momentum': [0.67466543592725,
+                                          0.44108468114241,
+                                          0.2769002108997],
+                    'adagrad': [0.55373120047759,
+                                0.55373120041518,
+                                0.55373120039438],
+                    'rmsprop': [0.83205403985348,
+                                0.83205322744821,
+                                0.83205295664444],
+                    'adadelta': [0.95453237704725,
+                                 0.9545237471374,
+                                 0.95452214847397],
+                    'adam': [0.90034972009036,
+                             0.90034967993061,
+                             0.90034966654402],
+                    'adamax': [0.90211749000754,
+                               0.90211748762402,
+                               0.90211748682951],
+                    }
+
+    def f(self, X):
+        return ([0.1, 0.2, 0.3] * X**2).sum()
+
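+    # A hand-checked sanity value for the comparison above (assuming plain
+    # 'sgd' with learning_rate=0.1): f has gradient 2 * [0.1, 0.2, 0.3] * X,
+    # so every step scales the first coordinate by 1 - 0.1 * 2 * 0.1 = 0.98;
+    # after the 10 steps below this gives 0.98 ** 10 ~= 0.81707, matching
+    # the first torch.optim value listed for 'sgd'.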
+    @pytest.mark.parametrize('method, kwargs', [
+        ['sgd', {'learning_rate': 0.1}],
+        ['momentum', {'learning_rate': 0.1, 'momentum': 0.5}],
+        ['nesterov_momentum', {'learning_rate': 0.1, 'momentum': 0.5}],
+        ['adagrad', {'learning_rate': 0.1}],
+        ['rmsprop', {'learning_rate': 0.01}],
+        ['adadelta', {}],
+        ['adam', {'learning_rate': 0.01}],
+        ['adamax', {'learning_rate': 0.01}],
+        ])
+    def test_updates(self, method, kwargs):
+        A = theano.shared(lasagne.utils.floatX([1, 1, 1]))
+        B = theano.shared(lasagne.utils.floatX([1, 1, 1]))
+        update_func = getattr(lasagne.updates, method)
+        updates = update_func(self.f(A) + self.f(B),
+                              [A, B],
+                              **kwargs)
+        do_update = theano.function([], [], updates=updates)
+
+        for _ in range(10):
+            do_update()
+
+        assert np.allclose(A.get_value(), B.get_value())
+        assert np.allclose(A.get_value(), self.torch_values[method])
+
+    @pytest.mark.parametrize('method, kwargs', [
+        ['sgd', {'learning_rate': 0.1}],
+        ['momentum', {'learning_rate': 0.1,
+                      'momentum': 0.5}],
+        ['nesterov_momentum', {'learning_rate': 0.1,
+                               'momentum': 0.5}],
+        ['adagrad', {'learning_rate': 0.1,
+                     'epsilon': 1e-6}],
+        ['rmsprop', {'learning_rate': 0.01,
+                     'rho': 0.9,
+                     'epsilon': 1e-6}],
+        ['adadelta', {'learning_rate': 0.01,
+                      'rho': 0.9,
+                      'epsilon': 1e-6}],
+        ['adam', {'learning_rate': 0.01,
+                  'beta1': 0.9,
+                  'beta2': 0.999,
+                  'epsilon': 1e-8}],
+        ['adamax', {'learning_rate': 0.01,
+                    'beta1': 0.9,
+                    'beta2': 0.999,
+                    'epsilon': 1e-8}],
+        ])
+    def test_update_returntype(self, method, kwargs):
+        '''Checks whether lasagne.updates handles float32 inputs correctly'''
+        floatX_ = theano.config.floatX
+        theano.config.floatX = 'float32'
+        try:
+            A = theano.shared(lasagne.utils.floatX([1, 1, 1]))
+            B = theano.shared(lasagne.utils.floatX([1, 1, 1]))
+            update_func = getattr(lasagne.updates, method)
+            updates = update_func(self.f(A) + self.f(B),
+                                  [A, B],
+                                  **kwargs)
+
+            assert all(v.dtype == 'float32' for v in updates)
+
+            # Checking for float32 arguments
+            for param in kwargs:
+                kwargs[param] = np.float32(kwargs[param])
+            updates = update_func(self.f(A) + self.f(B),
+                                  [A, B],
+                                  **kwargs)
+
+            assert all(v.dtype == 'float32' for v in updates)
+        finally:
+            theano.config.floatX = floatX_
+
+
+def test_get_or_compute_grads():
+
+    from lasagne.updates import get_or_compute_grads
+
+    A = theano.shared(1)
+    B = theano.shared(1)
+    loss = A + B
+    grads = get_or_compute_grads(loss, [A, B])
+
+    assert get_or_compute_grads(grads, [A, B]) is grads
+
+    with pytest.raises(ValueError):
+        get_or_compute_grads(grads, [A])
+
+    C = T.scalar()
+    with pytest.raises(ValueError):
+        get_or_compute_grads(A + C, [A, C])
+
+
+@pytest.mark.parametrize('ndim', [2, 3])
+def test_norm_constraint(ndim):
+    import numpy as np
+    import theano
+    from lasagne.updates import norm_constraint
+    from lasagne.utils import compute_norms
+
+    max_norm = 0.01
+
+    param = theano.shared(
+        np.random.randn(*((25,) * ndim)).astype(theano.config.floatX)
+    )
+
+    update = norm_constraint(param, max_norm)
+
+    apply_update = theano.function([], [], updates=[(param, update)])
+    apply_update()
+
+    assert param.dtype == update.dtype
+    assert (np.max(compute_norms(param.get_value())) <=
+            max_norm * (1 + PCT_TOLERANCE))
+
+
+def test_norm_constraint_norm_axes():
+    import numpy as np
+    import theano
+    from lasagne.updates import norm_constraint
+    from lasagne.utils import compute_norms
+
+    max_norm = 0.01
+    norm_axes = (0, 2)
+
+    param = theano.shared(
+        np.random.randn(10, 20, 30, 40).astype(theano.config.floatX)
+    )
+
+    update = norm_constraint(param, max_norm, norm_axes=norm_axes)
+
+    apply_update = theano.function([], [], updates=[(param, update)])
+    apply_update()
+
+    assert param.dtype == update.dtype
+    assert (np.max(compute_norms(param.get_value(), norm_axes=norm_axes)) <=
+            max_norm*(1 + PCT_TOLERANCE))
+
+
+def test_norm_constraint_dim6_raises():
+    import numpy as np
+    import theano
+    from lasagne.updates import norm_constraint
+
+    max_norm = 0.01
+
+    param = theano.shared(
+        np.random.randn(1, 2, 3, 4, 5, 6).astype(theano.config.floatX)
+    )
+
+    with pytest.raises(ValueError) as excinfo:
+        norm_constraint(param, max_norm)
+    assert "Unsupported tensor dimensionality" in str(excinfo.value)
+
+
+def test_total_norm_constraint():
+    import numpy as np
+    import theano
+    import theano.tensor as T
+    from lasagne.updates import total_norm_constraint
+
+    x1 = T.scalar()
+    x2 = T.matrix()
+    threshold = 5.0
+    tensors1 = total_norm_constraint([x1, x2], threshold, return_norm=False)
+    tensors2, norm = total_norm_constraint([x1, x2], threshold,
+                                           return_norm=True)
+
+    f1 = theano.function([x1, x2], [tensors1[0], tensors1[1]])
+    f2 = theano.function([x1, x2], [tensors2[0], tensors2[1],
+                                    norm])
+
+    x_test = np.arange(1+9, dtype='float32')
+    x1_test = x_test[-1]
+    x2_test = x_test[:9].reshape((3, 3))
+    x1_out1, x2_out1 = f1(x1_test, x2_test)
+    x1_out2, x2_out2, norm = f2(x1_test, x2_test)
+
+    np.testing.assert_array_almost_equal(x1_out1, x1_out2)
+    np.testing.assert_array_almost_equal(x2_out1, x2_out2)
+
+    x_out = [float(x1_out1)] + list(x2_out1.flatten())
+
+    np.testing.assert_array_almost_equal(np.linalg.norm(x_test), norm)
+    np.testing.assert_array_almost_equal(np.linalg.norm(x_out), threshold)
diff --git a/lasagne/tests/test_utils.py b/lasagne/tests/test_utils.py
new file mode 100644
index 0000000..98d38a6
--- /dev/null
+++ b/lasagne/tests/test_utils.py
@@ -0,0 +1,308 @@
+from mock import Mock
+import pytest
+import numpy as np
+import theano
+import theano.tensor as T
+
+
+def test_shared_empty():
+    from lasagne.utils import shared_empty
+
+    X = shared_empty(3)
+    assert (np.zeros((1, 1, 1)) == X.eval()).all()
+
+
+def test_as_theano_expression_fails():
+    from lasagne.utils import as_theano_expression
+    with pytest.raises(TypeError):
+        as_theano_expression({})
+
+
+def test_collect_shared_vars():
+    from lasagne.utils import collect_shared_vars as collect
+    x, y, z = (theano.shared(0, name=n) for n in 'xyz')
+    # collecting must not change the order
+    assert collect([x, y, z]) == [x, y, z]
+    # duplicates should be eliminated
+    assert collect([x, y, x, y, y, z]) == [x, y, z]
+    # ensure we have left-recursive depth-first search
+    assert collect((x + y) + z) == [x, y, z]
+    assert collect(x + (y + z)) == [x, y, z]
+    # complex expressions and constants should not be included
+    assert collect([x**2, y * z * np.ones(10), x + T.matrix()]) == [x, y, z]
+    # the result can even be empty
+    assert collect([T.matrix() + T.matrix(), T.log(T.matrix())]) == []
+
+
+def test_one_hot():
+    from lasagne.utils import one_hot
+    a = np.random.randint(0, 10, 20)
+    b = np.zeros((a.size, a.max()+1))
+    b[np.arange(a.size), a] = 1
+
+    result = one_hot(a).eval()
+    assert (result == b).all()
+
+
+def test_as_tuple_fails():
+    from lasagne.utils import as_tuple
+    with pytest.raises(ValueError):
+        as_tuple([1, 2, 3], 4)
+    with pytest.raises(TypeError):
+        as_tuple('asdf', 4, int)
+
+
+def test_compute_norms():
+    from lasagne.utils import compute_norms
+
+    # Test numpy version of compute_norms
+    array = np.random.randn(10, 20, 30, 40).astype(theano.config.floatX)
+
+    norms = compute_norms(array)
+
+    assert array.dtype == norms.dtype
+    assert norms.shape[0] == array.shape[0]
+
+    # Test theano version of compute_norms
+    t_array = theano.shared(array)
+    t_norms = compute_norms(t_array)
+
+    # Check if they do not differ much
+    assert np.allclose(t_norms.eval(), norms)
+
+
+def test_compute_norms_axes():
+    from lasagne.utils import compute_norms
+
+    # Test numpy versions of compute norms with axes
+    array = np.random.randn(10, 20, 30, 40).astype(theano.config.floatX)
+
+    norms = compute_norms(array, norm_axes=(0, 2))
+
+    assert array.dtype == norms.dtype
+    assert norms.shape == (array.shape[1], array.shape[3])
+
+    # Test theano version of compute_norms
+    t_array = theano.shared(array)
+    t_norms = compute_norms(t_array, norm_axes=(0, 2))
+
+    # Check if they do not differ much
+    assert np.allclose(t_norms.eval(), norms)
+
+
+def test_compute_norms_ndim1():
+    from lasagne.utils import compute_norms
+
+    # Test numpy versions of compute norms with axes
+    array = np.random.randn(10, ).astype(theano.config.floatX)
+
+    norms = compute_norms(array)
+
+    assert array.dtype == norms.dtype
+    assert norms.shape == array.shape
+
+    # Check if they do not differ much
+    assert np.allclose(norms, abs(array))
+
+    # Test theano version of compute_norms
+    t_array = theano.shared(array)
+    t_norms = compute_norms(t_array)
+
+    # Check if they do not differ much
+    assert np.allclose(t_norms.eval(), norms)
+
+
+def test_compute_norms_type_raises():
+    from lasagne.utils import compute_norms
+
+    array = [[1, 2], [3, 4]]
+
+    with pytest.raises(RuntimeError) as excinfo:
+        compute_norms(array)
+
+    assert ("Unsupported type") in str(excinfo.value)
+
+
+def test_compute_norms_ndim6_raises():
+    from lasagne.utils import compute_norms
+
+    array = np.random.randn(1, 2, 3, 4, 5, 6).astype(theano.config.floatX)
+
+    with pytest.raises(ValueError) as excinfo:
+        compute_norms(array)
+
+    assert "Unsupported tensor dimensionality" in str(excinfo.value)
+
+
+def test_create_param_bad_callable_raises():
+    from lasagne.utils import create_param
+
+    with pytest.raises(TypeError):
+        create_param(lambda x: {}, (1, 2, 3))
+    with pytest.raises(ValueError):
+        create_param(lambda x: np.array(1), (1, 2, 3))
+
+
+def test_create_param_bad_spec_raises():
+    from lasagne.utils import create_param
+
+    with pytest.raises(TypeError):
+        create_param({}, (1, 2, 3))
+
+
+def test_create_param_accepts_iterable_shape():
+    from lasagne.utils import create_param
+    factory = np.empty
+    create_param(factory, [2, 3])
+    create_param(factory, (x for x in [2, 3]))
+
+
+def test_create_param_numpy_bad_shape_raises_error():
+    from lasagne.utils import create_param
+
+    param = np.array([[1, 2, 3], [4, 5, 6]])
+    with pytest.raises(ValueError):
+        create_param(param, (3, 2))
+
+
+def test_create_param_numpy_returns_shared():
+    from lasagne.utils import create_param
+
+    param = np.array([[1, 2, 3], [4, 5, 6]])
+    result = create_param(param, (2, 3))
+    assert (result.get_value() == param).all()
+    assert isinstance(result, type(theano.shared(param)))
+    assert (result.get_value() == param).all()
+
+
+def test_create_param_shared_returns_same():
+    from lasagne.utils import create_param
+
+    param = theano.shared(np.array([[1, 2, 3], [4, 5, 6]]))
+    result = create_param(param, (2, 3))
+    assert result is param
+
+
+def test_create_param_shared_bad_ndim_raises_error():
+    from lasagne.utils import create_param
+
+    param = theano.shared(np.array([[1, 2, 3], [4, 5, 6]]))
+    with pytest.raises(ValueError):
+        create_param(param, (2, 3, 4))
+
+
+def test_create_param_callable_returns_return_value():
+    from lasagne.utils import create_param
+
+    array = np.array([[1, 2, 3], [4, 5, 6]])
+    factory = Mock()
+    factory.return_value = array
+    result = create_param(factory, (2, 3))
+    assert (result.get_value() == array).all()
+    factory.assert_called_with((2, 3))
+
+
+def test_create_param_callable_returns_shared():
+    from lasagne.utils import create_param
+
+    array = np.array([[1, 2, 3], [4, 5, 6]])
+    param = theano.shared(array)
+    factory = Mock()
+    factory.return_value = param
+    result = create_param(factory, (2, 3))
+    assert (result.get_value() == array).all()
+    factory.assert_called_with((2, 3))
+    assert result is param
+
+
+def test_create_param_callable_returns_shared_bad_ndim_raises_error():
+    from lasagne.utils import create_param
+
+    array = np.array([[1, 2], [3, 4]])
+    param = theano.shared(array)
+    factory = Mock()
+    factory.return_value = param
+    with pytest.raises(ValueError):
+        create_param(factory, (2, 3, 4))
+
+
+def test_create_param_callable_returns_theano_expr():
+    from lasagne.utils import create_param
+
+    array = np.array([[1, 2, 3], [4, 5, 6]])
+    param = theano.shared(array) * 2
+    factory = Mock()
+    factory.return_value = param
+    result = create_param(factory, (2, 3))
+    assert (result.eval() == array * 2).all()
+    assert result is param
+
+
+def test_nonpositive_dims_raises_value_error():
+    from lasagne.utils import create_param
+    neg_shape = (-1, -1)
+    zero_shape = (0, 0)
+    pos_shape = (1, 1)
+    spec = np.empty
+    with pytest.raises(ValueError):
+        create_param(spec, neg_shape)
+    with pytest.raises(ValueError):
+        create_param(spec, zero_shape)
+    create_param(spec, pos_shape)
+
+
+def test_create_param_callable_returns_wrong_type():
+    from lasagne.utils import create_param
+
+    param = 'string'
+    factory = Mock()
+    factory.return_value = param
+    with pytest.raises(TypeError):
+        create_param(factory, (1, 2))
+
+
+def test_create_param_retain_ndarray_dtype():
+    from lasagne.utils import create_param
+    param = np.array([[1, 2, 3], [4, 5, 6]])
+
+    param = param.astype('float64')
+    result = create_param(param, (2, 3))
+    assert (result.dtype == param.dtype)
+
+    param = param.astype('int16')
+    result = create_param(param, (2, 3))
+    assert (result.dtype == param.dtype)
+
+
+def test_create_param_broadcast_pattern():
+    from lasagne.utils import create_param
+    for shape in (10, 1, 20), (1, 2), (3, 1), (2, 3):
+        bcast = tuple(s == 1 for s in shape)
+        assert create_param(np.zeros, shape).broadcastable == bcast
+        assert create_param(np.zeros(shape, np.float32),
+                            shape).broadcastable == bcast
+
+
+def test_unroll_scan():
+    from lasagne.utils import unroll_scan
+    k = 2
+    a = T.scalar("a")
+
+    result = unroll_scan(
+        fn=lambda step, prior_result, a: prior_result * a,
+        sequences=T.arange(k), outputs_info=[1.], non_sequences=[a], n_steps=k)
+    final_result = result[-1]
+    power = theano.function(inputs=[a], outputs=final_result)
+
+    assert np.all(power(10) == [10, 100])
+
+    b = T.scalar("b")
+
+    def mul_div(step, previous_mul, previous_div, mul, div):
+        return previous_mul*mul, previous_div/div
+
+    result = unroll_scan(
+        fn=mul_div, sequences=T.arange(k), outputs_info=[1., 1.],
+        non_sequences=[a, b], n_steps=k)
+    power = theano.function(inputs=[a, b], outputs=result)
+    assert np.allclose(power(10, 10), [[10, 100], [.1, .01]])
diff --git a/lasagne/theano_extensions/__init__.py b/lasagne/theano_extensions/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lasagne/theano_extensions/conv.py b/lasagne/theano_extensions/conv.py
new file mode 100644
index 0000000..6dbca08
--- /dev/null
+++ b/lasagne/theano_extensions/conv.py
@@ -0,0 +1,273 @@
+"""
+Alternative convolution implementations for Theano
+"""
+
+import numpy as np
+
+import theano.tensor as T
+
+
+# 1D convolutions
+
+def conv1d_sc(input, filters, image_shape=None, filter_shape=None,
+              border_mode='valid', subsample=(1,), filter_flip=True):
+    """
+    using conv2d with a single input channel
+    """
+    if border_mode not in ('valid', 0, (0,)):
+        raise RuntimeError("Unsupported border_mode for conv1d_sc: "
+                           "%s" % border_mode)
+
+    if image_shape is None:
+        image_shape_sc = None
+    else:
+        # (b, c, i0) to (b, 1, c, i0)
+        image_shape_sc = (image_shape[0], 1, image_shape[1], image_shape[2])
+
+    if filter_shape is None:
+        filter_shape_sc = None
+    else:
+        filter_shape_sc = (filter_shape[0], 1, filter_shape[1],
+                           filter_shape[2])
+
+    input_sc = input.dimshuffle(0, 'x', 1, 2)
+    # We need to flip the channels dimension because it will be convolved over.
+    filters_sc = filters.dimshuffle(0, 'x', 1, 2)[:, :, ::-1, :]
+
+    conved = T.nnet.conv2d(input_sc, filters_sc, image_shape_sc,
+                           filter_shape_sc, subsample=(1, subsample[0]),
+                           filter_flip=filter_flip)
+    return conved[:, :, 0, :]  # drop the unused dimension
+
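+# A shape-level sketch of the single-channel trick above (assuming input
+# (b, c, l), filters (n, c, f) and unit stride): the input becomes
+# (b, 1, c, l), the filters become (n, 1, c, f) with the channel axis
+# flipped, conv2d then yields (b, n, 1, l - f + 1), and the singleton
+# axis is dropped to give (b, n, l - f + 1).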
+
+def conv1d_mc0(input, filters, image_shape=None, filter_shape=None,
+               border_mode='valid', subsample=(1,), filter_flip=True):
+    """
+    using conv2d with width == 1
+    """
+    if image_shape is None:
+        image_shape_mc0 = None
+    else:
+        # (b, c, i0) to (b, c, 1, i0)
+        image_shape_mc0 = (image_shape[0], image_shape[1], 1, image_shape[2])
+
+    if filter_shape is None:
+        filter_shape_mc0 = None
+    else:
+        filter_shape_mc0 = (filter_shape[0], filter_shape[1], 1,
+                            filter_shape[2])
+
+    if isinstance(border_mode, tuple):
+        (border_mode,) = border_mode
+    if isinstance(border_mode, int):
+        border_mode = (0, border_mode)
+
+    input_mc0 = input.dimshuffle(0, 1, 'x', 2)
+    filters_mc0 = filters.dimshuffle(0, 1, 'x', 2)
+
+    conved = T.nnet.conv2d(
+        input_mc0, filters_mc0, image_shape_mc0, filter_shape_mc0,
+        subsample=(1, subsample[0]), border_mode=border_mode,
+        filter_flip=filter_flip)
+    return conved[:, :, 0, :]  # drop the unused dimension
+
+
+def conv1d_mc1(input, filters, image_shape=None, filter_shape=None,
+               border_mode='valid', subsample=(1,), filter_flip=True):
+    """
+    using conv2d with height == 1
+    """
+    if image_shape is None:
+        image_shape_mc1 = None
+    else:
+        # (b, c, i0) to (b, c, i0, 1)
+        image_shape_mc1 = (image_shape[0], image_shape[1], image_shape[2], 1)
+
+    if filter_shape is None:
+        filter_shape_mc1 = None
+    else:
+        filter_shape_mc1 = (filter_shape[0], filter_shape[1],
+                            filter_shape[2], 1)
+
+    if isinstance(border_mode, tuple):
+        (border_mode,) = border_mode
+    if isinstance(border_mode, int):
+        border_mode = (border_mode, 0)
+
+    input_mc1 = input.dimshuffle(0, 1, 2, 'x')
+    filters_mc1 = filters.dimshuffle(0, 1, 2, 'x')
+
+    conved = T.nnet.conv2d(
+        input_mc1, filters_mc1, image_shape_mc1, filter_shape_mc1,
+        subsample=(subsample[0], 1), border_mode=border_mode,
+        filter_flip=filter_flip)
+    return conved[:, :, :, 0]  # drop the unused dimension
+
+
+def conv1d_unstrided(input, filters, image_shape, filter_shape,
+                     border_mode='valid', subsample=(1,), filter_flip=True,
+                     implementation=conv1d_sc):
+    """
+    perform a strided 1D convolution by reshaping input and filters so that the
+    stride becomes 1. This function requires that the filter length is a
+    multiple of the stride. It also truncates the input to have a length
+    that is a multiple of the stride.
+    """
+    batch_size, num_input_channels, input_length = image_shape
+    num_filters, num_input_channels_, filter_length = filter_shape
+    stride = subsample[0]
+
+    if filter_length % stride > 0:
+        raise RuntimeError("Filter length (%d) is not a multiple of the "
+                           "stride (%d)" % (filter_length, stride))
+    # TODO: test if this works for border_mode='full'
+    if border_mode not in ('valid', 0, (0,)):
+        raise RuntimeError("Unsupported border_mode for conv1d_unstrided: "
+                           "%s" % border_mode)
+
+    num_steps = filter_length // stride
+
+    # input sizes need to be multiples of the strides,
+    # truncate to correct sizes.
+    truncated_length = (input_length // stride) * stride
+    input_truncated = input[:, :, :truncated_length]
+
+    r_input_shape = (batch_size, num_input_channels,
+                     truncated_length // stride, stride)
+    r_input = input_truncated.reshape(r_input_shape)
+
+    # fold strides into the feature maps dimension (input)
+    r_input_folded_shape = (batch_size, num_input_channels * stride,
+                            truncated_length // stride)
+    r_input_folded = r_input.dimshuffle(
+        0, 1, 3, 2).reshape(r_input_folded_shape)
+
+    r_filter_shape = (num_filters, num_input_channels, num_steps, stride)
+    r_filters_flipped = filters[:, :, ::-1].reshape(r_filter_shape)
+
+    # fold strides into the feature maps dimension (filters)
+    r_filter_folded_shape = (num_filters, num_input_channels * stride,
+                             num_steps)
+    r_filters_flipped_folded = r_filters_flipped.dimshuffle(
+        0, 1, 3, 2).reshape(r_filter_folded_shape)
+    r_filters_folded = r_filters_flipped_folded[:, :, ::-1]  # unflip
+
+    return implementation(r_input_folded, r_filters_folded,
+                          r_input_folded_shape, r_filter_folded_shape,
+                          border_mode, subsample=(1,), filter_flip=filter_flip)
+
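+# A worked shape example for the fold trick above (assuming image_shape
+# (b, c, 10), filter_shape (n, c, 6) and a stride of 2): num_steps = 3,
+# the input is reshaped to (b, c, 5, 2) and folded to (b, 2 * c, 5), the
+# filters to (n, c, 3, 2) and then (n, 2 * c, 3), and the unit-stride
+# convolution of these gives (b, n, 3), the same output length as
+# (10 - 6) // 2 + 1 for the original strided convolution.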
+
+def conv1d_sd(input, filters, image_shape, filter_shape, border_mode='valid',
+              subsample=(1,), filter_flip=True):
+    """
+    using a single dot product
+    """
+    if border_mode not in ('valid', 0, (0,)):
+        raise RuntimeError("Unsupported border_mode for conv1d_sd: "
+                           "%s" % border_mode)
+
+    batch_size, num_input_channels, input_length = image_shape
+    num_filters, num_input_channels_, filter_length = filter_shape
+    stride = subsample[0]
+
+    if filter_length % stride > 0:
+        raise RuntimeError("Filter length (%d) is not a multiple of the "
+                           "stride (%d)" % (filter_length, stride))
+
+    num_steps = filter_length // stride
+    output_length = (input_length - filter_length + stride) // stride
+
+    # pad the input so all the shifted dot products fit inside.
+    # shape is (b, c, l)
+    padded_length = ((input_length // filter_length) * filter_length +
+                     (num_steps - 1) * stride)
+
+    # at this point, it is possible that the padded_length is SMALLER than the
+    # input size. so then we have to truncate first.
+    truncated_length = min(input_length, padded_length)
+    input_truncated = input[:, :, :truncated_length]
+
+    input_padded_shape = (batch_size, num_input_channels, padded_length)
+    input_padded = T.zeros(input_padded_shape)
+    input_padded = T.set_subtensor(input_padded[:, :, :truncated_length],
+                                   input_truncated)
+
+    inputs = []
+    for num in range(num_steps):
+        shift = num * stride
+        length = (padded_length - shift) // filter_length
+
+        r_input_shape = (batch_size, num_input_channels, length, filter_length)
+        r_input = input_padded[
+            :, :, shift:length * filter_length + shift].reshape(r_input_shape)
+
+        inputs.append(r_input)
+
+    inputs_stacked = T.stack(*inputs)  # shape is (n, b, c, w, f)
+    filters_flipped = filters[:, :, ::-1] if filter_flip else filters
+
+    r_conved = T.tensordot(inputs_stacked, filters_flipped,
+                           np.asarray([[2, 4], [1, 2]]))
+    # resulting shape is (n, b, w, n_filters)
+    # output needs to be (b, n_filters, w * n)
+    r_conved = r_conved.dimshuffle(1, 3, 2, 0)  # (b, n_filters, w, n)
+    conved = r_conved.reshape((r_conved.shape[0], r_conved.shape[1],
+                               r_conved.shape[2] * r_conved.shape[3]))
+    # result is (b, n_f, l)
+
+    # remove padding
+    return conved[:, :, :output_length]
+
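+# The output length used above is the usual strided "valid" formula; e.g.
+# for input_length 10, filter_length 6 and stride 2 (an assumed example),
+# output_length = (10 - 6 + 2) // 2 = 3.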
+
+def conv1d_md(input, filters, image_shape, filter_shape, border_mode='valid',
+              subsample=(1,), filter_flip=True):
+    """
+    using multiple dot products
+    """
+    if border_mode not in ('valid', 0, (0,)):
+        raise RuntimeError("Unsupported border_mode for conv1d_md: "
+                           "%s" % border_mode)
+
+    batch_size, num_input_channels, input_length = image_shape
+    num_filters, num_input_channels_, filter_length = filter_shape
+    stride = subsample[0]
+
+    if filter_length % stride > 0:
+        raise RuntimeError("Filter length (%d) is not a multiple of the "
+                           "stride (%d)" % (filter_length, stride))
+
+    num_steps = filter_length // stride
+    output_length = (input_length - filter_length + stride) // stride
+    output_shape = (batch_size, num_filters, output_length)
+
+    filters_flipped = filters[:, :, ::-1] if filter_flip else filters
+
+    conved = T.zeros(output_shape)
+
+    for num in range(num_steps):
+        shift = num * stride
+        length = (input_length - shift) // filter_length
+
+        if length == 0:
+            # we can safely skip this product, it doesn't contribute to the
+            # final convolution.
+            continue
+
+        r_input_shape = (batch_size, num_input_channels, length, filter_length)
+        r_input = input[
+            :, :, shift:length * filter_length + shift].reshape(r_input_shape)
+
+        # shape (b, l, n_filters)
+        r_conved = T.tensordot(r_input, filters_flipped,
+                               np.asarray([[1, 3], [1, 2]]))
+        r_conved = r_conved.dimshuffle(0, 2, 1)  # shape is (b, n_filters, l)
+        conved = T.set_subtensor(conved[:, :, num::num_steps], r_conved)
+
+    return conved
+
+
+# TODO: conv1d_md_channelslast?
+
+# 2D convolutions
+
+# TODO
diff --git a/lasagne/theano_extensions/padding.py b/lasagne/theano_extensions/padding.py
new file mode 100644
index 0000000..12c10ea
--- /dev/null
+++ b/lasagne/theano_extensions/padding.py
@@ -0,0 +1,53 @@
+"""
+Padding
+"""
+
+import theano.tensor as T
+
+
+def pad(x, width, val=0, batch_ndim=1):
+    """
+    Pad a tensor with a constant value.
+
+    Parameters
+    ----------
+    x : tensor
+
+    width : int, iterable of int, or iterable of tuple
+        Padding width. If an int, pads each axis symmetrically with the same
+        amount in the beginning and end. If an iterable of int, defines the
+        symmetric padding width separately for each axis. If an iterable of
+        tuples of two ints, defines a separate padding width for the beginning
+        and end of each axis.
+
+    val : float
+        The constant value used for padding
+
+    batch_ndim : integer
+        The first `batch_ndim` dimensions of `x` will not be padded.
+
+    """
+    input_shape = x.shape
+    input_ndim = x.ndim
+
+    output_shape = list(input_shape)
+    indices = [slice(None) for _ in output_shape]
+
+    if isinstance(width, int):
+        widths = [width] * (input_ndim - batch_ndim)
+    else:
+        widths = width
+
+    for k, w in enumerate(widths):
+        try:
+            l, r = w
+        except TypeError:
+            l = r = w
+        output_shape[k + batch_ndim] += l + r
+        indices[k + batch_ndim] = slice(l, l + input_shape[k + batch_ndim])
+
+    if val:
+        out = T.ones(output_shape) * val
+    else:
+        out = T.zeros(output_shape)
+    return T.set_subtensor(out[tuple(indices)], x)
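+
+
+# A minimal usage sketch (assuming x is a T.tensor4() fed an array of
+# shape (2, 3, 4, 5), with the default batch_ndim=1):
+#
+#     pad(x, 2)          # output shape (2, 7, 8, 9)
+#     pad(x, (1, 2, 3))  # output shape (2, 5, 8, 11)
+#     pad(x, [(1, 2), (0, 0), (3, 4)])  # output shape (2, 6, 4, 12)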
diff --git a/lasagne/updates.py b/lasagne/updates.py
new file mode 100644
index 0000000..61ee4c1
--- /dev/null
+++ b/lasagne/updates.py
@@ -0,0 +1,819 @@
+"""
+Functions to generate Theano update dictionaries for training.
+
+The update functions implement different methods to control the learning
+rate for use with stochastic gradient descent.
+
+Update functions take a loss expression or a list of gradient expressions and
+a list of parameters as input and return an ordered dictionary of updates:
+
+.. autosummary::
+    :nosignatures:
+
+    sgd
+    momentum
+    nesterov_momentum
+    adagrad
+    rmsprop
+    adadelta
+    adam
+    adamax
+
+Two functions can be used to further modify the updates to include momentum:
+
+.. autosummary::
+    :nosignatures:
+
+    apply_momentum
+    apply_nesterov_momentum
+
+Finally, we provide two helper functions to constrain the norm of tensors:
+
+.. autosummary::
+    :nosignatures:
+
+    norm_constraint
+    total_norm_constraint
+
+:func:`norm_constraint()` can be used to constrain the norm of parameters
+(as an alternative to weight decay), or for a form of gradient clipping.
+:func:`total_norm_constraint()` constrains the total norm of a list of
+tensors. This is often used when training recurrent neural networks.
+
+Examples
+--------
+>>> import lasagne
+>>> import theano.tensor as T
+>>> import theano
+>>> from lasagne.nonlinearities import softmax
+>>> from lasagne.layers import InputLayer, DenseLayer, get_output
+>>> from lasagne.updates import sgd, apply_momentum
+>>> l_in = InputLayer((100, 20))
+>>> l1 = DenseLayer(l_in, num_units=3, nonlinearity=softmax)
+>>> x = T.matrix('x')  # shp: num_batch x num_features
+>>> y = T.ivector('y') # shp: num_batch
+>>> l_out = get_output(l1, x)
+>>> params = lasagne.layers.get_all_params(l1)
+>>> loss = T.mean(T.nnet.categorical_crossentropy(l_out, y))
+>>> updates_sgd = sgd(loss, params, learning_rate=0.0001)
+>>> updates = apply_momentum(updates_sgd, params, momentum=0.9)
+>>> train_function = theano.function([x, y], updates=updates)
+"""
+
+from collections import OrderedDict
+
+import numpy as np
+
+import theano
+import theano.tensor as T
+from . import utils
+
+__all__ = [
+    "sgd",
+    "apply_momentum",
+    "momentum",
+    "apply_nesterov_momentum",
+    "nesterov_momentum",
+    "adagrad",
+    "rmsprop",
+    "adadelta",
+    "adam",
+    "adamax",
+    "norm_constraint",
+    "total_norm_constraint"
+]
+
+
+def get_or_compute_grads(loss_or_grads, params):
+    """Helper function returning a list of gradients
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to return the gradients for
+
+    Returns
+    -------
+    list of expressions
+        If `loss_or_grads` is a list, it is assumed to be a list of
+        gradients and returned as is, unless it does not match the length
+        of `params`, in which case a `ValueError` is raised.
+        Otherwise, `loss_or_grads` is assumed to be a cost expression and
+        the function returns `theano.grad(loss_or_grads, params)`.
+
+    Raises
+    ------
+    ValueError
+        If `loss_or_grads` is a list of a different length than `params`, or if
+        any element of `params` is not a shared variable (while we could still
+        compute its gradient, we can never update it and want to fail early).
+    """
+    if any(not isinstance(p, theano.compile.SharedVariable) for p in params):
+        raise ValueError("params must contain shared variables only. If it "
+                         "contains arbitrary parameter expressions, then "
+                         "lasagne.utils.collect_shared_vars() may help you.")
+    if isinstance(loss_or_grads, list):
+        if not len(loss_or_grads) == len(params):
+            raise ValueError("Got %d gradient expressions for %d parameters" %
+                             (len(loss_or_grads), len(params)))
+        return loss_or_grads
+    else:
+        return theano.grad(loss_or_grads, params)
+
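+# A minimal sketch of the two accepted call forms (assuming a shared
+# variable w and a scalar loss expression built from it):
+#
+#     grads = get_or_compute_grads(loss, [w])  # computed via theano.grad
+#     same = get_or_compute_grads(grads, [w])  # returned as is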
+
+def sgd(loss_or_grads, params, learning_rate):
+    """Stochastic Gradient Descent (SGD) updates
+
+    Generates update expressions of the form:
+
+    * ``param := param - learning_rate * gradient``
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+    """
+    grads = get_or_compute_grads(loss_or_grads, params)
+    updates = OrderedDict()
+
+    for param, grad in zip(params, grads):
+        updates[param] = param - learning_rate * grad
+
+    return updates
+
+
+def apply_momentum(updates, params=None, momentum=0.9):
+    """Returns a modified update dictionary including momentum
+
+    Generates update expressions of the form:
+
+    * ``velocity := momentum * velocity + updates[param] - param``
+    * ``param := param + velocity``
+
+    Parameters
+    ----------
+    updates : OrderedDict
+        A dictionary mapping parameters to update expressions
+    params : iterable of shared variables, optional
+        The variables to apply momentum to. If omitted, will apply
+        momentum to all `updates.keys()`.
+    momentum : float or symbolic scalar, optional
+        The amount of momentum to apply. Higher momentum results in
+        smoothing over more update steps. Defaults to 0.9.
+
+    Returns
+    -------
+    OrderedDict
+        A copy of `updates` with momentum updates for all `params`.
+
+    Notes
+    -----
+    Higher momentum also results in larger update steps. To counter that,
+    you can optionally scale your learning rate by `1 - momentum`.
+
+    See Also
+    --------
+    momentum : Shortcut applying momentum to SGD updates
+    """
+    if params is None:
+        params = updates.keys()
+    updates = OrderedDict(updates)
+
+    for param in params:
+        value = param.get_value(borrow=True)
+        velocity = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                                 broadcastable=param.broadcastable)
+        x = momentum * velocity + updates[param]
+        updates[velocity] = x - param
+        updates[param] = x
+
+    return updates
+
+
+def momentum(loss_or_grads, params, learning_rate, momentum=0.9):
+    """Stochastic Gradient Descent (SGD) updates with momentum
+
+    Generates update expressions of the form:
+
+    * ``velocity := momentum * velocity - learning_rate * gradient``
+    * ``param := param + velocity``
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+    momentum : float or symbolic scalar, optional
+        The amount of momentum to apply. Higher momentum results in
+        smoothing over more update steps. Defaults to 0.9.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    Higher momentum also results in larger update steps. To counter that,
+    you can optionally scale your learning rate by `1 - momentum`.
+
+    See Also
+    --------
+    apply_momentum : Generic function applying momentum to updates
+    nesterov_momentum : Nesterov's variant of SGD with momentum
+    """
+    updates = sgd(loss_or_grads, params, learning_rate)
+    return apply_momentum(updates, momentum=momentum)
+
+
+def apply_nesterov_momentum(updates, params=None, momentum=0.9):
+    """Returns a modified update dictionary including Nesterov momentum
+
+    Generates update expressions of the form:
+
+    * ``velocity := momentum * velocity + updates[param] - param``
+    * ``param := param + momentum * velocity + updates[param] - param``
+
+    Parameters
+    ----------
+    updates : OrderedDict
+        A dictionary mapping parameters to update expressions
+    params : iterable of shared variables, optional
+        The variables to apply momentum to. If omitted, will apply
+        momentum to all `updates.keys()`.
+    momentum : float or symbolic scalar, optional
+        The amount of momentum to apply. Higher momentum results in
+        smoothing over more update steps. Defaults to 0.9.
+
+    Returns
+    -------
+    OrderedDict
+        A copy of `updates` with momentum updates for all `params`.
+
+    Notes
+    -----
+    Higher momentum also results in larger update steps. To counter that,
+    you can optionally scale your learning rate by `1 - momentum`.
+
+    The classic formulation of Nesterov momentum (or Nesterov accelerated
+    gradient) requires the gradient to be evaluated at the predicted next
+    position in parameter space. Here, we use the formulation described at
+    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617,
+    which allows the gradient to be evaluated at the current parameters.
+
+    See Also
+    --------
+    nesterov_momentum : Shortcut applying Nesterov momentum to SGD updates
+    """
+    if params is None:
+        params = updates.keys()
+    updates = OrderedDict(updates)
+
+    for param in params:
+        value = param.get_value(borrow=True)
+        velocity = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                                 broadcastable=param.broadcastable)
+        x = momentum * velocity + updates[param] - param
+        updates[velocity] = x
+        updates[param] = momentum * x + updates[param]
+
+    return updates
+
+
+def nesterov_momentum(loss_or_grads, params, learning_rate, momentum=0.9):
+    """Stochastic Gradient Descent (SGD) updates with Nesterov momentum
+
+    Generates update expressions of the form:
+
+    * ``velocity := momentum * velocity - learning_rate * gradient``
+    * ``param := param + momentum * velocity - learning_rate * gradient``
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+    momentum : float or symbolic scalar, optional
+        The amount of momentum to apply. Higher momentum results in
+        smoothing over more update steps. Defaults to 0.9.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    Higher momentum also results in larger update steps. To counter that,
+    you can optionally scale your learning rate by `1 - momentum`.
+
+    The classic formulation of Nesterov momentum (or Nesterov accelerated
+    gradient) requires the gradient to be evaluated at the predicted next
+    position in parameter space. Here, we use the formulation described at
+    https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617,
+    which allows the gradient to be evaluated at the current parameters.
+
+    See Also
+    --------
+    apply_nesterov_momentum : Function applying momentum to updates
+    """
+    updates = sgd(loss_or_grads, params, learning_rate)
+    return apply_nesterov_momentum(updates, momentum=momentum)
+
+
+def adagrad(loss_or_grads, params, learning_rate=1.0, epsilon=1e-6):
+    """Adagrad updates
+
+    Scale learning rates by dividing with the square root of accumulated
+    squared gradients. See [1]_ for further description.
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+    epsilon : float or symbolic scalar
+        Small value added for numerical stability
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    Using step size eta Adagrad calculates the learning rate for feature i at
+    time step t as:
+
+    .. math:: \\eta_{t,i} = \\frac{\\eta}
+       {\\sqrt{\\sum^t_{t^\\prime} g^2_{t^\\prime,i}+\\epsilon}} g_{t,i}
+
+    As such, the learning rate is monotonically decreasing.
+
+    Epsilon is not included in the typical formula, see [2]_.
+
+    References
+    ----------
+    .. [1] Duchi, J., Hazan, E., & Singer, Y. (2011):
+           Adaptive subgradient methods for online learning and stochastic
+           optimization. JMLR, 12:2121-2159.
+
+    .. [2] Chris Dyer:
+           Notes on AdaGrad. http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf
+    """
+
+    grads = get_or_compute_grads(loss_or_grads, params)
+    updates = OrderedDict()
+
+    for param, grad in zip(params, grads):
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+        accu_new = accu + grad ** 2
+        updates[accu] = accu_new
+        updates[param] = param - (learning_rate * grad /
+                                  T.sqrt(accu_new + epsilon))
+
+    return updates
+
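+# A rough sanity check of the monotone decay noted above (assuming a
+# constant gradient g and learning_rate=1.0): after t steps the
+# accumulator holds t * g ** 2, so the step is g / sqrt(t * g ** 2 + eps),
+# i.e. roughly 1 / sqrt(t) in magnitude regardless of the scale of g.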
+
+def rmsprop(loss_or_grads, params, learning_rate=1.0, rho=0.9, epsilon=1e-6):
+    """RMSProp updates
+
+    Scale learning rates by dividing with the moving average of the root mean
+    squared (RMS) gradients. See [1]_ for further description.
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+    rho : float or symbolic scalar
+        Gradient moving average decay factor
+    epsilon : float or symbolic scalar
+        Small value added for numerical stability
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    `rho` should be between 0 and 1. A value of `rho` close to 1 will decay the
+    moving average slowly and a value close to 0 will decay the moving average
+    fast.
+
+    Using the step size :math:`\\eta` and a decay factor :math:`\\rho` the
+    learning rate :math:`\\eta_t` is calculated as:
+
+    .. math::
+       r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\
+       \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}}
+
+    References
+    ----------
+    .. [1] Tieleman, T. and Hinton, G. (2012):
+           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
+           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
+    """
+    grads = get_or_compute_grads(loss_or_grads, params)
+    updates = OrderedDict()
+
+    # Using theano constant to prevent upcasting of float32
+    one = T.constant(1)
+
+    for param, grad in zip(params, grads):
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+        accu_new = rho * accu + (one - rho) * grad ** 2
+        updates[accu] = accu_new
+        updates[param] = param - (learning_rate * grad /
+                                  T.sqrt(accu_new + epsilon))
+
+    return updates
+
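+# A rough steady-state sketch for the rule above (assuming a constant
+# gradient g): the accumulator converges to g ** 2, so the step approaches
+# learning_rate * g / sqrt(g ** 2 + eps), i.e. roughly learning_rate in
+# magnitude whatever the scale of g.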
+
+def adadelta(loss_or_grads, params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
+    """ Adadelta updates
+
+    Scale learning rates by the ratio of accumulated gradients to accumulated
+    updates, see [1]_ and notes for further description.
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float or symbolic scalar
+        The learning rate controlling the size of update steps
+    rho : float or symbolic scalar
+        Squared gradient moving average decay factor
+    epsilon : float or symbolic scalar
+        Small value added for numerical stability
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    rho should be between 0 and 1. A value of rho close to 1 will decay the
+    moving average slowly and a value close to 0 will decay the moving average
+    fast.
+
+    rho = 0.95 and epsilon=1e-6 are suggested in the paper and reported to
+    work for multiple datasets (MNIST, speech).
+
+    In the paper, no learning rate is considered (so learning_rate=1.0).
+    Probably best to keep it at this value.
+    epsilon is important for the very first update (so the numerator does
+    not become 0).
+
+    Using the step size eta and a decay factor rho the learning rate is
+    calculated as:
+
+    .. math::
+       r_t &= \\rho r_{t-1} + (1-\\rho)*g^2\\\\
+       \\eta_t &= \\eta \\frac{\\sqrt{s_{t-1} + \\epsilon}}
+                             {\\sqrt{r_t + \\epsilon}}\\\\
+       s_t &= \\rho s_{t-1} + (1-\\rho)*(\\eta_t*g)^2
+
+    References
+    ----------
+    .. [1] Zeiler, M. D. (2012):
+           ADADELTA: An Adaptive Learning Rate Method.
+           arXiv Preprint arXiv:1212.5701.
+    """
+    grads = get_or_compute_grads(loss_or_grads, params)
+    updates = OrderedDict()
+
+    # Using theano constant to prevent upcasting of float32
+    one = T.constant(1)
+
+    for param, grad in zip(params, grads):
+        value = param.get_value(borrow=True)
+        # accu: accumulate gradient magnitudes
+        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+        # delta_accu: accumulate update magnitudes (recursively!)
+        delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                                   broadcastable=param.broadcastable)
+
+        # update accu (as in rmsprop)
+        accu_new = rho * accu + (one - rho) * grad ** 2
+        updates[accu] = accu_new
+
+        # compute parameter update, using the 'old' delta_accu
+        update = (grad * T.sqrt(delta_accu + epsilon) /
+                  T.sqrt(accu_new + epsilon))
+        updates[param] = param - learning_rate * update
+
+        # update delta_accu (as accu, but accumulating updates)
+        delta_accu_new = rho * delta_accu + (one - rho) * update ** 2
+        updates[delta_accu] = delta_accu_new
+
+    return updates
+
+
+def adam(loss_or_grads, params, learning_rate=0.001, beta1=0.9,
+         beta2=0.999, epsilon=1e-8):
+    """Adam updates
+
+    Adam updates implemented as in [1]_.
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float
+        Learning rate
+    beta1 : float
+        Exponential decay rate for the first moment estimates.
+    beta2 : float
+        Exponential decay rate for the second moment estimates.
+    epsilon : float
+        Constant for numerical stability.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    Notes
+    -----
+    The paper [1]_ includes an additional hyperparameter lambda. It is only
+    needed to prove convergence of the algorithm and has no practical use
+    (personal communication with the authors); it is therefore omitted here.
+
+    References
+    ----------
+    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
+           Adam: A Method for Stochastic Optimization.
+           arXiv preprint arXiv:1412.6980.
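+
+    Examples
+    --------
+    A minimal sketch of building a training function (layer sizes and
+    variable names are arbitrary):
+
+    >>> import lasagne
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> x = T.matrix('x')
+    >>> y = T.ivector('y')
+    >>> l_in = lasagne.layers.InputLayer((None, 20), input_var=x)
+    >>> l_out = lasagne.layers.DenseLayer(
+    ...     l_in, num_units=4, nonlinearity=T.nnet.softmax)
+    >>> output = lasagne.layers.get_output(l_out)
+    >>> loss = T.mean(T.nnet.categorical_crossentropy(output, y))
+    >>> params = lasagne.layers.get_all_params(l_out)
+    >>> updates = adam(loss, params, learning_rate=0.001)
+    >>> train_fn = theano.function([x, y], loss, updates=updates)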
+    """
+    all_grads = get_or_compute_grads(loss_or_grads, params)
+    t_prev = theano.shared(utils.floatX(0.))
+    updates = OrderedDict()
+
+    # Using theano constant to prevent upcasting of float32
+    one = T.constant(1)
+
+    t = t_prev + 1
+    a_t = learning_rate*T.sqrt(one-beta2**t)/(one-beta1**t)
+
+    for param, g_t in zip(params, all_grads):
+        value = param.get_value(borrow=True)
+        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                               broadcastable=param.broadcastable)
+        v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                               broadcastable=param.broadcastable)
+
+        m_t = beta1*m_prev + (one-beta1)*g_t
+        v_t = beta2*v_prev + (one-beta2)*g_t**2
+        step = a_t*m_t/(T.sqrt(v_t) + epsilon)
+
+        updates[m_prev] = m_t
+        updates[v_prev] = v_t
+        updates[param] = param - step
+
+    updates[t_prev] = t
+    return updates
+
+
+def adamax(loss_or_grads, params, learning_rate=0.002, beta1=0.9,
+           beta2=0.999, epsilon=1e-8):
+    """Adamax updates
+
+    Adamax updates implemented as in [1]_. This is a variant of the Adam
+    algorithm based on the infinity norm.
+
+    Parameters
+    ----------
+    loss_or_grads : symbolic expression or list of expressions
+        A scalar loss expression, or a list of gradient expressions
+    params : list of shared variables
+        The variables to generate update expressions for
+    learning_rate : float
+        Learning rate
+    beta1 : float
+        Exponential decay rate for the first moment estimates.
+    beta2 : float
+        Exponential decay rate for the weighted infinity norm estimates.
+    epsilon : float
+        Constant for numerical stability.
+
+    Returns
+    -------
+    OrderedDict
+        A dictionary mapping each parameter to its update expression
+
+    References
+    ----------
+    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
+           Adam: A Method for Stochastic Optimization.
+           arXiv preprint arXiv:1412.6980.
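+
+    Examples
+    --------
+    Usage is identical to :func:`adam`; a minimal sketch on a single
+    shared variable (the loss is just a placeholder):
+
+    >>> import numpy as np
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> w = theano.shared(np.ones(4, dtype=theano.config.floatX))
+    >>> loss = T.sum(w ** 2)
+    >>> updates = adamax(loss, [w])
+    >>> train_fn = theano.function([], loss, updates=updates)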
+    """
+    all_grads = get_or_compute_grads(loss_or_grads, params)
+    t_prev = theano.shared(utils.floatX(0.))
+    updates = OrderedDict()
+
+    # Using theano constant to prevent upcasting of float32
+    one = T.constant(1)
+
+    t = t_prev + 1
+    a_t = learning_rate/(one-beta1**t)
+
+    for param, g_t in zip(params, all_grads):
+        value = param.get_value(borrow=True)
+        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                               broadcastable=param.broadcastable)
+        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                               broadcastable=param.broadcastable)
+
+        m_t = beta1*m_prev + (one-beta1)*g_t
+        u_t = T.maximum(beta2*u_prev, abs(g_t))
+        step = a_t*m_t/(u_t + epsilon)
+
+        updates[m_prev] = m_t
+        updates[u_prev] = u_t
+        updates[param] = param - step
+
+    updates[t_prev] = t
+    return updates
+
+
+def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
+    """Max weight norm constraints and gradient clipping
+
+    This takes a TensorVariable and rescales it so that incoming weight
+    norms are below a specified constraint value. Vectors violating the
+    constraint are rescaled so that they are within the allowed range.
+
+    Parameters
+    ----------
+    tensor_var : TensorVariable
+        Theano expression for update, gradient, or other quantity.
+    max_norm : scalar
+        This value sets the maximum allowed value of any norm in
+        `tensor_var`.
+    norm_axes : sequence (list or tuple)
+        The axes over which to compute the norm.  This overrides the
+        default norm axes defined for the number of dimensions
+        in `tensor_var`. When this is not specified and `tensor_var` is a
+        matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
+        5D tensor, it is set to a tuple listing all axes but axis 0. The
+        former default is useful for working with dense layers, the latter
+        is useful for 1D, 2D and 3D convolutional layers.
+        (Optional)
+    epsilon : scalar, optional
+        Value used to prevent numerical instability when dividing by
+        very small or zero norms.
+
+    Returns
+    -------
+    TensorVariable
+        Input `tensor_var` with rescaling applied to weight vectors
+        that violate the specified constraints.
+
+    Examples
+    --------
+    >>> param = theano.shared(
+    ...     np.random.randn(100, 200).astype(theano.config.floatX))
+    >>> update = param + 100
+    >>> update = norm_constraint(update, 10)
+    >>> func = theano.function([], [], updates=[(param, update)])
+    >>> # Apply constrained update
+    >>> _ = func()
+    >>> from lasagne.utils import compute_norms
+    >>> norms = compute_norms(param.get_value())
+    >>> np.isclose(np.max(norms), 10)
+    True
+
+    Notes
+    -----
+    When `norm_axes` is not specified, the axes over which the norm is
+    computed depend on the dimensionality of the input variable. If it is
+    2D, it is assumed to come from a dense layer, and the norm is computed
+    over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
+    convolutional layer and the norm is computed over all trailing axes
+    beyond axis 0. For other uses, you should explicitly specify the axes
+    over which to compute the norm using `norm_axes`.
+    """
+    ndim = tensor_var.ndim
+
+    if norm_axes is not None:
+        sum_over = tuple(norm_axes)
+    elif ndim == 2:  # DenseLayer
+        sum_over = (0,)
+    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
+        sum_over = tuple(range(1, ndim))
+    else:
+        raise ValueError(
+            "Unsupported tensor dimensionality {}. "
+            "Must specify `norm_axes`".format(ndim)
+        )
+
+    dtype = np.dtype(theano.config.floatX).type
+    norms = T.sqrt(T.sum(T.sqr(tensor_var), axis=sum_over, keepdims=True))
+    target_norms = T.clip(norms, 0, dtype(max_norm))
+    constrained_output = \
+        (tensor_var * (target_norms / (dtype(epsilon) + norms)))
+
+    return constrained_output
+
+
+def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7,
+                          return_norm=False):
+    """Rescales a list of tensors based on their combined norm
+
+    If the combined norm of the input tensors exceeds the threshold then all
+    tensors are rescaled such that the combined norm is equal to the threshold.
+
+    Scaling the norms of the gradients is often used when training recurrent
+    neural networks [1]_.
+
+    Parameters
+    ----------
+    tensor_vars : List of TensorVariables.
+        Tensors to be rescaled.
+    max_norm : float
+        Threshold value for total norm.
+    epsilon : scalar, optional
+        Value used to prevent numerical instability when dividing by
+        very small or zero norms.
+    return_norm : bool
+        If True, the total norm is also returned.
+
+    Returns
+    -------
+    tensor_vars_scaled : list of TensorVariables
+        The scaled tensor variables.
+    norm : Theano scalar
+        The combined norm of the input variables prior to rescaling;
+        only returned if ``return_norm=True``.
+
+    Examples
+    --------
+    >>> from lasagne.layers import InputLayer, DenseLayer
+    >>> import lasagne
+    >>> from lasagne.updates import sgd, total_norm_constraint
+    >>> x = T.matrix()
+    >>> y = T.ivector()
+    >>> l_in = InputLayer((5, 10))
+    >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
+    >>> output = lasagne.layers.get_output(l1, x)
+    >>> cost = T.mean(T.nnet.categorical_crossentropy(output, y))
+    >>> all_params = lasagne.layers.get_all_params(l1)
+    >>> all_grads = T.grad(cost, all_params)
+    >>> scaled_grads = total_norm_constraint(all_grads, 5)
+    >>> updates = sgd(scaled_grads, all_params, learning_rate=0.1)
+
+    Notes
+    -----
+    The total norm can be used to monitor training.
+
+    References
+    ----------
+    .. [1] Sutskever, I., Vinyals, O., & Le, Q. V. (2014): Sequence to sequence
+       learning with neural networks. In Advances in Neural Information
+       Processing Systems (pp. 3104-3112).
+    """
+    norm = T.sqrt(sum(T.sum(tensor**2) for tensor in tensor_vars))
+    dtype = np.dtype(theano.config.floatX).type
+    target_norm = T.clip(norm, 0, dtype(max_norm))
+    multiplier = target_norm / (dtype(epsilon) + norm)
+    tensor_vars_scaled = [step*multiplier for step in tensor_vars]
+
+    if return_norm:
+        return tensor_vars_scaled, norm
+    else:
+        return tensor_vars_scaled
diff --git a/lasagne/utils.py b/lasagne/utils.py
new file mode 100644
index 0000000..ad22f88
--- /dev/null
+++ b/lasagne/utils.py
@@ -0,0 +1,450 @@
+import numpy as np
+
+import theano
+import theano.tensor as T
+
+
+def floatX(arr):
+    """Converts data to a numpy array of dtype ``theano.config.floatX``.
+
+    Parameters
+    ----------
+    arr : array_like
+        The data to be converted.
+
+    Returns
+    -------
+    numpy ndarray
+        The input array in the ``floatX`` dtype configured for Theano.
+        If `arr` is an ndarray of correct dtype, it is returned as is.
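+
+    Examples
+    --------
+    For instance, a list of Python ints is converted to a ``floatX`` array:
+
+    >>> import theano
+    >>> floatX([1, 2, 3]).dtype == theano.config.floatX
+    True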
+    """
+    return np.asarray(arr, dtype=theano.config.floatX)
+
+
+def shared_empty(dim=2, dtype=None):
+    """Creates empty Theano shared variable.
+
+    Shortcut to create an empty Theano shared variable with
+    the specified number of dimensions.
+
+    Parameters
+    ----------
+    dim : int, optional
+        The number of dimensions for the empty variable, defaults to 2.
+    dtype : a numpy data-type, optional
+        The desired dtype for the variable. Defaults to the Theano
+        ``floatX`` dtype.
+
+    Returns
+    -------
+    Theano shared variable
+        An empty Theano shared variable of dtype ``dtype`` with
+        `dim` dimensions.
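+
+    Examples
+    --------
+    A small sketch of the default behaviour (each dimension starts out
+    with length 1):
+
+    >>> v = shared_empty(dim=3)
+    >>> v.get_value().shape
+    (1, 1, 1)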
+    """
+    if dtype is None:
+        dtype = theano.config.floatX
+
+    shp = tuple([1] * dim)
+    return theano.shared(np.zeros(shp, dtype=dtype))
+
+
+def as_theano_expression(input):
+    """Wrap as Theano expression.
+
+    Wraps the given input as a Theano constant if it is not
+    a valid Theano expression already. Useful to transparently
+    handle numpy arrays and Python scalars, for example.
+
+    Parameters
+    ----------
+    input : number, numpy array or Theano expression
+        Expression to be converted to a Theano constant.
+
+    Returns
+    -------
+    Theano symbolic constant
+        Theano constant version of `input`.
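+
+    Examples
+    --------
+    A small sketch: plain numbers are wrapped as constants, while
+    existing Theano variables are passed through unchanged:
+
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> isinstance(as_theano_expression(42), theano.Variable)
+    True
+    >>> x = T.scalar('x')
+    >>> as_theano_expression(x) is x
+    True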
+    """
+    if isinstance(input, theano.gof.Variable):
+        return input
+    else:
+        try:
+            return theano.tensor.constant(input)
+        except Exception as e:
+            raise TypeError("Input of type %s is not a Theano expression and "
+                            "cannot be wrapped as a Theano constant (original "
+                            "exception: %s)" % (type(input), e))
+
+
+def collect_shared_vars(expressions):
+    """Returns all shared variables the given expression(s) depend on.
+
+    Parameters
+    ----------
+    expressions : Theano expression or iterable of Theano expressions
+        The expressions to collect shared variables from.
+
+    Returns
+    -------
+    list of Theano shared variables
+        All shared variables the given expression(s) depend on, in fixed order
+        (as found by a left-recursive depth-first search). If some expressions
+        are shared variables themselves, they are included in the result.
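+
+    Examples
+    --------
+    A small sketch (the expression below is arbitrary):
+
+    >>> import numpy as np
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> a = theano.shared(np.zeros(3), name='a')
+    >>> x = T.vector('x')
+    >>> found = collect_shared_vars(2 * a + x)
+    >>> len(found) == 1 and found[0] is a
+    True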
+    """
+    # wrap single expression in list
+    if isinstance(expressions, theano.Variable):
+        expressions = [expressions]
+    # return list of all shared variables
+    return [v for v in theano.gof.graph.inputs(reversed(expressions))
+            if isinstance(v, theano.compile.SharedVariable)]
+
+
+def one_hot(x, m=None):
+    """One-hot representation of integer vector.
+
+    Given a vector of integers from 0 to m-1, returns a matrix
+    with a one-hot representation, where each row corresponds
+    to an element of x.
+
+    Parameters
+    ----------
+    x : integer vector
+        The integer vector to convert to a one-hot representation.
+    m : int, optional
+        The number of different columns for the one-hot representation. This
+        needs to be strictly greater than the maximum value of `x`.
+        Defaults to ``max(x) + 1``.
+
+    Returns
+    -------
+    Theano tensor variable
+        A Theano tensor variable of shape (``n``, `m`), where ``n`` is the
+        length of `x`, with the one-hot representation of `x`.
+
+    Notes
+    -----
+    If your integer vector represents target class memberships, and you wish to
+    compute the cross-entropy between predictions and the target class
+    memberships, then there is no need to use this function, since the function
+    :func:`lasagne.objectives.categorical_crossentropy()` can compute the
+    cross-entropy from the integer vector directly.
+
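+    Examples
+    --------
+    A small sketch (the class indices below are arbitrary):
+
+    >>> import numpy as np
+    >>> import theano
+    >>> import theano.tensor as T
+    >>> x = T.ivector('x')
+    >>> f = theano.function([x], one_hot(x, 4))
+    >>> np.allclose(f(np.array([1, 3], dtype='int32')), np.eye(4)[[1, 3]])
+    True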
+    """
+    if m is None:
+        m = T.cast(T.max(x) + 1, 'int32')
+
+    return T.eye(m)[T.cast(x, 'int32')]
+
+
+def unique(l):
+    """Filters duplicates from an iterable.
+
+    Creates a new list from `l` with duplicate entries removed,
+    while preserving the original order.
+
+    Parameters
+    ----------
+    l : iterable
+        Input iterable to filter duplicates from.
+
+    Returns
+    -------
+    list
+        A list of elements of `l` without duplicates and in the same order.
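+
+    Examples
+    --------
+    For example, duplicates are dropped and first occurrences keep their
+    order:
+
+    >>> unique([1, 2, 1, 3, 2])
+    [1, 2, 3]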
+    """
+    new_list = []
+    seen = set()
+    for el in l:
+        if el not in seen:
+            new_list.append(el)
+            seen.add(el)
+
+    return new_list
+
+
+def as_tuple(x, N, t=None):
+    """
+    Coerce a value to a tuple of given length (and possibly given type).
+
+    Parameters
+    ----------
+    x : value or iterable
+    N : integer
+        length of the desired tuple
+    t : type, optional
+        required type for all elements
+
+    Returns
+    -------
+    tuple
+        ``tuple(x)`` if `x` is iterable, ``(x,) * N`` otherwise.
+
+    Raises
+    ------
+    TypeError
+        if `t` is given and `x` or any of its elements do not match it
+    ValueError
+        if `x` is iterable, but does not have exactly `N` elements
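+
+    Examples
+    --------
+    For example, a single value is repeated, while an iterable is passed
+    through (with an optional type check):
+
+    >>> as_tuple(3, 2)
+    (3, 3)
+    >>> as_tuple((1, 2), 2, int)
+    (1, 2)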
+    """
+    try:
+        X = tuple(x)
+    except TypeError:
+        X = (x,) * N
+
+    if (t is not None) and not all(isinstance(v, t) for v in X):
+        raise TypeError("expected a single value or an iterable "
+                        "of {0}, got {1} instead".format(t.__name__, x))
+
+    if len(X) != N:
+        raise ValueError("expected a single value or an iterable "
+                         "with length {0}, got {1} instead".format(N, x))
+
+    return X
+
+
+def compute_norms(array, norm_axes=None):
+    """Compute incoming weight vector norms.
+
+    Parameters
+    ----------
+    array : numpy array or Theano expression
+        Weight or bias.
+    norm_axes : sequence (list or tuple)
+        The axes over which to compute the norm.  This overrides the
+        default norm axes defined for the number of dimensions
+        in `array`. When this is not specified and `array` is a 2D array,
+        this is set to `(0,)`. If `array` is a 3D, 4D or 5D array, it is
+        set to a tuple listing all axes but axis 0. The former default is
+        useful for working with dense layers, the latter is useful for 1D,
+        2D and 3D convolutional layers.
+        Finally, in case `array` is a vector, `norm_axes` is set to an empty
+        tuple, and this function will simply return the absolute value for
+        each element. This is useful when the function is applied to all
+        parameters of the network, including the bias, without distinction.
+        (Optional)
+
+    Returns
+    -------
+    norms : 1D array or Theano vector (1D)
+        1D array or Theano vector of incoming weight/bias vector norms.
+
+    Examples
+    --------
+    >>> array = np.random.randn(100, 200)
+    >>> norms = compute_norms(array)
+    >>> norms.shape
+    (200,)
+
+    >>> norms = compute_norms(array, norm_axes=(1,))
+    >>> norms.shape
+    (100,)
+    """
+
+    # Check if supported type
+    if not isinstance(array, theano.Variable) and \
+       not isinstance(array, np.ndarray):
+        raise RuntimeError(
+            "Unsupported type {}. "
+            "Only theano variables and numpy arrays "
+            "are supported".format(type(array))
+        )
+
+    # Compute default axes to sum over
+    ndim = array.ndim
+    if norm_axes is not None:
+        sum_over = tuple(norm_axes)
+    elif ndim == 1:          # For Biases that are in 1d (e.g. b of DenseLayer)
+        sum_over = ()
+    elif ndim == 2:          # DenseLayer
+        sum_over = (0,)
+    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
+        sum_over = tuple(range(1, ndim))
+    else:
+        raise ValueError(
+            "Unsupported tensor dimensionality {}. "
+            "Must specify `norm_axes`".format(array.ndim)
+        )
+
+    # Run numpy or Theano norm computation
+    if isinstance(array, theano.Variable):
+        # Apply theano version if it is a theano variable
+        if len(sum_over) == 0:
+            norms = T.abs_(array)   # abs if we have nothing to sum over
+        else:
+            norms = T.sqrt(T.sum(array**2, axis=sum_over))
+    elif isinstance(array, np.ndarray):
+        # Apply the numpy version if ndarray
+        if len(sum_over) == 0:
+            norms = abs(array)     # abs if we have nothing to sum over
+        else:
+            norms = np.sqrt(np.sum(array**2, axis=sum_over))
+
+    return norms
+
+
+def create_param(spec, shape, name=None):
+    """
+    Helper method to create Theano shared variables for layer parameters
+    and to initialize them.
+
+    Parameters
+    ----------
+    spec : scalar number, numpy array, Theano expression, or callable
+        Either of the following:
+
+        * a scalar or a numpy array with the initial parameter values
+        * a Theano expression or shared variable representing the parameters
+        * a function or callable that takes the desired shape of
+          the parameter array as its single argument and returns
+          a numpy array, a Theano expression, or a shared variable
+          representing the parameters.
+
+    shape : iterable of int
+        a tuple or other iterable of integers representing the desired
+        shape of the parameter array.
+
+    name : string, optional
+        The name to give to the parameter variable. Ignored if `spec`
+        is or returns a Theano expression or shared variable that
+        already has a name.
+
+
+    Returns
+    -------
+    Theano shared variable or Theano expression
+        A Theano shared variable or expression representing layer parameters.
+        If a scalar or a numpy array was provided, a shared variable is
+        initialized to contain this array. If a shared variable or expression
+        was provided, it is simply returned. If a callable was provided, it is
+        called, and its output is used to initialize a shared variable.
+
+    Notes
+    -----
+    This function is called by :meth:`Layer.add_param()` in the constructor
+    of most :class:`Layer` subclasses. This enables those layers to
+    support initialization with scalars, numpy arrays, existing Theano shared
+    variables or expressions, and callables for generating initial parameter
+    values, Theano expressions, or shared variables.
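+
+    Examples
+    --------
+    A small sketch of the array and callable specifications (shapes and
+    names below are arbitrary):
+
+    >>> import numpy as np
+    >>> W = create_param(np.zeros((3, 4)), (3, 4), name='W')
+    >>> W.get_value().shape
+    (3, 4)
+    >>> b = create_param(lambda shape: np.ones(shape), (4,), name='b')
+    >>> b.get_value().shape
+    (4,)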
+    """
+    import numbers  # to check if argument is a number
+    shape = tuple(shape)  # convert to tuple if needed
+    if any(d <= 0 for d in shape):
+        raise ValueError((
+            "Cannot create param with a non-positive shape dimension. "
+            "Tried to create param with shape=%r, name=%r") % (shape, name))
+
+    err_prefix = "cannot initialize parameter %s: " % name
+    if callable(spec):
+        spec = spec(shape)
+        err_prefix += "the %s returned by the provided callable"
+    else:
+        err_prefix += "the provided %s"
+
+    if isinstance(spec, numbers.Number) or isinstance(spec, np.generic) \
+            and spec.dtype.kind in 'biufc':
+        spec = np.asarray(spec)
+
+    if isinstance(spec, np.ndarray):
+        if spec.shape != shape:
+            raise ValueError("%s has shape %s, should be %s" %
+                             (err_prefix % "numpy array", spec.shape, shape))
+        # We assume parameter variables do not change shape after creation.
+        # We can thus fix their broadcast pattern, to allow Theano to infer
+        # broadcastable dimensions of expressions involving these parameters.
+        bcast = tuple(s == 1 for s in shape)
+        spec = theano.shared(spec, broadcastable=bcast)
+
+    if isinstance(spec, theano.Variable):
+        # We cannot check the shape here, Theano expressions (even shared
+        # variables) do not have a fixed compile-time shape. We can check the
+        # dimensionality though.
+        if spec.ndim != len(shape):
+            raise ValueError("%s has %d dimensions, should be %d" %
+                             (err_prefix % "Theano variable", spec.ndim,
+                              len(shape)))
+        # We only assign a name if the user hasn't done so already.
+        if not spec.name:
+            spec.name = name
+        return spec
+
+    else:
+        if "callable" in err_prefix:
+            raise TypeError("%s is not a numpy array or a Theano expression" %
+                            (err_prefix % "value"))
+        else:
+            raise TypeError("%s is not a numpy array, a Theano expression, "
+                            "or a callable" % (err_prefix % "spec"))
+
+
+def unroll_scan(fn, sequences, outputs_info, non_sequences, n_steps,
+                go_backwards=False):
+        """
+        Helper function to unroll for-loops. Can be used to unroll
+        theano.scan. The parameter names are identical to those of
+        theano.scan; refer to its documentation for more information.
+
+        Note that this function does not support the truncate_gradient
+        setting from theano.scan.
+
+        Parameters
+        ----------
+
+        fn : function
+            Function that defines calculations at each step.
+
+        sequences : TensorVariable or list of TensorVariables
+            List of TensorVariable with sequence data. The function iterates
+            over the first dimension of each TensorVariable.
+
+        outputs_info : list of TensorVariables
+            List of tensors specifying the initial values for each recurrent
+            value.
+
+        non_sequences : list of TensorVariables
+            List of theano.shared variables that are used in the step function.
+
+        n_steps : int
+            Number of steps to unroll.
+
+        go_backwards : bool
+            If True, the recursion starts at sequences[-1] and iterates
+            backwards.
+
+        Returns
+        -------
+        List of TensorVariables. Each element in the list gives the recurrent
+        values at each time step.
+
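+        Examples
+        --------
+        A small sketch computing a running sum over four steps (shapes
+        and names below are arbitrary):
+
+        >>> import theano
+        >>> import theano.tensor as T
+        >>> x = T.matrix('x')
+        >>> def step(x_t, s_tm1):
+        ...     return s_tm1 + x_t
+        >>> out = unroll_scan(step, x, [T.zeros((3,))], [], n_steps=4)
+        >>> out[0].ndim
+        2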
+        """
+        if not isinstance(sequences, (list, tuple)):
+            sequences = [sequences]
+
+        # When backwards reverse the recursion direction
+        counter = range(n_steps)
+        if go_backwards:
+            counter = counter[::-1]
+
+        output = []
+        prev_vals = outputs_info
+        for i in counter:
+            step_input = [s[i] for s in sequences] + prev_vals + non_sequences
+            out_ = fn(*step_input)
+            # The returned values from step can be either a TensorVariable,
+            # a list, or a tuple.  Below, we force it to always be a list.
+            if isinstance(out_, T.TensorVariable):
+                out_ = [out_]
+            if isinstance(out_, tuple):
+                out_ = list(out_)
+            output.append(out_)
+
+            prev_vals = output[-1]
+
+        # iterate over each scan output and convert it to same format as scan:
+        # [[output11, output12,...output1n],
+        # [output21, output22,...output2n],...]
+        output_scan = []
+        for i in range(len(output[0])):
+            l = map(lambda x: x[i], output)
+            output_scan.append(T.stack(*l))
+
+        return output_scan
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..a78e8f2
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,10 @@
+-r requirements.txt
+mock
+numpydoc
+pep8==1.6.2
+pytest
+pytest-cov
+pytest-pep8
+Jinja2==2.7.3
+Sphinx==1.2.3
+sphinx_rtd_theme
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0132aab
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+Theano==0.8.0
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..50b61b1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,10 @@
+[aliases]
+dev = develop easy_install lasagne[testing]
+
+[pytest]
+addopts =
+    -v --doctest-modules
+    --cov=lasagne --cov-report=term-missing
+    --pep8
+    lasagne/
+python_files = test*py
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..251a135
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,67 @@
+import os
+import re
+from setuptools import find_packages
+from setuptools import setup
+# We need io.open() (Python 3's default open) to specify file encodings
+import io
+
+here = os.path.abspath(os.path.dirname(__file__))
+try:
+    # obtain version string from __init__.py
+    # Read ASCII file with builtin open() so __version__ is str
+    # in Python 2 and 3
+    with open(os.path.join(here, 'lasagne', '__init__.py'), 'r') as f:
+        init_py = f.read()
+    version = re.search('__version__ = "(.*)"', init_py).groups()[0]
+except Exception:
+    version = ''
+try:
+    # obtain long description from README and CHANGES
+    # Specify encoding to get a unicode type in Python 2 and a str in Python 3
+    with io.open(os.path.join(here, 'README.rst'), 'r', encoding='utf-8') as f:
+        README = f.read()
+    with io.open(os.path.join(here, 'CHANGES.rst'), 'r', encoding='utf-8') as f:
+        CHANGES = f.read()
+except IOError:
+    README = CHANGES = ''
+
+install_requires = [
+    'numpy',
+    # 'Theano',  # we require a development version, see requirements.txt
+    ]
+
+tests_require = [
+    'mock',
+    'pytest',
+    'pytest-cov',
+    'pytest-pep8',
+    ]
+
+setup(
+    name="Lasagne",
+    version=version,
+    description="A lightweight library to build and train neural networks "
+                "in Theano",
+    long_description="\n\n".join([README, CHANGES]),
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        ],
+    keywords="",
+    author="Lasagne contributors",
+    author_email="lasagne-users at googlegroups.com",
+    url="https://github.com/Lasagne/Lasagne",
+    license="MIT",
+    packages=find_packages(),
+    include_package_data=False,
+    zip_safe=False,
+    install_requires=install_requires,
+    extras_require={
+        'testing': tests_require,
+        },
+    )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lasagne.git



More information about the debian-science-commits mailing list