[caffe-contrib] 02/08: New upstream version 1.0.0

Mon May 1 01:32:57 UTC 2017

This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe-contrib.

commit 2912354e88da77b67b3985e4946380c074b9bd36
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Mon May 1 01:19:55 2017 +0000

    New upstream version 1.0.0
---
 CMakeLists.txt                                 |  16 ++-
 CONTRIBUTORS.md                                |   2 +-
 Makefile                                       |   2 +-
 README.md                                      |   6 +-
 cmake/Dependencies.cmake                       |   2 +-
 cmake/Modules/FindAtlas.cmake                  |   4 +-
 cmake/Templates/caffe_config.h.in              |  15 +--
 cmake/Uninstall.cmake.in                       |  26 ++++
 docker/cpu/Dockerfile                          |   3 +-
 docker/gpu/Dockerfile                          |   5 +-
 docs/_layouts/default.html                     |   2 +-
 docs/development.md                            |   4 +-
 docs/index.md                                  |  47 +++----
 docs/install_apt.md                            |   2 +-
 docs/installation.md                           |   4 +-
 docs/model_zoo.md                              |  24 ++--
 docs/multigpu.md                               |   4 +-
 docs/performance_hardware.md                   |  73 -----------
 docs/tutorial/interfaces.md                    |   4 +-
 docs/tutorial/layers.md                        |   2 +-
 docs/tutorial/layers/sigmoid.md                |  10 ++
 examples/CMakeLists.txt                        |   3 +-
 examples/cifar10/cifar10_quick_solver.prototxt |   1 -
 examples/cifar10/train_full.sh                 |   4 +-
 examples/cifar10/train_quick.sh                |   2 +-
 examples/finetune_flickr_style/readme.md       |   2 +-
 include/caffe/layers/base_data_layer.hpp       |   2 -
 include/caffe/layers/data_layer.hpp            |   2 -
 include/caffe/layers/dummy_data_layer.hpp      |   2 -
 include/caffe/layers/hdf5_data_layer.hpp       |   2 -
 include/caffe/layers/hdf5_output_layer.hpp     |   2 -
 include/caffe/layers/infogain_loss_layer.hpp   |  35 +++++
 include/caffe/layers/input_layer.hpp           |   2 -
 include/caffe/layers/python_layer.hpp          |   4 -
 include/caffe/layers/window_data_layer.hpp     |   3 +-
 include/caffe/test/test_caffe_main.hpp         |   3 +-
 include/caffe/util/cudnn.hpp                   |  10 ++
 include/caffe/util/hdf5.hpp                    |   4 +-
 include/caffe/util/math_functions.hpp          |  12 +-
 include/caffe/util/mkl_alternate.hpp           |  19 +--
 models/bvlc_alexnet/readme.md                  |   2 +-
 models/bvlc_googlenet/readme.md                |   2 +-
 models/bvlc_reference_caffenet/readme.md       |   2 +-
 models/bvlc_reference_rcnn_ilsvrc13/readme.md  |   2 +-
 python/caffe/__init__.py                       |   2 +-
 python/caffe/_caffe.cpp                        |  56 ++++++--
 python/caffe/draw.py                           |  12 +-
 python/caffe/io.py                             |   2 +-
 python/caffe/net_spec.py                       |   4 +
 python/caffe/pycaffe.py                        |   4 +-
 python/caffe/test/test_draw.py                 |  37 ++++++
 python/caffe/test/test_nccl.py                 |  19 +++
 python/caffe/test/test_net.py                  |  45 ++++++-
 python/caffe/test/test_net_spec.py             |   8 ++
 scripts/caffe                                  |  73 +++++++++++
 scripts/cpp_lint.py                            |  39 +++---
 scripts/travis/install-deps.sh                 |  12 +-
 scripts/travis/install-python-deps.sh          |   1 +
 src/caffe/CMakeLists.txt                       |   6 +-
 src/caffe/layers/base_conv_layer.cpp           |   1 -
 src/caffe/layers/batch_norm_layer.cpp          |   6 +-
 src/caffe/layers/batch_norm_layer.cu           |   8 +-
 src/caffe/layers/cudnn_conv_layer.cpp          |   1 +
 src/caffe/layers/cudnn_relu_layer.cpp          |   1 +
 src/caffe/layers/eltwise_layer.cpp             |   4 +-
 src/caffe/layers/hdf5_data_layer.cpp           |   3 +-
 src/caffe/layers/infogain_loss_layer.cpp       | 172 +++++++++++++++++++++----
 src/caffe/layers/lstm_unit_layer.cpp           |   1 -
 src/caffe/layers/pooling_layer.cu              |   4 +-
 src/caffe/layers/sigmoid_layer.cpp             |   2 +-
 src/caffe/layers/sigmoid_layer.cu              |   2 +-
 src/caffe/net.cpp                              |   3 +-
 src/caffe/proto/caffe.proto                    |  23 +++-
 src/caffe/solver.cpp                           |   3 +-
 src/caffe/test/test_caffe_main.cpp             |   2 +-
 src/caffe/test/test_convolution_layer.cpp      |   2 +-
 src/caffe/test/test_gradient_based_solver.cpp  |  14 +-
 src/caffe/test/test_hdf5_output_layer.cpp      |  13 +-
 src/caffe/test/test_hdf5data_layer.cpp         |   5 +-
 src/caffe/test/test_infogain_loss_layer.cpp    |  83 +++++++++++-
 src/caffe/test/test_neuron_layer.cpp           |  11 +-
 src/caffe/util/hdf5.cpp                        |  34 ++++-
 src/caffe/util/math_functions.cpp              |  10 ++
 src/caffe/util/math_functions.cu               |  21 +++
 tools/CMakeLists.txt                           |   3 +-
 tools/compute_image_mean.cpp                   |   8 +-
 tools/extra/parse_log.sh                       |   2 +-
 tools/extra/resize_and_crop_images.py          |   2 +-
 88 files changed, 816 insertions(+), 327 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3056d75..08f56a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,14 +10,15 @@ endif()
 project(Caffe C CXX)
 
 # ---[ Caffe version
-set(CAFFE_TARGET_VERSION "1.0.0-rc5" CACHE STRING "Caffe logical version")
-set(CAFFE_TARGET_SOVERSION "1.0.0-rc5" CACHE STRING "Caffe soname version")
+set(CAFFE_TARGET_VERSION "1.0.0" CACHE STRING "Caffe logical version")
+set(CAFFE_TARGET_SOVERSION "1.0.0" CACHE STRING "Caffe soname version")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
 
 # ---[ Using cmake scripts and modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
 include(ExternalProject)
+include(GNUInstallDirs)
 
 include(cmake/Utils.cmake)
 include(cmake/Targets.cmake)
@@ -103,8 +104,19 @@ if(BUILD_python)
   add_dependencies(pytest pycaffe)
 endif()
 
+# ---[ uninstall target
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Uninstall.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake
+    IMMEDIATE @ONLY)
+
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake)
+
 # ---[ Configuration summary
 caffe_print_configuration_summary()
 
 # ---[ Export configs generation
 caffe_generate_export_configs()
+
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8db66ea..3fd7678 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,6 +1,6 @@
 # Contributors
 
-Caffe is developed by a core set of BVLC members and the open-source community.
+Caffe is developed by a core set of BAIR members and the open-source community.
 
 We thank all of our [contributors](https://github.com/BVLC/caffe/graphs/contributors)!
 
diff --git a/Makefile b/Makefile
index 77900b6..4d32416 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib
 STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
 DYNAMIC_VERSION_MAJOR 		:= 1
 DYNAMIC_VERSION_MINOR 		:= 0
-DYNAMIC_VERSION_REVISION 	:= 0-rc5
+DYNAMIC_VERSION_REVISION 	:= 0
 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
 #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR)
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
diff --git a/README.md b/README.md
index 44b9e62..0ae3616 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,13 @@
 [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
-It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors.
+It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu))/The Berkeley Vision and Learning Center (BVLC) and community contributors.
 
 Check out the [project site](http://caffe.berkeleyvision.org) for all the details like
 
 - [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)
 - [Tutorial Documentation](http://caffe.berkeleyvision.org/tutorial/)
-- [BVLC reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)
+- [BAIR reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)
 - [Installation instructions](http://caffe.berkeleyvision.org/installation.html)
 
 and step-by-step examples.
@@ -25,7 +25,7 @@ Happy brewing!
 ## License and Citation
 
 Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE).
-The BVLC reference models are released for unrestricted use.
+The BAIR/BVLC reference models are released for unrestricted use.
 
 Please cite Caffe in your publications if it helps your research:
 
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 02c8152..4a5bac4 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "")
 set(Caffe_COMPILE_OPTIONS "")
 
 # ---[ Boost
-find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
+find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem)
 list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS})
 list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES})
 
diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake
index 9c665a4..7ffa639 100644
--- a/cmake/Modules/FindAtlas.cmake
+++ b/cmake/Modules/FindAtlas.cmake
@@ -28,7 +28,7 @@ find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH
 
 find_library(Atlas_CBLAS_LIBRARY NAMES  ptcblas_r ptcblas cblas_r cblas       PATHS ${Atlas_LIB_SEARCH_PATHS})
 find_library(Atlas_BLAS_LIBRARY NAMES   atlas_r   atlas                       PATHS ${Atlas_LIB_SEARCH_PATHS})
-find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS})
 
 set(LOOKED_FOR
   Atlas_CBLAS_INCLUDE_DIR
@@ -47,6 +47,6 @@ if(ATLAS_FOUND)
   set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY})
   mark_as_advanced(${LOOKED_FOR})
 
-  message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})")
+  message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}")
 endif(ATLAS_FOUND)
 
diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in
index 45465b9..2080c63 100644
--- a/cmake/Templates/caffe_config.h.in
+++ b/cmake/Templates/caffe_config.h.in
@@ -4,16 +4,9 @@
 /* Binaries directory */
 #define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
 
+/* This is an absolute path so that we can run tests from any build
+ * directory */
+#define ABS_TEST_DATA_DIR "${PROJECT_SOURCE_DIR}/src/caffe/test/test_data/"
+
 /* Test device */
 #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
-
-/* Temporary (TODO: remove) */
-#if 1
-  #define CMAKE_SOURCE_DIR SOURCE_FOLDER "/src/"
-  #define EXAMPLES_SOURCE_DIR BINARY_FOLDER "/examples/"
-  #define CMAKE_EXT ".gen.cmake"
-#else
-  #define CMAKE_SOURCE_DIR "src/"
-  #define EXAMPLES_SOURCE_DIR "examples/"
-  #define CMAKE_EXT ""
-#endif
diff --git a/cmake/Uninstall.cmake.in b/cmake/Uninstall.cmake.in
new file mode 100644
index 0000000..bb8e296
--- /dev/null
+++ b/cmake/Uninstall.cmake.in
@@ -0,0 +1,26 @@
+if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+if (NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@")
+endif ()
+ message(${CMAKE_INSTALL_PREFIX})
+
+file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    exec_program(
+      "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval
+      )
+    if(NOT "${rm_retval}" STREQUAL 0)
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif(NOT "${rm_retval}" STREQUAL 0)
+  else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+endforeach(file)
\ No newline at end of file
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
index af6c03c..67e2e61 100644
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe
 WORKDIR $CAFFE_ROOT
 
 # FIXME: use ARG instead of ENV once DockerHub supports this
-ENV CLONE_TAG=rc4
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
 
 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
     pip install --upgrade pip && \
diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile
index 0785b10..dcdbdf3 100644
--- a/docker/gpu/Dockerfile
+++ b/docker/gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
 LABEL maintainer caffe-maint at googlegroups.com
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe
 WORKDIR $CAFFE_ROOT
 
 # FIXME: use ARG instead of ENV once DockerHub supports this
-ENV CLONE_TAG=rc4
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
 
 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
     pip install --upgrade pip && \
diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html
index b8efe60..3799e95 100644
--- a/docs/_layouts/default.html
+++ b/docs/_layouts/default.html
@@ -36,7 +36,7 @@
       <header>
         <h1 class="header"><a href="/">Caffe</a></h1>
         <p class="header">
-          Deep learning framework by the <a class="header name" href="http://bvlc.eecs.berkeley.edu/">BVLC</a>
+          Deep learning framework by <a class="header name" href="http://bair.berkeley.edu/">BAIR</a>
         </p>
         <p class="header">
           Created by
diff --git a/docs/development.md b/docs/development.md
index 107c2c3..ec05bbe 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -4,7 +4,7 @@ title: Developing and Contributing
 # Development and Contributing
 
 Caffe is developed with active participation of the community.<br>
-The [BVLC](http://bvlc.eecs.berkeley.edu/) brewers welcome all contributions!
+The [BAIR](http://bair.berkeley.edu/)/BVLC brewers welcome all contributions!
 
 The exact details of contributions are recorded by versioning and cited in our [acknowledgements](http://caffe.berkeleyvision.org/#acknowledgements).
 This method is impartial and always up-to-date.
@@ -37,7 +37,7 @@ We absolutely appreciate any contribution to this effort!
 
 The `master` branch receives all new development including community contributions.
 We try to keep it in a reliable state, but it is the bleeding edge, and things do get broken every now and then.
-BVLC maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online.
+BAIR maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online.
 
 #### Issues & Pull Request Protocol
 
diff --git a/docs/index.md b/docs/index.md
index 932b3b5..b633f7c 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -5,7 +5,7 @@ title: Deep Learning Framework
 # Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
-It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and by community contributors.
+It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu)) and by community contributors.
 [Yangqing Jia](http://daggerfs.com) created the project during his PhD at UC Berkeley.
 Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE).
 
@@ -23,21 +23,20 @@ Thanks to these contributors the framework tracks the state-of-the-art in both c
 
 **Speed** makes Caffe perfect for research experiments and industry deployment.
 Caffe can process **over 60M images per day** with a single NVIDIA K40 GPU\*.
-That's 1 ms/image for inference and 4 ms/image for learning.
-We believe that Caffe is the fastest convnet implementation available.
+That's 1 ms/image for inference and 4 ms/image for learning, and more recent library versions and hardware are faster still.
+We believe that Caffe is among the fastest convnet implementations available.
 
 **Community**: Caffe already powers academic research projects, startup prototypes, and even large-scale industrial applications in vision, speech, and multimedia.
 Join our community of brewers on the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) and [Github](https://github.com/BVLC/caffe/).
 
 <p class="footnote" markdown="1">
-\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and caching IO.
-Consult performance [details](/performance_hardware.html).
+\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and prefetching IO.
 </p>
 
 ## Documentation
 
-- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)<br>
-Tutorial presentation.
+- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) and [Caffe in a Day](https://docs.google.com/presentation/d/1HxGdeq8MPktHaPb-rlmYYQ723iWzq9ur6Gjo71YiG0Y/edit#slide=id.gc2fcdcce7_216_0)<br>
+Tutorial presentation of the framework and a full-day crash course.
 - [Tutorial Documentation](/tutorial)<br>
 Practical guide and framework reference.
 - [arXiv / ACM MM '14 paper](http://arxiv.org/abs/1408.5093)<br>
@@ -45,18 +44,13 @@ A 4-page report for the ACM Multimedia Open Source competition (arXiv:1408.5093v
 - [Installation instructions](/installation.html)<br>
 Tested on Ubuntu, Red Hat, OS X.
 * [Model Zoo](/model_zoo.html)<br>
-BVLC suggests a standard distribution format for Caffe models, and provides trained models.
+BAIR suggests a standard distribution format for Caffe models, and provides trained models.
 * [Developing & Contributing](/development.html)<br>
 Guidelines for development and contributing to Caffe.
 * [API Documentation](/doxygen/annotated.html)<br>
 Developer documentation automagically generated from code comments.
-
-### Examples
-
-{% assign examples = site.pages | where:'category','example' | sort: 'priority' %}
-{% for page in examples %}
-- <div><a href="{{page.url}}">{{page.title}}</a><br>{{page.description}}</div>
-{% endfor %}
+* [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)<br>
+Comparison of inference and learning for different networks and GPUs.
 
 ### Notebook Examples
 
@@ -65,6 +59,13 @@ Developer documentation automagically generated from code comments.
 - <div><a href="http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/{{page.original_path}}">{{page.title}}</a><br>{{page.description}}</div>
 {% endfor %}
 
+### Command Line Examples
+
+{% assign examples = site.pages | where:'category','example' | sort: 'priority' %}
+{% for page in examples %}
+- <div><a href="{{page.url}}">{{page.title}}</a><br>{{page.description}}</div>
+{% endfor %}
+
 ## Citing Caffe
 
 Please cite Caffe in your publications if it helps your research:
@@ -76,8 +77,7 @@ Please cite Caffe in your publications if it helps your research:
       Year = {2014}
     }
 
-If you do publish a paper where Caffe helped your research, we encourage you to update the [publications wiki](https://github.com/BVLC/caffe/wiki/Publications).
-Citations are also tracked automatically by [Google Scholar](http://scholar.google.com/scholar?oi=bibs&hl=en&cites=17333247995453974016).
+If you do publish a paper where Caffe helped your research, we encourage you to cite the framework for tracking by [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-ltRSM0AAAAJ:u5HHmVD_uO8C).
 
 ## Contacting Us
 
@@ -85,17 +85,12 @@ Join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users
 
 Framework development discussions and thorough bug reports are collected on [Issues](https://github.com/BVLC/caffe/issues).
 
-Contact [caffe-dev](mailto:caffe-dev at googlegroups.com) if you have a confidential proposal for the framework *and the ability to act on it*.
-Requests for features, explanations, or personal help will be ignored; post to [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) instead.
-
-The core Caffe developers offer [consulting services](mailto:caffe-coldpress at googlegroups.com) for appropriate projects.
-
 ## Acknowledgements
 
-The BVLC Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance.
+The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance.
 
-The BVLC members who have contributed to Caffe are (alphabetical by first name):
-[Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/).
+The BAIR members who have contributed to Caffe are (alphabetical by first name):
+[Carl Doersch](http://www.carldoersch.com/), [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Philipp Krähenbühl](http://www.philkr.net/), [Ronghang Hu](http://ronghanghu.com/), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), [Takuya Narihira](https://github.c [...]
 
 The open-source community plays an important and growing role in Caffe's development.
 Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for recent activity and the [contributors](https://github.com/BVLC/caffe/graphs/contributors) for the full list.
@@ -103,4 +98,4 @@ Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for re
 We sincerely appreciate your interest and contributions!
 If you'd like to contribute, please read the [developing & contributing](development.html) guide.
 
-Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice.
+Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice.
diff --git a/docs/install_apt.md b/docs/install_apt.md
index bc1566b..ee2cd28 100644
--- a/docs/install_apt.md
+++ b/docs/install_apt.md
@@ -14,7 +14,7 @@ The NVIDIA package tends to follow more recent library and driver versions, but
 If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date.
 This can be skipped for CPU-only installation.
 
-**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance.
+**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS by `sudo apt-get install libopenblas-dev` or MKL for better CPU performance.
 
 **Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface.
 
diff --git a/docs/installation.md b/docs/installation.md
index 2e55802..42f1d0c 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -42,14 +42,14 @@ Optional dependencies:
 
 * [OpenCV](http://opencv.org/) >= 2.4 including 3.0
 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
-* cuDNN for GPU acceleration (v5)
+* cuDNN for GPU acceleration (v6)
 
 Pycaffe and Matcaffe interfaces have their own natural needs.
 
 * For Python Caffe:  `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python`
 * For MATLAB Caffe: MATLAB with the `mex` compiler.
 
-**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v5; older versions are supported in older Caffe.
+**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe.
 
 **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.
 
diff --git a/docs/model_zoo.md b/docs/model_zoo.md
index 06dc0a4..3f77e82 100644
--- a/docs/model_zoo.md
+++ b/docs/model_zoo.md
@@ -3,7 +3,7 @@ title: Model Zoo
 ---
 # Caffe Model Zoo
 
-Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data.
+Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data: check out the [model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)!
 These models are learned and applied for problems ranging from simple regression, to large-scale visual classification, to Siamese networks for image similarity, to speech and robotics applications.
 
 To help share these models, we introduce the model zoo framework:
@@ -14,17 +14,17 @@ To help share these models, we introduce the model zoo framework:
 
 ## Where to get trained models
 
-First of all, we bundle BVLC-trained models for unrestricted, out of the box use.
+First of all, we bundle BAIR-trained models for unrestricted, out of the box use.
 <br>
-See the [BVLC model license](#bvlc-model-license) for details.
+See the [BAIR model license](#bair-model-license) for details.
 Each one of these can be downloaded by running `scripts/download_model_binary.py <dirname>` where `<dirname>` is specified below:
 
-- **BVLC Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue)
-- **BVLC AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer)
-- **BVLC Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick)
-- **BVLC GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada)
+- **BAIR Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue)
+- **BAIR AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer)
+- **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick)
+- **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada)
 
-**Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo).
+**Community models** made by Caffe users are posted to a publicly editable [model zoo wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo).
 These models are subject to conditions of their respective authors such as citation and license.
 Thank you for sharing your models!
 
@@ -42,6 +42,8 @@ A caffe model is distributed as a directory containing:
     - License information.
 - [optional] Other helpful scripts.
 
+This simple format can be handled through bundled scripts or manually if need be.
+
 ### Hosting model info
 
 Github Gist is a good format for model info distribution because it can contain multiple files, is versionable, and has in-browser syntax highlighting and markdown rendering.
@@ -55,14 +57,14 @@ Downloading model info is done just as easily with `scripts/download_model_from_
 ### Hosting trained models
 
 It is up to the user where to host the `.caffemodel` file.
-We host our BVLC-provided models on our own server.
+We host our BAIR-provided models on our own server.
 Dropbox also works fine (tip: make sure that `?dl=1` is appended to the end of the URL).
 
 `scripts/download_model_binary.py <dirname>` downloads the `.caffemodel` from the URL specified in the `<dirname>/readme.md` frontmatter and confirms SHA1.
 
-## BVLC model license
+## BAIR model license
 
-The Caffe models bundled by the BVLC are released for unrestricted use.
+The Caffe models bundled by the BAIR are released for unrestricted use.
 
 These models are trained on data from the [ImageNet project](http://www.image-net.org/) and training data includes internet photos that may be subject to copyright.
 
diff --git a/docs/multigpu.md b/docs/multigpu.md
index d91acef..e04ebb0 100644
--- a/docs/multigpu.md
+++ b/docs/multigpu.md
@@ -13,7 +13,7 @@ The GPUs to be used for training can be set with the "-gpu" flag on the command
 # Hardware Configuration Assumptions
 
 The current implementation uses a tree reduction strategy.  e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate
-updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. 
+updated model, 0\-\>2, and then 0\-\>1, 2\-\>3.
 
 For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced.
 
@@ -23,4 +23,4 @@ Current implementation has a "soft" assumption that the devices being used are h
 
 # Scaling Performance
 
-Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers.  Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset).  In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet.  4 GPUs begins to have falloff in scaling.  Generally with "weak scaling" where the batchsize increases with the number of GPUs [...]
\ No newline at end of file
+Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers.  Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset).  In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet.  4 GPUs begins to have falloff in scaling.  Generally with "weak scaling" where the batchsize increases with the number of GPUs [...]
diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md
deleted file mode 100644
index cdd4b36..0000000
--- a/docs/performance_hardware.md
+++ /dev/null
@@ -1,73 +0,0 @@
----
-title: Performance and Hardware Configuration
----
-
-# Performance and Hardware Configuration
-
-To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model.
-
-For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified.
-
-**Acknowledgements**: BVLC members are very grateful to NVIDIA for providing several GPUs to conduct this research.
-
-## NVIDIA K40
-
-Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory.
-
-Best settings with ECC off and maximum clock speed in standard Caffe:
-
-* Training is 26.5 secs / 20 iterations (5,120 images)
-* Testing is 100 secs / validation set (50,000 images)
-
-Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn):
-
-* Training is 19.2 secs / 20 iterations (5,120 images)
-* Testing is 60.7 secs / validation set (50,000 images)
-
-Other settings:
-
-* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set
-* ECC on, default speed: training 31 secs / 20 iterations, test 117 secs / validation set
-* ECC off, default speed: training 31 secs / 20 iterations, test 118 secs / validation set
-
-### K40 configuration tips
-
-For maximum K40 performance, turn off ECC and boost the clock speed (at your own risk).
-
-To turn off ECC, do
-
-    sudo nvidia-smi -i 0 --ecc-config=0    # repeat with -i x for each GPU ID
-
-then reboot.
-
-Set the "persistence" mode of the GPU settings by
-
-    sudo nvidia-smi -pm 1
-
-and then set the clock speed with
-
-    sudo nvidia-smi -i 0 -ac 3004,875    # repeat with -i x for each GPU ID
-
-but note that this configuration resets across driver reloading / rebooting. Include these commands in a boot script to initialize these settings. For a simple fix, add these commands to `/etc/rc.local` (on Ubuntu).
-
-## NVIDIA Titan
-
-Training: 26.26 secs / 20 iterations (5,120 images).
-Testing: 100 secs / validation set (50,000 images).
-
-cuDNN Training: 20.25 secs / 20 iterations (5,120 images).
-cuDNN Testing: 66.3 secs / validation set (50,000 images).
-
-
-## NVIDIA K20
-
-Training: 36.0 secs / 20 iterations (5,120 images).
-Testing: 133 secs / validation set (50,000 images).
-
-## NVIDIA GTX 770
-
-Training: 33.0 secs / 20 iterations (5,120 images).
-Testing: 129 secs / validation set (50,000 images).
-
-cuDNN Training: 24.3 secs / 20 iterations (5,120 images).
-cuDNN Testing: 104 secs / validation set (50,000 images).
diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md
index d7ff378..b5a4f1a 100644
--- a/docs/tutorial/interfaces.md
+++ b/docs/tutorial/interfaces.md
@@ -91,7 +91,7 @@ In MatCaffe, you can
 * Run for a certain number of iterations and give back control to Matlab
 * Intermingle arbitrary Matlab code with gradient steps
 
-An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it).
+An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it).
 
 ### Build MatCaffe
 
@@ -114,7 +114,7 @@ You can save your Matlab search PATH by running `savepath` so that you don't hav
 
 MatCaffe is very similar to PyCaffe in usage.
 
-Examples below shows detailed usages and assumes you have downloaded BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder.
+The examples below show detailed usage and assume you have downloaded BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from the caffe root folder.
 
     model = './models/bvlc_reference_caffenet/deploy.prototxt';
     weights = './models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel';
diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md
index a903d5a..2faacc5 100644
--- a/docs/tutorial/layers.md
+++ b/docs/tutorial/layers.md
@@ -128,7 +128,7 @@ Layers:
 * [Infogain Loss](layers/infogainloss.html) - a generalization of MultinomialLogisticLossLayer.
 * [Softmax with Loss](layers/softmaxwithloss.html) - computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient.
 * [Sum-of-Squares / Euclidean](layers/euclideanloss.html) - computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$.
-* [Hinge / Margin](layers/hiddenloss.html) - The hinge loss layer computes a one-vs-all hinge (L1) or squared hinge loss (L2).
+* [Hinge / Margin](layers/hingeloss.html) - The hinge loss layer computes a one-vs-all hinge (L1) or squared hinge loss (L2).
 * [Sigmoid Cross-Entropy Loss](layers/sigmoidcrossentropyloss.html) - computes the cross-entropy (logistic) loss, often used for predicting targets interpreted as probabilities.
 * [Accuracy / Top-k layer](layers/accuracy.html) - scores the output as an accuracy with respect to target -- it is not actually a loss and has no backward step.
 * [Contrastive Loss](layers/contrastiveloss.html)
diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md
index 5053183..f18ac4b 100644
--- a/docs/tutorial/layers/sigmoid.md
+++ b/docs/tutorial/layers/sigmoid.md
@@ -9,6 +9,16 @@ title: Sigmoid Layer
 * Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp)
 * CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp)
 * CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu)
+* Example (from [`./examples/mnist/mnist_autoencoder.prototxt`](https://github.com/BVLC/caffe/blob/master/examples/mnist/mnist_autoencoder.prototxt)):
+
+      layer {
+        name: "encode1neuron"
+        bottom: "encode1"
+        top: "encode1neuron"
+        type: "Sigmoid"
+      }
+
+The `Sigmoid` layer computes `sigmoid(x)` for each element `x` in the bottom blob.
 
 ## Parameters
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a59e0df..43bbcb8 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,7 +19,8 @@ foreach(source_file ${examples_srcs})
   caffe_set_solution_folder(${name} examples)
 
   # install
-  install(TARGETS ${name} DESTINATION bin)
+  install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 
   if(UNIX OR APPLE)
     # Funny command to make tutorials work
diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt
index 5de276f..14b4401 100644
--- a/examples/cifar10/cifar10_quick_solver.prototxt
+++ b/examples/cifar10/cifar10_quick_solver.prototxt
@@ -20,7 +20,6 @@ display: 100
 max_iter: 4000
 # snapshot intermediate results
 snapshot: 4000
-snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_quick"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh
index 06ecc2d..fe46e60 100755
--- a/examples/cifar10/train_full.sh
+++ b/examples/cifar10/train_full.sh
@@ -9,9 +9,9 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@
+    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate $@
 
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@
+    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate $@
diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh
index d2b8753..257479e 100755
--- a/examples/cifar10/train_quick.sh
+++ b/examples/cifar10/train_quick.sh
@@ -9,4 +9,4 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10 after 8 epochs
 $TOOLS/caffe train \
   --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \
-  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@
+  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate $@
diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md
index 188dedf..dacfd01 100644
--- a/examples/finetune_flickr_style/readme.md
+++ b/examples/finetune_flickr_style/readme.md
@@ -9,7 +9,7 @@ priority: 5
 # Fine-tuning CaffeNet for Style Recognition on "Flickr Style" Data
 
 Fine-tuning takes an already learned model, adapts the architecture, and resumes training from the already learned model weights.
-Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category.
+Let's fine-tune the BAIR-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category.
 
 ## Explanation
 
diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp
index 21d3ada..c8b6998 100644
--- a/include/caffe/layers/base_data_layer.hpp
+++ b/include/caffe/layers/base_data_layer.hpp
@@ -26,8 +26,6 @@ class BaseDataLayer : public Layer<Dtype> {
   // This method may not be overridden except by the BasePrefetchingDataLayer.
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
   // Data layers have no bottoms, so reshaping is trivial.
diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp
index dec5818..667a4ae 100644
--- a/include/caffe/layers/data_layer.hpp
+++ b/include/caffe/layers/data_layer.hpp
@@ -20,8 +20,6 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
   virtual ~DataLayer();
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // DataLayer uses DataReader instead for sharing for parallelism
-  virtual inline bool ShareInParallel() const { return false; }
   virtual inline const char* type() const { return "Data"; }
   virtual inline int ExactNumBottomBlobs() const { return 0; }
   virtual inline int MinTopBlobs() const { return 1; }
diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp
index 4180f1d..13a63d4 100644
--- a/include/caffe/layers/dummy_data_layer.hpp
+++ b/include/caffe/layers/dummy_data_layer.hpp
@@ -22,8 +22,6 @@ class DummyDataLayer : public Layer<Dtype> {
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp
index 650a3fb..601b36c 100644
--- a/include/caffe/layers/hdf5_data_layer.hpp
+++ b/include/caffe/layers/hdf5_data_layer.hpp
@@ -27,8 +27,6 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual ~HDF5DataLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp
index 487d08f..061e279 100644
--- a/include/caffe/layers/hdf5_output_layer.hpp
+++ b/include/caffe/layers/hdf5_output_layer.hpp
@@ -28,8 +28,6 @@ class HDF5OutputLayer : public Layer<Dtype> {
   virtual ~HDF5OutputLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp
index 633f339..edecde8 100644
--- a/include/caffe/layers/infogain_loss_layer.hpp
+++ b/include/caffe/layers/infogain_loss_layer.hpp
@@ -8,6 +8,7 @@
 #include "caffe/proto/caffe.pb.h"
 
 #include "caffe/layers/loss_layer.hpp"
+#include "caffe/layers/softmax_layer.hpp"
 
 namespace caffe {
 
@@ -60,6 +61,12 @@ class InfogainLossLayer : public LossLayer<Dtype> {
   virtual inline int MinBottomBlobs() const { return 2; }
   virtual inline int MaxBottomBlobs() const { return 3; }
 
+  // InfogainLossLayer computes softmax prob internally.
+  // optional second "top" outputs the softmax prob
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
   virtual inline const char* type() const { return "InfogainLoss"; }
 
  protected:
@@ -102,7 +109,35 @@ class InfogainLossLayer : public LossLayer<Dtype> {
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
+  /// fill sum_rows_H_ according to matrix H
+  virtual void sum_rows_of_H(const Blob<Dtype>* H);
+
+  /// The internal SoftmaxLayer used to map predictions to a distribution.
+  shared_ptr<Layer<Dtype> > softmax_layer_;
+  /// prob stores the output probability predictions from the SoftmaxLayer.
+  Blob<Dtype> prob_;
+  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_top_vec_;
+
   Blob<Dtype> infogain_;
+  Blob<Dtype> sum_rows_H_;  // cache the row sums of H.
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
+
+  int infogain_axis_, outer_num_, inner_num_, num_labels_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/input_layer.hpp b/include/caffe/layers/input_layer.hpp
index f447267..0ffdc72 100644
--- a/include/caffe/layers/input_layer.hpp
+++ b/include/caffe/layers/input_layer.hpp
@@ -22,8 +22,6 @@ class InputLayer : public Layer<Dtype> {
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp
index 10c4bfd..1407d92 100644
--- a/include/caffe/layers/python_layer.hpp
+++ b/include/caffe/layers/python_layer.hpp
@@ -34,10 +34,6 @@ class PythonLayer : public Layer<Dtype> {
     self_.attr("reshape")(bottom, top);
   }
 
-  virtual inline bool ShareInParallel() const {
-    return this->layer_param_.python_param().share_in_parallel();
-  }
-
   virtual inline const char* type() const { return "Python"; }
 
  protected:
diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp
index 35f41b8..b9b66b7 100644
--- a/include/caffe/layers/window_data_layer.hpp
+++ b/include/caffe/layers/window_data_layer.hpp
@@ -16,7 +16,8 @@ namespace caffe {
 
 /**
  * @brief Provides data to the Net from windows of images files, specified
- *        by a window data file.
+ *        by a window data file. This layer is *DEPRECATED* and only kept for
+ *        archival purposes for use by the original R-CNN.
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp
index fc15609..294f7e5 100644
--- a/include/caffe/test/test_caffe_main.hpp
+++ b/include/caffe/test/test_caffe_main.hpp
@@ -18,9 +18,8 @@ using std::endl;
   #include "caffe_config.h"
 #else
   #define CUDA_TEST_DEVICE -1
-  #define CMAKE_SOURCE_DIR "src/"
   #define EXAMPLES_SOURCE_DIR "examples/"
-  #define CMAKE_EXT ""
+  #define ABS_TEST_DATA_DIR "src/caffe/test/test_data"
 #endif
 
 int main(int argc, char** argv);
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index a7d8dbb..498cfe3 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -41,6 +41,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       return "CUDNN_STATUS_NOT_SUPPORTED";
     case CUDNN_STATUS_LICENSE_ERROR:
       return "CUDNN_STATUS_LICENSE_ERROR";
+#if CUDNN_VERSION_MIN(6, 0, 0)
+    case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING:
+      return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING";
+#endif
   }
   return "Unknown cudnn status";
 }
@@ -109,8 +113,14 @@ template <typename Dtype>
 inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
     cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
     int pad_h, int pad_w, int stride_h, int stride_w) {
+#if CUDNN_VERSION_MIN(6, 0, 0)
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
+      pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION,
+      dataType<Dtype>::type));
+#else
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
       pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
+#endif
 }
 
 template <typename Dtype>
diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp
index ce568c5..71549c1 100644
--- a/include/caffe/util/hdf5.hpp
+++ b/include/caffe/util/hdf5.hpp
@@ -13,12 +13,12 @@ namespace caffe {
 template <typename Dtype>
 void hdf5_load_nd_dataset_helper(
     hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
+    Blob<Dtype>* blob, bool reshape);
 
 template <typename Dtype>
 void hdf5_load_nd_dataset(
     hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
+    Blob<Dtype>* blob, bool reshape = false);
 
 template <typename Dtype>
 void hdf5_save_nd_dataset(
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 51068fe..e549120 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -53,6 +53,9 @@ template <typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
 
 template <typename Dtype>
+void caffe_sqrt(const int N, const Dtype* a, Dtype* y);
+
+template <typename Dtype>
 void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
 template <typename Dtype>
@@ -128,16 +131,16 @@ inline int8_t caffe_sign(Dtype val) {
   }
 
 // output is 1 for the positives, 0 for zero, and -1 for the negatives
-DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
+DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]))
 
 // This returns a nonzero value if the input has its sign bit set.
 // The name sngbit is meant to avoid conflicts with std::signbit in the macro.
 // The extra parens are needed because CUDA < 6.5 defines signbit as a macro,
 // and we don't want that to expand here when CUDA headers are also included.
 DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \
-    y[i] = static_cast<bool>((std::signbit)(x[i])));
+    y[i] = static_cast<bool>((std::signbit)(x[i])))
 
-DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
+DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]))
 
 template <typename Dtype>
 void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
@@ -214,6 +217,9 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_sqrt(const int n, const Dtype* a, Dtype* y);
+
 // caffe_gpu_rng_uniform with two arguments generates integers in the range
 // [0, UINT_MAX].
 void caffe_gpu_rng_uniform(const int n, unsigned int* r);
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index 95df0f9..8c2294c 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -36,10 +36,11 @@ extern "C" {
     v##name<double>(n, a, y); \
   }
 
-DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
-DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
-DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i]));
-DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i])
+DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i]))
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]))
+DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i]))
+DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]))
 
 // A simple way to define the vsl unary functions with singular parameter b.
 // The operation should be in the form e.g. y[i] = pow(a[i], b)
@@ -58,7 +59,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
     v##name<double>(n, a, b, y); \
   }
 
-DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b))
 
 // A simple way to define the vsl binary functions. The operation should
 // be in the form e.g. y[i] = a[i] + b[i]
@@ -77,10 +78,10 @@ DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
     v##name<double>(n, a, b, y); \
   }
 
-DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
-DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
-DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
-DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i])
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i])
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i])
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i])
 
 // In addition, MKL comes with an additional function axpby that is not present
 // in standard blas. We will simply use a two-step (inefficient, of course) way
diff --git a/models/bvlc_alexnet/readme.md b/models/bvlc_alexnet/readme.md
index 008d690..a83e3d4 100644
--- a/models/bvlc_alexnet/readme.md
+++ b/models/bvlc_alexnet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC AlexNet Model
+name: BAIR/BVLC AlexNet Model
 caffemodel: bvlc_alexnet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_googlenet/readme.md b/models/bvlc_googlenet/readme.md
index 061b6d7..ef04db6 100644
--- a/models/bvlc_googlenet/readme.md
+++ b/models/bvlc_googlenet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC GoogleNet Model
+name: BAIR/BVLC GoogleNet Model
 caffemodel: bvlc_googlenet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_reference_caffenet/readme.md b/models/bvlc_reference_caffenet/readme.md
index 671e47a..5352e53 100644
--- a/models/bvlc_reference_caffenet/readme.md
+++ b/models/bvlc_reference_caffenet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC CaffeNet Model
+name: BAIR/BVLC CaffeNet Model
 caffemodel: bvlc_reference_caffenet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_reference_rcnn_ilsvrc13/readme.md b/models/bvlc_reference_rcnn_ilsvrc13/readme.md
index 9a11a24..12543b2 100644
--- a/models/bvlc_reference_rcnn_ilsvrc13/readme.md
+++ b/models/bvlc_reference_rcnn_ilsvrc13/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC Reference RCNN ILSVRC13 Model
+name: BAIR/BVLC Reference RCNN ILSVRC13 Model
 caffemodel: bvlc_reference_rcnn_ilsvrc13.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_rcnn_ilsvrc13.caffemodel
 license: unrestricted
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index 43a0c49..776945e 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,5 @@
 from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer
-from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, Layer, get_solver
+from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, has_nccl
 from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index 3589e47..d7f43ff 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -51,14 +51,18 @@ const int NPY_DTYPE = NPY_FLOAT32;
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
 void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
 
-void InitLog(int level) {
-  FLAGS_logtostderr = 1;
-  FLAGS_minloglevel = level;
+void InitLog() {
   ::google::InitGoogleLogging("");
   ::google::InstallFailureSignalHandler();
 }
-void InitLogInfo() {
-  InitLog(google::INFO);
+void InitLogLevel(int level) {
+  FLAGS_minloglevel = level;
+  InitLog();
+}
+void InitLogLevelPipe(int level, bool to_stderr) {  // avoid the stderr macro
+  FLAGS_minloglevel = level;
+  FLAGS_logtostderr = to_stderr;
+  InitLog();
+}
 void Log(const string& s) {
   LOG(INFO) << s;
@@ -288,7 +292,7 @@ void Solver_add_callback(Solver<Dtype> * solver, bp::object on_start,
 }
 
 // Seems boost cannot call the base method directly
-void Solver_add_nccl(SGDSolver<Dtype>* solver
+void Solver_add_nccl(Solver<Dtype>* solver
 #ifdef USE_NCCL
   , NCCL<Dtype>* nccl
 #endif
@@ -298,6 +302,10 @@ void Solver_add_nccl(SGDSolver<Dtype>* solver
 #endif
 }
 
+void share_weights(Solver<Dtype>* solver, Net<Dtype>* net) {
+  net->ShareTrainedLayersWith(solver->net().get());
+}
+
 template<typename Dtype>
 class NetCallback: public Net<Dtype>::Callback {
  public:
@@ -339,6 +347,35 @@ class NCCL {
 };
 #endif
 
+bool HasNCCL() {
+#ifdef USE_NCCL
+  return true;
+#else
+  return false;
+#endif
+}
+
+#ifdef USE_NCCL
+bp::object NCCL_New_Uid() {
+  std::string uid = NCCL<Dtype>::new_uid();
+#if PY_MAJOR_VERSION >= 3
+  // Return the uid as a bytes object so that Python does not try to
+  // decode it using the current locale.
+  //
+  // Since boost 1.53 boost.python converts both str and bytes to
+  // std::string, but converts std::string back to str, so a bytes object
+  // is built by hand here; boost.python converts it back to std::string
+  // when it is passed to the NCCL constructor. The length is passed
+  // explicitly because PyBytes_FromString() stops at the first NUL byte.
+  PyObject* py_uid = PyBytes_FromStringAndSize(uid.data(), uid.size());
+  return bp::object(bp::handle<>(py_uid));
+#else
+  // The automatic std::string conversion is correct for Python 2.
+  return bp::object(uid);
+#endif
+}
+#endif
+
 BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
 
 BOOST_PYTHON_MODULE(_caffe) {
@@ -349,8 +386,10 @@ BOOST_PYTHON_MODULE(_caffe) {
 
   // Caffe utility functions
   bp::def("init_log", &InitLog);
-  bp::def("init_log", &InitLogInfo);
+  bp::def("init_log", &InitLogLevel);
+  bp::def("init_log", &InitLogLevelPipe);
   bp::def("log", &Log);
+  bp::def("has_nccl", &HasNCCL);
   bp::def("set_mode_cpu", &set_mode_cpu);
   bp::def("set_mode_gpu", &set_mode_gpu);
   bp::def("set_random_seed", &set_random_seed);
@@ -459,6 +498,7 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def("step", &Solver<Dtype>::Step)
     .def("restore", &Solver<Dtype>::Restore)
     .def("snapshot", &Solver<Dtype>::Snapshot)
+    .def("share_weights", &share_weights)
     .add_property("param", bp::make_function(&Solver<Dtype>::param,
               bp::return_value_policy<bp::copy_const_reference>()));
   BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver<Dtype>);
@@ -508,7 +548,7 @@ BOOST_PYTHON_MODULE(_caffe) {
     boost::noncopyable>("NCCL",
                         bp::init<shared_ptr<Solver<Dtype> >, const string&>())
 #ifdef USE_NCCL
-    .def("new_uid", &NCCL<Dtype>::new_uid).staticmethod("new_uid")
+    .def("new_uid", NCCL_New_Uid).staticmethod("new_uid")
     .def("bcast", &NCCL<Dtype>::Broadcast)
 #endif
     /* NOLINT_NEXT_LINE(whitespace/semicolon) */
diff --git a/python/caffe/draw.py b/python/caffe/draw.py
index e4fd7aa..8411a41 100644
--- a/python/caffe/draw.py
+++ b/python/caffe/draw.py
@@ -91,11 +91,11 @@ def get_layer_label(layer, rankdir):
                       separator,
                       layer.type,
                       separator,
-                      layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1,
+                      layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1,
                       separator,
-                      layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1,
+                      layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1,
                       separator,
-                      layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0)
+                      layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0)
     elif layer.type == 'Pooling':
         pooling_types_dict = get_pooling_types_dict()
         node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\
@@ -104,11 +104,11 @@ def get_layer_label(layer, rankdir):
                       pooling_types_dict[layer.pooling_param.pool],
                       layer.type,
                       separator,
-                      layer.pooling_param.kernel_size[0] if len(layer.pooling_param.kernel_size._values) else 1,
+                      layer.pooling_param.kernel_size,
                       separator,
-                      layer.pooling_param.stride[0] if len(layer.pooling_param.stride._values) else 1,
+                      layer.pooling_param.stride,
                       separator,
-                      layer.pooling_param.pad[0] if len(layer.pooling_param.pad._values) else 0)
+                      layer.pooling_param.pad)
     else:
         node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type)
     return node_label
diff --git a/python/caffe/io.py b/python/caffe/io.py
index e1759be..966c164 100644
--- a/python/caffe/io.py
+++ b/python/caffe/io.py
@@ -75,7 +75,7 @@ def array_to_datum(arr, label=None):
     if arr.dtype == np.uint8:
         datum.data = arr.tostring()
     else:
-        datum.float_data.extend(arr.flat)
+        datum.float_data.extend(arr.astype(float).flat)
     if label is not None:
         datum.label = label
     return datum
diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py
index 5fb1f0b..20918f9 100644
--- a/python/caffe/net_spec.py
+++ b/python/caffe/net_spec.py
@@ -103,6 +103,10 @@ class Function(object):
 
     def __init__(self, type_name, inputs, params):
         self.type_name = type_name
+        for index, input in enumerate(inputs):
+            if not isinstance(input, Top):
+                raise TypeError('%s input %d is not a Top (type is %s)' %
+                                (type_name, index, type(input)))
         self.inputs = inputs
         self.params = params
         self.ntop = self.params.get('ntop', 1)
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index 6360659..4a7b5a2 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -113,7 +113,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs):
 
     if end is not None:
         end_ind = list(self._layer_names).index(end)
-        outputs = set([end] + blobs)
+        outputs = set(self.top_names[end] + blobs)
     else:
         end_ind = len(self.layers) - 1
         outputs = set(self.outputs + blobs)
@@ -161,7 +161,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs):
 
     if end is not None:
         end_ind = list(self._layer_names).index(end)
-        outputs = set([end] + diffs)
+        outputs = set(self.bottom_names[end] + diffs)
     else:
         end_ind = 0
         outputs = set(self.inputs + diffs)
diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py
new file mode 100644
index 0000000..835bb5d
--- /dev/null
+++ b/python/caffe/test/test_draw.py
@@ -0,0 +1,37 @@
+import os
+import unittest
+
+from google.protobuf import text_format
+
+import caffe.draw
+from caffe.proto import caffe_pb2
+
+def getFilenames():
+    """Yields absolute paths of the prototxt files under models/ and examples/.
+
+    Any file whose full path contains 'solver' is skipped.
+    """
+    root_dir = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), '..', '..', '..'))
+    assert os.path.exists(root_dir)
+    for dirname in ('models', 'examples'):
+        dirname = os.path.join(root_dir, dirname)
+        assert os.path.exists(dirname)
+        for cwd, _, filenames in os.walk(dirname):
+            for filename in filenames:
+                filename = os.path.join(cwd, filename)
+                if filename.endswith('.prototxt') and 'solver' not in filename:
+                    yield filename
+
+
+class TestDraw(unittest.TestCase):
+    def test_draw_net(self):
+        for filename in getFilenames():
+            net = caffe_pb2.NetParameter()
+            with open(filename) as infile:
+                text_format.Merge(infile.read(), net)
+            caffe.draw.draw_net(net, 'LR')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/caffe/test/test_nccl.py b/python/caffe/test/test_nccl.py
new file mode 100644
index 0000000..127a933
--- /dev/null
+++ b/python/caffe/test/test_nccl.py
@@ -0,0 +1,19 @@
+import sys
+import unittest
+
+import caffe
+
+
+class TestNCCL(unittest.TestCase):
+
+    def test_newuid(self):
+        """
+        Test that NCCL uids are of the proper type
+        according to python version
+        """
+        if caffe.has_nccl():
+            uid = caffe.NCCL.new_uid()
+            if sys.version_info.major >= 3:
+                self.assertTrue(isinstance(uid, bytes))
+            else:
+                self.assertTrue(isinstance(uid, str))
diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py
index 24391cc..afd2769 100644
--- a/python/caffe/test/test_net.py
+++ b/python/caffe/test/test_net.py
@@ -25,11 +25,11 @@ def simple_net_file(num_output):
         bias_filler { type: 'constant' value: 2 } }
         param { decay_mult: 1 } param { decay_mult: 0 }
         }
-    layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip'
+    layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob'
       inner_product_param { num_output: """ + str(num_output) + """
         weight_filler { type: 'gaussian' std: 2.5 }
         bias_filler { type: 'constant' value: -3 } } }
-    layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label'
+    layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label'
       top: 'loss' }""")
     f.close()
     return f.name
@@ -71,6 +71,43 @@ class TestNet(unittest.TestCase):
         self.net.forward()
         self.net.backward()
 
+    def test_forward_start_end(self):
+        conv_blob=self.net.blobs['conv'];
+        ip_blob=self.net.blobs['ip_blob'];
+        sample_data=np.random.uniform(size=conv_blob.data.shape);
+        sample_data=sample_data.astype(np.float32);
+        conv_blob.data[:]=sample_data;
+        forward_blob=self.net.forward(start='ip',end='ip');
+        self.assertIn('ip_blob',forward_blob);
+
+        manual_forward=[];
+        for i in range(0,conv_blob.data.shape[0]):
+          dot=np.dot(self.net.params['ip'][0].data,
+                     conv_blob.data[i].reshape(-1));
+          manual_forward.append(dot+self.net.params['ip'][1].data);
+        manual_forward=np.array(manual_forward);
+
+        np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3);
+
+    def test_backward_start_end(self):
+        conv_blob=self.net.blobs['conv'];
+        ip_blob=self.net.blobs['ip_blob'];
+        sample_data=np.random.uniform(size=ip_blob.data.shape)
+        sample_data=sample_data.astype(np.float32);
+        ip_blob.diff[:]=sample_data;
+        backward_blob=self.net.backward(start='ip',end='ip');
+        self.assertIn('conv',backward_blob);
+
+        manual_backward=[];
+        for i in range(0,conv_blob.data.shape[0]):
+          dot=np.dot(self.net.params['ip'][0].data.transpose(),
+                     sample_data[i].reshape(-1));
+          manual_backward.append(dot);
+        manual_backward=np.array(manual_backward);
+        manual_backward=manual_backward.reshape(conv_blob.data.shape);
+
+        np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3);
+
     def test_clear_param_diffs(self):
         # Run a forward/backward step to have non-zero diffs
         self.net.forward()
@@ -90,13 +127,13 @@ class TestNet(unittest.TestCase):
         self.assertEqual(self.net.top_names,
                          OrderedDict([('data', ['data', 'label']),
                                       ('conv', ['conv']),
-                                      ('ip', ['ip']),
+                                      ('ip', ['ip_blob']),
                                       ('loss', ['loss'])]))
         self.assertEqual(self.net.bottom_names,
                          OrderedDict([('data', []),
                                       ('conv', ['data']),
                                       ('ip', ['conv']),
-                                      ('loss', ['ip', 'label'])]))
+                                      ('loss', ['ip_blob', 'label'])]))
 
     def test_save_and_read(self):
         f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py
index fee3c0a..ffe71ba 100644
--- a/python/caffe/test/test_net_spec.py
+++ b/python/caffe/test/test_net_spec.py
@@ -79,3 +79,11 @@ class TestNetSpec(unittest.TestCase):
         net_proto = silent_net()
         net = self.load_net(net_proto)
         self.assertEqual(len(net.forward()), 0)
+
+    def test_type_error(self):
+        """Test that a TypeError is raised when a Function input isn't a Top."""
+        data = L.DummyData(ntop=2)  # data is a 2-tuple of Tops
+        r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$"
+        with self.assertRaisesRegexp(TypeError, r):
+            L.Silence(data, ntop=0)  # should raise: data is a tuple, not a Top
+        L.Silence(*data, ntop=0)  # shouldn't raise: each elt of data is a Top
diff --git a/scripts/caffe b/scripts/caffe
new file mode 100644
index 0000000..8a0b22a
--- /dev/null
+++ b/scripts/caffe
@@ -0,0 +1,73 @@
+# bash completion for Caffe's command line utility       -*- shell-script -*-
+# COPYRIGHT (C) 2015,2016 Zhou Mo <cdluminate at gmail.com>
+# License: BSD-2-Clause
+# Originally appeared at https://github.com/BVLC/caffe/issues/3149
+
+# Updated for caffe (1.0.0~rc3+20160715-g42cd785)
+_caffe()
+{
+  local cur prev words cword
+  _init_completion -s || return
+
+  local prototxts='@(prototxt)'
+  local caffemodels='@(caffemodel|binaryproto)'
+  local solverstates='@(solverstate)'
+  local caffefiles='@(prototxt|caffemodel|solverstate)'
+
+  local flags='-gpu -iterations -model -snapshot -solver -weights -sighup_effect -sigint_effect -level -stage -phase'
+  # first word: complete the caffe sub-command
+  if [[ $cword -eq 1 ]]; then
+    COMPREPLY=( $( compgen -W 'train test time device_query' -- "$cur" ) )
+    return 0
+  fi
+  # second word: complete a flag name, but only after a known sub-command
+  if [[ $cword -eq 2 ]]; then
+    case ${words[1]} in
+    train|test|device_query|time)
+      COMPREPLY=( $( compgen -W "$flags" -- "$cur") )
+      return 0
+      ;;
+    *)
+      return 0
+      ;;
+    esac
+  fi
+
+  case $prev in
+  -gpu|-iterations|-version|-level|-stage)
+    return 0
+    ;;
+  -solver|-model)
+    _filedir "$prototxts"
+    return 0
+    ;;
+  -weights)
+    _filedir "$caffemodels"
+    return 0
+    ;;
+  -snapshot)
+    _filedir "$solverstates"
+    return 0
+    ;;
+  -sighup_effect|-sigint_effect)
+    COMPREPLY=( $( compgen -W 'snapshot stop none' -- "$cur") )
+    return 0
+    ;;
+  -phase)
+    COMPREPLY=( $( compgen -W 'TRAIN TEST' -- "$cur") )
+    return 0
+    ;;
+  *)
+    COMPREPLY=( $( compgen -W "$flags" -- "$cur") )
+    return 0
+    ;;
+  esac
+
+  # file completion fallback; unreachable while every case branch returns
+  _filedir "$caffefiles"
+
+  return 0
+}
+complete -F _caffe caffe
+
+# vim
diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py
index 6ec4fb7..b2016d4 100755
--- a/scripts/cpp_lint.py
+++ b/scripts/cpp_lint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -52,6 +52,10 @@ import string
 import sys
 import unicodedata
 
+import six
+
+from six import iteritems, itervalues
+from six.moves import xrange
 
 _USAGE = """
 Syntax: cpp_lint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
@@ -756,7 +760,7 @@ class _CppLintState(object):
 
   def PrintErrorCounts(self):
     """Print a summary of errors by category, and the total."""
-    for category, count in self.errors_by_category.iteritems():
+    for category, count in iteritems(self.errors_by_category):
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
     sys.stderr.write('Total errors found: %d\n' % self.error_count)
@@ -3444,16 +3448,16 @@ def GetLineWidth(line):
     The width of the line in column positions, accounting for Unicode
     combining characters and wide characters.
   """
-  if isinstance(line, unicode):
-    width = 0
-    for uc in unicodedata.normalize('NFC', line):
-      if unicodedata.east_asian_width(uc) in ('W', 'F'):
-        width += 2
-      elif not unicodedata.combining(uc):
-        width += 1
-    return width
-  else:
-    return len(line)
+  if six.PY2:
+    if isinstance(line, unicode):
+      width = 0
+      for uc in unicodedata.normalize('NFC', line):
+        if unicodedata.east_asian_width(uc) in ('W', 'F'):
+          width += 2
+        elif not unicodedata.combining(uc):
+          width += 1
+      return width
+  return len(line)
 
 
 def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
@@ -3774,7 +3778,7 @@ def _GetTextInside(text, start_pattern):
 
   # Give opening punctuations to get the matching close-punctuations.
   matching_punctuation = {'(': ')', '{': '}', '[': ']'}
-  closing_punctuation = set(matching_punctuation.itervalues())
+  closing_punctuation = set(itervalues(matching_punctuation))
 
   # Find the position to start extracting text.
   match = re.search(start_pattern, text, re.M)
@@ -4851,10 +4855,11 @@ def main():
 
   # Change stderr to write with replacement characters so we don't die
   # if we try to print something containing non-ASCII characters.
-  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
-                                         codecs.getreader('utf8'),
-                                         codecs.getwriter('utf8'),
-                                         'replace')
+  if six.PY2:
+    sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                          codecs.getreader('utf8'),
+                                          codecs.getwriter('utf8'),
+                                          'replace')
 
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh
index 1900b16..dac5d2f 100755
--- a/scripts/travis/install-deps.sh
+++ b/scripts/travis/install-deps.sh
@@ -8,10 +8,11 @@ source $BASEDIR/defaults.sh
 apt-get -y update
 apt-get install -y --no-install-recommends \
   build-essential \
-  libboost-filesystem-dev \
-  libboost-python-dev \
-  libboost-system-dev \
-  libboost-thread-dev \
+  graphviz \
+  libboost-filesystem1.55-dev \
+  libboost-python1.55-dev \
+  libboost-system1.55-dev \
+  libboost-thread1.55-dev \
   libgflags-dev \
   libgoogle-glog-dev \
   libhdf5-serial-dev \
@@ -31,6 +32,7 @@ if ! $WITH_PYTHON3 ; then
     python-dev \
     python-numpy \
     python-protobuf \
+    python-pydot \
     python-skimage
 else
   # Python3
@@ -104,7 +106,7 @@ if $WITH_CUDA ; then
   ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda
 
   if $WITH_CUDNN ; then
-    apt-get install -y --no-install-recommends libcudnn5-dev
+    apt-get install -y --no-install-recommends libcudnn6-dev
   fi
 fi
 
diff --git a/scripts/travis/install-python-deps.sh b/scripts/travis/install-python-deps.sh
index eeec302..910d35a 100755
--- a/scripts/travis/install-python-deps.sh
+++ b/scripts/travis/install-python-deps.sh
@@ -11,4 +11,5 @@ if ! $WITH_PYTHON3 ; then
 else
   # Python3
   pip install --pre protobuf==3.0.0b3
+  pip install pydot
 fi
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
index 7b25a98..b9152e9 100644
--- a/src/caffe/CMakeLists.txt
+++ b/src/caffe/CMakeLists.txt
@@ -40,9 +40,9 @@ set_target_properties(caffe PROPERTIES
  add_subdirectory(test)
 
 # ---[ Install
-install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include)
-install(FILES ${proto_hdrs} DESTINATION include/caffe/proto)
-install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib)
+install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto)
+install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
 list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 4a4c68e..35c9014 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -19,7 +19,6 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   const int num_axes = bottom[0]->num_axes();
   num_spatial_axes_ = num_axes - first_spatial_axis;
   CHECK_GE(num_spatial_axes_, 0);
-  vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
   vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));
   // Setup filter kernel dimensions (kernel_shape_).
   kernel_shape_.Reshape(spatial_dim_blob_shape);
diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp
index 0a08ed4..c6a1d5b 100644
--- a/src/caffe/layers/batch_norm_layer.cpp
+++ b/src/caffe/layers/batch_norm_layer.cpp
@@ -124,8 +124,8 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   if (!use_global_stats_) {
     // compute variance using var(X) = E((X-EX)^2)
-    caffe_powx(top[0]->count(), top_data, Dtype(2),
-        temp_.mutable_cpu_data());  // (X-EX)^2
+    caffe_sqr<Dtype>(top[0]->count(), top_data,
+                     temp_.mutable_cpu_data());  // (X-EX)^2
     caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
         1. / (num * spatial_dim), temp_.cpu_data(),
         spatial_sum_multiplier_.cpu_data(), 0.,
@@ -148,7 +148,7 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   // normalize variance
   caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
-  caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
+  caffe_sqrt(variance_.count(), variance_.cpu_data(),
              variance_.mutable_cpu_data());
 
   // replicate variance to input size
diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu
index c21713c..a35e778 100644
--- a/src/caffe/layers/batch_norm_layer.cu
+++ b/src/caffe/layers/batch_norm_layer.cu
@@ -48,14 +48,14 @@ void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   if (!use_global_stats_) {
     // compute variance using var(X) = E((X-EX)^2)
-    caffe_gpu_powx(top[0]->count(), top_data, Dtype(2),
+    caffe_gpu_mul(top[0]->count(), top[0]->gpu_data(), top[0]->gpu_data(),
         temp_.mutable_gpu_data());  // (X-EX)^2
     caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
         1. / (num * spatial_dim), temp_.gpu_data(),
         spatial_sum_multiplier_.gpu_data(), 0.,
         num_by_chans_.mutable_gpu_data());
-    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
-        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, Dtype(1.),
+        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), Dtype(0.),
         variance_.mutable_gpu_data());  // E((X_EX)^2)
 
     // compute and save moving average
@@ -72,7 +72,7 @@ void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   // normalize variance
   caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-  caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+  caffe_gpu_sqrt(variance_.count(), variance_.gpu_data(),
       variance_.mutable_gpu_data());
 
   // replicate variance to input size
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 1987fb0..efc9e04 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -252,6 +252,7 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
   }
 
   cudaFree(workspaceData);
+  delete [] workspace;
   delete [] stream_;
   delete [] handle_;
   delete [] fwd_algo_;
diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp
index 795e0a9..687c905 100644
--- a/src/caffe/layers/cudnn_relu_layer.cpp
+++ b/src/caffe/layers/cudnn_relu_layer.cpp
@@ -36,6 +36,7 @@ CuDNNReLULayer<Dtype>::~CuDNNReLULayer() {
 
   cudnnDestroyTensorDescriptor(this->bottom_desc_);
   cudnnDestroyTensorDescriptor(this->top_desc_);
+  cudnnDestroyActivationDescriptor(this->activ_desc_);
   cudnnDestroy(this->handle_);
 }
 
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 2125616..3d82b0e 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -31,7 +31,9 @@ template <typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   for (int i = 1; i < bottom.size(); ++i) {
-    CHECK(bottom[i]->shape() == bottom[0]->shape());
+    CHECK(bottom[0]->shape() == bottom[i]->shape())
+        << "bottom[0]: " << bottom[0]->shape_string()
+        << ", bottom[" << i << "]: " << bottom[i]->shape_string();
   }
   top[0]->ReshapeLike(*bottom[0]);
   // If max operation, we will initialize the vector index part.
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index b9a071c..00716a9 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -39,8 +39,9 @@ void HDF5DataLayer<Dtype>::LoadHDF5FileData(const char* filename) {
 
   for (int i = 0; i < top_size; ++i) {
     hdf_blobs_[i] = shared_ptr<Blob<Dtype> >(new Blob<Dtype>());
+    // Allow reshape here, as we are loading data not params
     hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(),
-        MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get());
+        MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get(), true);
   }
 
   herr_t status = H5Fclose(file_id);
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index 624d311..3c3f460 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -3,7 +3,8 @@
 #include <vector>
 
 #include "caffe/layers/infogain_loss_layer.hpp"
-#include "caffe/util/io.hpp"
+#include "caffe/util/io.hpp"  // for blob reading of matrix H
+#include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
@@ -11,6 +12,31 @@ template <typename Dtype>
 void InfogainLossLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::LayerSetUp(bottom, top);
+  // internal softmax layer
+  LayerParameter softmax_layer_param(this->layer_param_);
+  SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param();
+  softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis());
+  softmax_layer_param.set_type("Softmax");
+  softmax_layer_param.clear_loss_weight();
+  softmax_layer_param.add_loss_weight(1);
+  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_layer_param);
+  softmax_bottom_vec_.clear();
+  softmax_bottom_vec_.push_back(bottom[0]);
+  softmax_top_vec_.clear();
+  softmax_top_vec_.push_back(&prob_);
+  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
+
+  // ignore label
+  has_ignore_label_ =
+    this->layer_param_.loss_param().has_ignore_label();
+  if (has_ignore_label_) {
+    ignore_label_ = this->layer_param_.loss_param().ignore_label();
+  }
+  // normalization
+  CHECK(!this->layer_param_.loss_param().has_normalize())
+    << "normalize is deprecated. use \"normalization\"";
+  normalization_ = this->layer_param_.loss_param().normalization();
+  // matrix H
   if (bottom.size() < 3) {
     CHECK(this->layer_param_.infogain_loss_param().has_source())
         << "Infogain matrix source must be specified.";
@@ -25,28 +51,86 @@ template <typename Dtype>
 void InfogainLossLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::Reshape(bottom, top);
+  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
+  infogain_axis_ =
+    bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.infogain_loss_param().axis());
+  outer_num_ = bottom[0]->count(0, infogain_axis_);
+  inner_num_ = bottom[0]->count(infogain_axis_ + 1);
+  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+      << "Number of labels must match number of predictions; "
+      << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
+      << "label count (number of labels) must be N*H*W, "
+      << "with integer values in {0, 1, ..., C-1}.";
+  num_labels_ = bottom[0]->shape(infogain_axis_);
   Blob<Dtype>* infogain = NULL;
   if (bottom.size() < 3) {
     infogain = &infogain_;
   } else {
     infogain = bottom[2];
   }
-  CHECK_EQ(bottom[1]->channels(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
-  const int num = bottom[0]->num();
-  const int dim = bottom[0]->count() / num;
-  CHECK_EQ(infogain->num(), 1);
-  CHECK_EQ(infogain->channels(), 1);
-  CHECK_EQ(infogain->height(), dim);
-  CHECK_EQ(infogain->width(), dim);
+  CHECK_EQ(infogain->count(), num_labels_*num_labels_);
+  sum_rows_H_.Reshape(vector<int>(1, num_labels_));
+  if (bottom.size() == 2) {
+    // H is provided as a parameter and will not change. sum rows once
+    sum_rows_of_H(infogain);
+  }
+  if (top.size() >= 2) {
+    // softmax output
+    top[1]->ReshapeLike(*bottom[0]);
+  }
+}
+
+template <typename Dtype>
+Dtype InfogainLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
 }
 
+template <typename Dtype>
+void InfogainLossLayer<Dtype>::sum_rows_of_H(const Blob<Dtype>* H) {
+  CHECK_EQ(H->count(), num_labels_*num_labels_)
+    << "H must be " << num_labels_ << "x" << num_labels_;
+  const Dtype* infogain_mat = H->cpu_data();
+  Dtype* sum = sum_rows_H_.mutable_cpu_data();
+  for ( int row = 0; row < num_labels_ ; row++ ) {
+    sum[row] = 0;
+    for ( int col = 0; col < num_labels_ ; col++ ) {
+      sum[row] += infogain_mat[row*num_labels_+col];
+    }
+  }
+}
 
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
+  // The forward pass computes the softmax prob values.
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.cpu_data();
   const Dtype* bottom_label = bottom[1]->cpu_data();
   const Dtype* infogain_mat = NULL;
   if (bottom.size() < 3) {
@@ -54,17 +138,30 @@ void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   } else {
     infogain_mat = bottom[2]->cpu_data();
   }
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
+  int count = 0;
   Dtype loss = 0;
-  for (int i = 0; i < num; ++i) {
-    int label = static_cast<int>(bottom_label[i]);
-    for (int j = 0; j < dim; ++j) {
-      Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-      loss -= infogain_mat[label * dim + j] * log(prob);
+  for (int i = 0; i < outer_num_; ++i) {
+    for (int j = 0; j < inner_num_; j++) {
+      const int label_value =
+        static_cast<int>(bottom_label[i * inner_num_ + j]);
+      if (has_ignore_label_ && label_value == ignore_label_) {
+        continue;
+      }
+      DCHECK_GE(label_value, 0);
+      DCHECK_LT(label_value, num_labels_);
+      for (int l = 0; l < num_labels_; l++) {
+        loss -= infogain_mat[label_value * num_labels_ + l] *
+          log(std::max(
+                prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j],
+                Dtype(kLOG_THRESHOLD)));
+      }
+      ++count;
     }
   }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }
 
 template <typename Dtype>
@@ -80,25 +177,44 @@ void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                << " Layer cannot backpropagate to infogain inputs.";
   }
   if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* prob_data = prob_.cpu_data();
     const Dtype* bottom_label = bottom[1]->cpu_data();
     const Dtype* infogain_mat = NULL;
     if (bottom.size() < 3) {
       infogain_mat = infogain_.cpu_data();
     } else {
       infogain_mat = bottom[2]->cpu_data();
+      // H is provided as a "bottom" and might change. Sum rows every time.
+      sum_rows_of_H(bottom[2]);
     }
+    const Dtype* sum_rows_H = sum_rows_H_.cpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    int num = bottom[0]->num();
-    int dim = bottom[0]->count() / bottom[0]->num();
-    const Dtype scale = - top[0]->cpu_diff()[0] / num;
-    for (int i = 0; i < num; ++i) {
-      const int label = static_cast<int>(bottom_label[i]);
-      for (int j = 0; j < dim; ++j) {
-        Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-        bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
+    const int dim = bottom[0]->count() / outer_num_;
+    int count = 0;
+    for (int i = 0; i < outer_num_; ++i) {
+      for (int j = 0; j < inner_num_; ++j) {
+        const int label_value =
+          static_cast<int>(bottom_label[i * inner_num_ + j]);
+        DCHECK_GE(label_value, 0);
+        DCHECK_LT(label_value, num_labels_);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] = 0;
+          }
+        } else {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] =
+               prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value]
+               - infogain_mat[label_value * num_labels_ + l];
+          }
+          ++count;
+        }
       }
     }
+    // Scale gradient
+    Dtype loss_weight = top[0]->cpu_diff()[0] /
+                        get_normalizer(normalization_, count);
+    caffe_scal(bottom[0]->count(), loss_weight, bottom_diff);
   }
 }
 
diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp
index 277c031..d1ab59c 100644
--- a/src/caffe/layers/lstm_unit_layer.cpp
+++ b/src/caffe/layers/lstm_unit_layer.cpp
@@ -31,7 +31,6 @@ void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     CHECK_EQ(num_instances, bottom[i]->shape(1));
   }
   hidden_dim_ = bottom[0]->shape(2);
-  CHECK_EQ(num_instances, bottom[1]->shape(1));
   CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2));
   top[0]->ReshapeLike(*bottom[0]);
   top[1]->ReshapeLike(*bottom[0]);
diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu
index 1ea46cc..46eddb9 100644
--- a/src/caffe/layers/pooling_layer.cu
+++ b/src/caffe/layers/pooling_layer.cu
@@ -138,7 +138,7 @@ __global__ void StoPoolForwardTest(const int nthreads,
     const int wstart = pw * stride_w;
     const int wend = min(wstart + kernel_w, width);
     // We set cumsum to be 0 to avoid divide-by-zero problems
-    Dtype cumsum = FLT_MIN;
+    Dtype cumsum = 0.;
     Dtype cumvalues = 0.;
     const Dtype* const bottom_slice =
         bottom_data + (n * channels + c) * height * width;
@@ -149,7 +149,7 @@ __global__ void StoPoolForwardTest(const int nthreads,
         cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
       }
     }
-    top_data[index] = cumvalues / cumsum;
+    top_data[index] = (cumsum > 0.) ? cumvalues / cumsum : 0.;
   }
 }
 
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 85fd967..f8aa769 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -7,7 +7,7 @@ namespace caffe {
 
 template <typename Dtype>
 inline Dtype sigmoid(Dtype x) {
-  return 1. / (1. + exp(-x));
+  return 0.5 * tanh(0.5 * x) + 0.5;
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu
index 184c61e..8a4ea66 100644
--- a/src/caffe/layers/sigmoid_layer.cu
+++ b/src/caffe/layers/sigmoid_layer.cu
@@ -8,7 +8,7 @@ namespace caffe {
 template <typename Dtype>
 __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) {
   CUDA_KERNEL_LOOP(index, n) {
-    out[index] = 1. / (1. + exp(-in[index]));
+    out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
   }
 }
 
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 70d5180..353c2f9 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -769,8 +769,7 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 
 template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
-  if (trained_filename.size() >= 3 &&
-      trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) {
+  if (H5Fis_hdf5(trained_filename.c_str())) {
     CopyTrainedLayersFromHDF5(trained_filename);
   } else {
     CopyTrainedLayersFromBinaryProto(trained_filename);
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index a145c54..c96966b 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -502,11 +502,21 @@ message ConcatParameter {
 }
 
 message BatchNormParameter {
-  // If false, accumulate global mean/variance values via a moving average. If
-  // true, use those accumulated values instead of computing mean/variance
-  // across the batch.
+  // If false, normalization is performed over the current mini-batch
+  // and global statistics are accumulated (but not yet used) by a moving
+  // average.
+  // If true, those accumulated mean and variance values are used for the
+  // normalization.
+  // By default, it is set to false when the network is in the training
+  // phase and true when the network is in the testing phase.
   optional bool use_global_stats = 1;
-  // How much does the moving average decay each iteration?
+  // What fraction of the moving average remains each iteration?
+  // Smaller values make the moving average decay faster, giving more
+  // weight to the recent values.
+  // Each iteration updates the moving average @f$S_{t-1}@f$ with the
+  // current mean @f$ Y_t @f$ by
+  // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$
+  // is the moving_average_fraction parameter.
   optional float moving_average_fraction = 2 [default = .999];
   // Small value to add to the variance estimate so that we don't divide by
   // zero.
@@ -805,6 +815,7 @@ message ImageDataParameter {
 message InfogainLossParameter {
   // Specify the infogain matrix source.
   optional string source = 1;
+  optional int32 axis = 2 [default = 1]; // axis of prob
 }
 
 message InnerProductParameter {
@@ -927,9 +938,7 @@ message PythonParameter {
   // string, dictionary in Python dict format, JSON, etc. You may parse this
   // string in `setup` method and use it in `forward` and `backward`.
   optional string param_str = 3 [default = ''];
-  // Whether this PythonLayer is shared among worker solvers during data parallelism.
-  // If true, each worker solver sequentially run forward from this layer.
-  // This value should be set true if you are using it as a data layer.
+  // DEPRECATED
   optional bool share_in_parallel = 4 [default = false];
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index fd4c037..0442693 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -51,8 +51,8 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   }
   // Scaffolding code
   InitTrainNet();
+  InitTestNets();
   if (Caffe::root_solver()) {
-    InitTestNets();
     LOG(INFO) << "Solver scaffolding done.";
   }
   iter_ = 0;
@@ -102,7 +102,6 @@ void Solver<Dtype>::InitTrainNet() {
 
 template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
-  CHECK(Caffe::root_solver());
   const bool has_net_param = param_.has_net_param();
   const bool has_net_file = param_.has_net();
   const int num_generic_nets = has_net_param + has_net_file;
diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp
index 6473b74..8f333bd 100644
--- a/src/caffe/test/test_caffe_main.cpp
+++ b/src/caffe/test/test_caffe_main.cpp
@@ -15,7 +15,7 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   caffe::GlobalInit(&argc, &argv);
 #ifndef CPU_ONLY
-  // Before starting testing, let's first print out a few cuda defice info.
+  // Before starting testing, let's first print out some CUDA device info.
   int device;
   cudaGetDeviceCount(&device);
   cout << "Cuda number of devices: " << device << endl;
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index 9bb19d1..85c10a2 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -695,7 +695,7 @@ TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) {
   }
   ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
   for (int i = 0; i < backward_result_2d.count(); ++i) {
-    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+    EXPECT_FLOAT_EQ(backward_result_2d.cpu_diff()[i],
               backward_result_nd.cpu_diff()[i]);
   }
   ASSERT_EQ(backward_weight_result_nd.count(),
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 6ad0d8f..f4395f5 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -28,7 +28,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
       seed_(1701), num_(4), channels_(3), height_(10), width_(10),
       share_(false) {
         input_file_ = new string(
-        CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT);
+        ABS_TEST_DATA_DIR "/solver_data_list.txt");
       }
   ~GradientBasedSolverTest() {
     delete input_file_;
@@ -558,9 +558,11 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     const vector<Blob<Dtype>*>& params = solver_->net()->learnable_params();
     for (int i = 0; i < params.size(); ++i) {
       for (int j = 0; j < params[i]->count(); ++j) {
-        EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j])
+        EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j],
+            params[i]->cpu_data()[j])
             << "param " << i << " data differed at dim " << j;
-        EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j])
+        EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j],
+            params[i]->cpu_diff()[j])
             << "param " << i << " diff differed at dim " << j;
       }
     }
@@ -569,9 +571,11 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
     for (int i = 0; i < history.size(); ++i) {
       for (int j = 0; j < history[i]->count(); ++j) {
-        EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j])
+        EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j],
+            history[i]->cpu_data()[j])
             << "history blob " << i << " data differed at dim " << j;
-        EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j])
+        EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j],
+            history[i]->cpu_diff()[j])
             << "history blob " << i << " diff differed at dim " << j;
       }
     }
diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp
index 3833ebf..f94dd57 100644
--- a/src/caffe/test/test_hdf5_output_layer.cpp
+++ b/src/caffe/test/test_hdf5_output_layer.cpp
@@ -20,8 +20,7 @@ class HDF5OutputLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   HDF5OutputLayerTest()
-      : input_file_name_(
-        CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"),
+      : input_file_name_(ABS_TEST_DATA_DIR "/sample_data.h5"),
         blob_data_(new Blob<Dtype>()),
         blob_label_(new Blob<Dtype>()),
         num_(5),
@@ -77,10 +76,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) {
                           H5P_DEFAULT);
   ASSERT_GE(file_id, 0)<< "Failed to open HDF5 file" <<
       this->input_file_name_;
+  // Allow reshape here, as we are loading data, not params.
+  bool reshape = true;
   hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4,
-                       this->blob_data_);
+                       this->blob_data_, reshape);
   hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4,
-                       this->blob_label_);
+                       this->blob_label_, reshape);
   herr_t status = H5Fclose(file_id);
   EXPECT_GE(status, 0)<< "Failed to close HDF5 file " <<
       this->input_file_name_;
@@ -105,12 +106,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) {
 
   Blob<Dtype>* blob_data = new Blob<Dtype>();
   hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4,
-                       blob_data);
+                       blob_data, reshape);
   this->CheckBlobEqual(*(this->blob_data_), *blob_data);
 
   Blob<Dtype>* blob_label = new Blob<Dtype>();
   hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4,
-                       blob_label);
+                       blob_label, reshape);
   this->CheckBlobEqual(*(this->blob_label_), *blob_label);
 
   status = H5Fclose(file_id);
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index 68e1028..3977c48 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -30,8 +30,7 @@ class HDF5DataLayerTest : public MultiDeviceTest<TypeParam> {
     blob_top_vec_.push_back(blob_top_label2_);
 
     // Check out generate_sample_data.py in the same directory.
-    filename = new string(
-    CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT);
+    filename = new string(ABS_TEST_DATA_DIR "/sample_data_list.txt");
     LOG(INFO)<< "Using sample HDF5 data file " << filename;
   }
 
@@ -70,7 +69,7 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) {
   int height = 6;
   int width = 5;
 
-  // Test that the layer setup got the correct parameters.
+  // Test that the layer setup gives correct parameters.
   HDF5DataLayer<Dtype> layer(param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_data_->num(), batch_size);
diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp
index a24ac68..34f2127 100644
--- a/src/caffe/test/test_infogain_loss_layer.cpp
+++ b/src/caffe/test/test_infogain_loss_layer.cpp
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <vector>
 
 #include "gtest/gtest.h"
@@ -18,17 +19,22 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   InfogainLossLayerTest()
-      : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
-        blob_bottom_label_(new Blob<Dtype>(10, 1, 1, 1)),
+      : blob_bottom_data_(new Blob<Dtype>(4, 2, 5, 2)),
+        blob_bottom_label_(new Blob<Dtype>(4, 2, 1, 2)),
         blob_bottom_infogain_(new Blob<Dtype>(1, 1, 5, 5)),
-        blob_top_loss_(new Blob<Dtype>()) {
+        blob_top_loss_(new Blob<Dtype>()),
+        blob_top_prob_(new Blob<Dtype>()),
+        inner_(2), outer_(4*2), num_labels_(5) {
     Caffe::set_random_seed(1701);
     FillerParameter filler_param;
-    PositiveUnitballFiller<Dtype> filler(filler_param);
+    filler_param.set_min(-0.5);
+    filler_param.set_max(2.0);
+    UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_);
     blob_bottom_vec_.push_back(blob_bottom_data_);
     for (int i = 0; i < blob_bottom_label_->count(); ++i) {
-      blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5;
+      blob_bottom_label_->mutable_cpu_data()[i] =
+        caffe_rng_rand() % num_labels_;
     }
     blob_bottom_vec_.push_back(blob_bottom_label_);
     filler_param.set_min(0.1);
@@ -37,29 +43,94 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
     infogain_filler.Fill(this->blob_bottom_infogain_);
     blob_bottom_vec_.push_back(blob_bottom_infogain_);
     blob_top_vec_.push_back(blob_top_loss_);
+    blob_top_vec_.push_back(blob_top_prob_);
   }
   virtual ~InfogainLossLayerTest() {
     delete blob_bottom_data_;
     delete blob_bottom_label_;
     delete blob_bottom_infogain_;
     delete blob_top_loss_;
+    delete blob_top_prob_;
   }
   Blob<Dtype>* const blob_bottom_data_;
   Blob<Dtype>* const blob_bottom_label_;
   Blob<Dtype>* const blob_bottom_infogain_;
   Blob<Dtype>* const blob_top_loss_;
+  Blob<Dtype>* const blob_top_prob_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
+  int inner_, outer_, num_labels_;
 };
 
 TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices);
 
+TYPED_TEST(InfogainLossLayerTest, TestInfogainLoss) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
+  layer_param.clear_loss_weight();
+  layer_param.add_loss_weight(1);
+  layer_param.add_loss_weight(0);
+  /*vector<float>* lw = layer_param.mutable_loss_weight();
+  lw->clear();
+  lw->push_back(1);
+  lw->push_back(1);*/
+  InfogainLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* data = this->blob_bottom_vec_[0]->cpu_data();
+  const Dtype* prob = this->blob_top_vec_[1]->cpu_data();
+  const Dtype* labels = this->blob_bottom_vec_[1]->cpu_data();
+  const Dtype* H = this->blob_bottom_vec_[2]->cpu_data();
+  // First, test the prob top.
+  CHECK_EQ(this->blob_bottom_vec_[0]->num_axes(),
+    this->blob_top_vec_[1]->num_axes())
+      << "prob top shape not match bottom data";
+  for (int ai = 0 ; ai < this->blob_bottom_vec_[0]->num_axes(); ai++) {
+    CHECK_EQ(this->blob_bottom_vec_[0]->shape(ai),
+      this->blob_top_vec_[1]->shape(ai))
+        << "prob top shape not match bottom data";
+  }
+  vector<Dtype> est_prob(this->num_labels_, 0);
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      Dtype den = 0;
+      for ( int  l = 0; l < this->num_labels_; l++ ) {
+        est_prob[l] = std::exp(
+          data[i*this->num_labels_*this->inner_ + l*this->inner_ + j]);
+        den += est_prob[l];
+      }
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        EXPECT_NEAR(prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+          est_prob[l]/den, 1e-6);
+      }
+    }
+  }
+  Dtype loss = 0;  // loss from prob top
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      int gt = static_cast<int>(labels[i*this->inner_+j]);
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        loss -= H[gt*this->num_labels_ + l] *
+          log(std::max(
+            prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+            Dtype(kLOG_THRESHOLD)));
+      }
+    }
+  }
+  EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0],
+    loss/(this->outer_*this->inner_), 1e-6);
+}
 
 TYPED_TEST(InfogainLossLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
   InfogainLossLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701, 1, 0.01);
+  this->blob_top_vec_.clear();  // ignore prob top.
+  this->blob_top_vec_.push_back(this->blob_top_loss_);
+  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701);  // no "kink"
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
 }
diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp
index 342f825..180871a 100644
--- a/src/caffe/test/test_neuron_layer.cpp
+++ b/src/caffe/test/test_neuron_layer.cpp
@@ -791,16 +791,19 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) {
   ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2);
   // Check numbers
   for (int s = 0; s < blob_bottom_2->count(); ++s) {
-    EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s],
+        blob_bottom_2->cpu_diff()[s]);
   }
   for (int s = 0; s < ip.blobs()[0]->count(); ++s) {
-    EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s],
+        ip2.blobs()[0]->cpu_diff()[s]);
   }
   for (int s = 0; s < ip.blobs()[1]->count(); ++s) {
-    EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s],
+        ip2.blobs()[1]->cpu_diff()[s]);
   }
   for (int s = 0; s < prelu.blobs()[0]->count(); ++s) {
-    EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s],
+    EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s],
         prelu2.blobs()[0]->cpu_diff()[s]);
   }
 }
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index d255877..ed73742 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -9,7 +9,7 @@ namespace caffe {
 template <typename Dtype>
 void hdf5_load_nd_dataset_helper(
     hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob) {
+    Blob<Dtype>* blob, bool reshape) {
   // Verify that the dataset exists.
   CHECK(H5LTfind_dataset(file_id, dataset_name_))
       << "Failed to find HDF5 dataset " << dataset_name_;
@@ -56,17 +56,38 @@ void hdf5_load_nd_dataset_helper(
     LOG(FATAL) << "Datatype class unknown";
   }
 
+
   vector<int> blob_dims(dims.size());
   for (int i = 0; i < dims.size(); ++i) {
     blob_dims[i] = dims[i];
   }
-  blob->Reshape(blob_dims);
+
+  if (reshape) {
+    blob->Reshape(blob_dims);
+  } else {
+    if (blob_dims != blob->shape()) {
+      // create shape string for error message
+      ostringstream stream;
+      int count = 1;
+      for (int i = 0; i < blob_dims.size(); ++i) {
+        stream << blob_dims[i] << " ";
+        count = count * blob_dims[i];
+      }
+      stream << "(" << count << ")";
+      string source_shape_string = stream.str();
+
+      CHECK(blob_dims == blob->shape()) << "Cannot load blob from hdf5; shape "
+            << "mismatch. Source shape is " << source_shape_string
+            << " target shape is " << blob->shape_string();
+    }
+  }
 }
 
 template <>
 void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<float>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+        int min_dim, int max_dim, Blob<float>* blob, bool reshape) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob,
+                              reshape);
   herr_t status = H5LTread_dataset_float(
     file_id, dataset_name_, blob->mutable_cpu_data());
   CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
@@ -74,8 +95,9 @@ void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
 
 template <>
 void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<double>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+        int min_dim, int max_dim, Blob<double>* blob, bool reshape) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob,
+                              reshape);
   herr_t status = H5LTread_dataset_double(
     file_id, dataset_name_, blob->mutable_cpu_data());
   CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 71c0227..59625bc 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -197,6 +197,16 @@ void caffe_sqr<double>(const int n, const double* a, double* y) {
 }
 
 template <>
+void caffe_sqrt<float>(const int n, const float* a, float* y) {
+  vsSqrt(n, a, y);
+}
+
+template <>
+void caffe_sqrt<double>(const int n, const double* a, double* y) {
+  vdSqrt(n, a, y);
+}
+
+template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
   vsExp(n, a, y);
 }
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 6d00102..314e6ba 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -387,6 +387,27 @@ void caffe_gpu_powx<double>(const int N, const double* a,
       N, a, alpha, y);
 }
 
+template <typename Dtype>
+__global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) {
+  CUDA_KERNEL_LOOP(index, n) {
+    y[index] = sqrt(a[index]);
+  }
+}
+
+template <>
+void caffe_gpu_sqrt<float>(const int N, const float* a, float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  sqrt_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, a, y);
+}
+
+template <>
+void caffe_gpu_sqrt<double>(const int N, const double* a, double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  sqrt_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, a, y);
+}
+
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
                                       - (x[index] < Dtype(0)));
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 02fbd5c..3789450 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -25,5 +25,6 @@ foreach(source ${srcs})
   endif()
 
   # Install
-  install(TARGETS ${name} DESTINATION bin)
+  install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 endforeach(source)
diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp
index 2035d51..417f5e4 100644
--- a/tools/compute_image_mean.cpp
+++ b/tools/compute_image_mean.cpp
@@ -22,9 +22,11 @@ DEFINE_string(backend, "lmdb",
         "The backend {leveldb, lmdb} containing the images");
 
 int main(int argc, char** argv) {
+#ifdef USE_OPENCV
   ::google::InitGoogleLogging(argv[0]);
+  // Print output to stderr (while still logging)
+  FLAGS_alsologtostderr = 1;
 
-#ifdef USE_OPENCV
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
@@ -65,7 +67,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < size_in_datum; ++i) {
     sum_blob.add_data(0.);
   }
-  LOG(INFO) << "Starting Iteration";
+  LOG(INFO) << "Starting iteration";
   while (cursor->valid()) {
     Datum datum;
     datum.ParseFromString(cursor->value());
@@ -114,7 +116,7 @@ int main(int argc, char** argv) {
     for (int i = 0; i < dim; ++i) {
       mean_values[c] += sum_blob.data(dim * c + i);
     }
-    LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim;
+    LOG(INFO) << "mean_value channel [" << c << "]: " << mean_values[c] / dim;
   }
 #else
   LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV.";
diff --git a/tools/extra/parse_log.sh b/tools/extra/parse_log.sh
index 9892c89..122eb9e 100755
--- a/tools/extra/parse_log.sh
+++ b/tools/extra/parse_log.sh
@@ -39,7 +39,7 @@ rm aux.txt aux0.txt aux1.txt aux2.txt aux3.txt aux4.txt
 grep '] Solving ' $1 > aux.txt
 grep ', loss = ' $1 >> aux.txt
 grep 'Iteration ' aux.txt | sed  's/.*Iteration \([[:digit:]]*\).*/\1/g' > aux0.txt
-grep ', loss = ' $1 | awk '{print $9}' > aux1.txt
+grep ', loss = ' $1 | awk -F = '{print $2}' > aux1.txt
 grep ', lr = ' $1 | awk '{print $9}' > aux2.txt
 
 # Extracting elapsed seconds
diff --git a/tools/extra/resize_and_crop_images.py b/tools/extra/resize_and_crop_images.py
index c844f59..fd2c313 100755
--- a/tools/extra/resize_and_crop_images.py
+++ b/tools/extra/resize_and_crop_images.py
@@ -101,7 +101,7 @@ class ResizeCropImagesMapper(mapreducer.BasicMapper):
         yield value, FLAGS.output_folder
 
 mapreducer.REGISTER_DEFAULT_MAPPER(ResizeCropImagesMapper)
-
+mapreducer.REGISTER_DEFAULT_REDUCER(mapreducer.NoPassReducer)
 mapreducer.REGISTER_DEFAULT_READER(mapreducer.FileReader)
 mapreducer.REGISTER_DEFAULT_WRITER(mapreducer.FileWriter)
  

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/caffe-contrib.git