From e7f543bb264ad8597a3edaf3b938e9c3cc57bf33 Mon Sep 17 00:00:00 2001 From: ih4cku Date: Wed, 17 Jun 2015 12:15:28 +0800 Subject: [PATCH 001/183] register a dummy reducer to prevent mincepie runtime error --- tools/extra/resize_and_crop_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extra/resize_and_crop_images.py b/tools/extra/resize_and_crop_images.py index c844f590..fd2c3134 100755 --- a/tools/extra/resize_and_crop_images.py +++ b/tools/extra/resize_and_crop_images.py @@ -101,7 +101,7 @@ def map(self, key, value): yield value, FLAGS.output_folder mapreducer.REGISTER_DEFAULT_MAPPER(ResizeCropImagesMapper) - +mapreducer.REGISTER_DEFAULT_REDUCER(mapreducer.NoPassReducer) mapreducer.REGISTER_DEFAULT_READER(mapreducer.FileReader) mapreducer.REGISTER_DEFAULT_WRITER(mapreducer.FileWriter) From 10725393518df14b9b6976686f72fae792c3f393 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 5 Oct 2015 15:46:54 -0700 Subject: [PATCH 002/183] NetSpec: type-check Function inputs (they must be Top instances) --- python/caffe/net_spec.py | 4 ++++ python/caffe/test/test_net_spec.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index 93fc0192..b6520627 100644 --- a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -103,6 +103,10 @@ class Function(object): def __init__(self, type_name, inputs, params): self.type_name = type_name + for index, input in enumerate(inputs): + if not isinstance(input, Top): + raise TypeError('%s input %d is not a Top (type is %s)' % + (type_name, index, type(input))) self.inputs = inputs self.params = params self.ntop = self.params.get('ntop', 1) diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py index fee3c0aa..ffe71bac 100644 --- a/python/caffe/test/test_net_spec.py +++ b/python/caffe/test/test_net_spec.py @@ -79,3 +79,11 @@ def test_zero_tops(self): net_proto = silent_net() net = self.load_net(net_proto) self.assertEqual(len(net.forward()), 0) + + def test_type_error(self): + """Test that a TypeError is raised when a Function input isn't a Top.""" + data = L.DummyData(ntop=2) # data is a 2-tuple of Tops + r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$" + with self.assertRaisesRegexp(TypeError, r): + L.Silence(data, ntop=0) # should raise: data is a tuple, not a Top + L.Silence(*data, ntop=0) # shouldn't raise: each elt of data is a Top From 5395cc66d68df74ff5d0920ed80eabcdd439c660 Mon Sep 17 00:00:00 2001 From: ixartz Date: Mon, 2 Nov 2015 23:07:45 -0500 Subject: [PATCH 003/183] OSX 10.10 (and more) use Accelerate Framework instead of veclib --- cmake/Dependencies.cmake | 6 ++++++ include/caffe/util/mkl_alternate.hpp | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 51a803c1..64e6500e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -102,6 +102,12 @@ elseif(APPLE) find_package(vecLib REQUIRED) include_directories(SYSTEM ${vecLib_INCLUDE_DIR}) list(APPEND Caffe_LINKER_LIBS ${vecLib_LINKER_LIBS}) + + if(VECLIB_FOUND) + if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") + add_definitions(-DUSE_ACCELERATE) + endif() + endif() endif() # ---[ Python diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 3355b665..95df0f93 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -7,9 +7,14 @@ #else // If use MKL, 
simply include the MKL header

+#ifdef USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#else
 extern "C" {
 #include <cblas.h>
 }
+#endif  // USE_ACCELERATE
+
 #include <math.h>

 // Functions that caffe uses but are not present if MKL is not linked.

From 37413f9d0b102950dc6b94e52e367c762974f02a Mon Sep 17 00:00:00 2001
From: Tea
Date: Mon, 23 Nov 2015 11:36:46 +0800
Subject: [PATCH 004/183] Scope macros inside switch

---
 src/caffe/util/hdf5.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index 7730e76a..d255877b 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -29,10 +29,10 @@ void hdf5_load_nd_dataset_helper(
   CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
   switch (class_) {
   case H5T_FLOAT:
-    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT";
+    { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT"; }
     break;
   case H5T_INTEGER:
-    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER";
+    { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER"; }
     break;
   case H5T_TIME:
     LOG(FATAL) << "Unsupported datatype class: H5T_TIME";

From 52dcf4801dddf05df3ddef238895cabbc6c4384a Mon Sep 17 00:00:00 2001
From: Azat
Date: Thu, 3 Dec 2015 13:56:48 +0300
Subject: [PATCH 005/183] sigmoid fix (cu)

Previous implementation caused FP overflow for x less than -90

---
 src/caffe/layers/sigmoid_layer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu
index 184c61ed..8a4ea661 100644
--- a/src/caffe/layers/sigmoid_layer.cu
+++ b/src/caffe/layers/sigmoid_layer.cu
@@ -8,7 +8,7 @@ namespace caffe {
 template <typename Dtype>
 __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) {
   CUDA_KERNEL_LOOP(index, n) {
-    out[index] = 1. / (1. + exp(-in[index]));
+    out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
   }
 }

From 0f61cc09467afa35835dc09617f1042e4f77c9fb Mon Sep 17 00:00:00 2001
From: Azat
Date: Thu, 3 Dec 2015 14:00:08 +0300
Subject: [PATCH 006/183] sigmoid fix (cpp)

Previous implementation caused FP overflow for x less than -90

---
 src/caffe/layers/sigmoid_layer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 85fd9676..f8aa769a 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -7,7 +7,7 @@ namespace caffe {
 template <typename Dtype>
 inline Dtype sigmoid(Dtype x) {
-  return 1. / (1. + exp(-x));
+  return 0.5 * tanh(0.5 * x) + 0.5;
 }

 template <typename Dtype>

From 337b07589f4e44761bdb9ef4c242f83ca40c9da5 Mon Sep 17 00:00:00 2001
From: shai
Date: Mon, 21 Mar 2016 09:08:02 +0200
Subject: [PATCH 007/183] upgrading InfogainLoss layer: (1) incorporating
 Softmax layer to make the gradient computation robust, much like
 SoftmaxWithLoss layer (see: http://stackoverflow.com/a/34917052/1714410
 for more information).
(2) supporting loss along axis

---
 include/caffe/layers/infogain_loss_layer.hpp |  35 ++++
 src/caffe/layers/infogain_loss_layer.cpp     | 172 ++++++++++++++++---
 src/caffe/proto/caffe.proto                  |   1 +
 src/caffe/test/test_infogain_loss_layer.cpp  |  83 ++++++++-
 4 files changed, 257 insertions(+), 34 deletions(-)

diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp
index 633f339a..edecde82 100644
--- a/include/caffe/layers/infogain_loss_layer.hpp
+++ b/include/caffe/layers/infogain_loss_layer.hpp
@@ -8,6 +8,7 @@
 #include "caffe/proto/caffe.pb.h"

 #include "caffe/layers/loss_layer.hpp"
+#include "caffe/layers/softmax_layer.hpp"

 namespace caffe {

@@ -60,6 +61,12 @@ class InfogainLossLayer : public LossLayer<Dtype> {
   virtual inline int MinBottomBlobs() const { return 2; }
   virtual inline int MaxBottomBlobs() const { return 3; }

+  // InfogainLossLayer computes softmax prob internally.
+  // optional second "top" outputs the softmax prob
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
   virtual inline const char* type() const { return "InfogainLoss"; }

 protected:
@@ -102,7 +109,35 @@
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size. If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
+  /// fill sum_rows_H_ according to matrix H
+  virtual void sum_rows_of_H(const Blob<Dtype>* H);
+
+  /// The internal SoftmaxLayer used to map predictions to a distribution.
+  shared_ptr<Layer<Dtype> > softmax_layer_;
+  /// prob stores the output probability predictions from the SoftmaxLayer.
+  Blob<Dtype> prob_;
+  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_top_vec_;

   Blob<Dtype> infogain_;
+  Blob<Dtype> sum_rows_H_;  // cache the row sums of H.
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
+
+  int infogain_axis_, outer_num_, inner_num_, num_labels_;
 };

 }  // namespace caffe

diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index 624d3118..3c3f460e 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -3,7 +3,8 @@
 #include <vector>

 #include "caffe/layers/infogain_loss_layer.hpp"
-#include "caffe/util/io.hpp"
+#include "caffe/util/io.hpp"  // for blob reading of matrix H
+#include "caffe/util/math_functions.hpp"

 namespace caffe {

@@ -11,6 +12,31 @@
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::LayerSetUp(bottom, top);
+  // internal softmax layer
+  LayerParameter softmax_layer_param(this->layer_param_);
+  SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param();
+  softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis());
+  softmax_layer_param.set_type("Softmax");
+  softmax_layer_param.clear_loss_weight();
+  softmax_layer_param.add_loss_weight(1);
+  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_layer_param);
+  softmax_bottom_vec_.clear();
+  softmax_bottom_vec_.push_back(bottom[0]);
+  softmax_top_vec_.clear();
+  softmax_top_vec_.push_back(&prob_);
+  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
+
+  // ignore label
+  has_ignore_label_ =
+    this->layer_param_.loss_param().has_ignore_label();
+  if (has_ignore_label_) {
+    ignore_label_ = this->layer_param_.loss_param().ignore_label();
+  }
+  // normalization
+  CHECK(!this->layer_param_.loss_param().has_normalize())
+    << "normalize is deprecated. use \"normalization\"";
+  normalization_ = this->layer_param_.loss_param().normalization();
+  // matrix H
   if (bottom.size() < 3) {
     CHECK(this->layer_param_.infogain_loss_param().has_source())
         << "Infogain matrix source must be specified.";
@@ -25,28 +51,86 @@
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::Reshape(bottom, top);
+  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
+  infogain_axis_ =
+      bottom[0]->CanonicalAxisIndex(
+          this->layer_param_.infogain_loss_param().axis());
+  outer_num_ = bottom[0]->count(0, infogain_axis_);
+  inner_num_ = bottom[0]->count(infogain_axis_ + 1);
+  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+      << "Number of labels must match number of predictions; "
+      << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
+      << "label count (number of labels) must be N*H*W, "
+      << "with integer values in {0, 1, ..., C-1}.";
+  num_labels_ = bottom[0]->shape(infogain_axis_);
   Blob<Dtype>* infogain = NULL;
   if (bottom.size() < 3) {
     infogain = &infogain_;
   } else {
     infogain = bottom[2];
   }
-  CHECK_EQ(bottom[1]->channels(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
-  const int num = bottom[0]->num();
-  const int dim = bottom[0]->count() / num;
-  CHECK_EQ(infogain->num(), 1);
-  CHECK_EQ(infogain->channels(), 1);
-  CHECK_EQ(infogain->height(), dim);
-  CHECK_EQ(infogain->width(), dim);
+  CHECK_EQ(infogain->count(), num_labels_*num_labels_);
+  sum_rows_H_.Reshape(vector<int>(1, num_labels_));
+  if (bottom.size() == 2) {
+    // H is provided as a parameter and will not change. sum rows once
+    sum_rows_of_H(infogain);
+  }
+  if (top.size() >= 2) {
+    // softmax output
+    top[1]->ReshapeLike(*bottom[0]);
+  }
+}
+
+template <typename Dtype>
+Dtype InfogainLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
+}
+
+template <typename Dtype>
+void InfogainLossLayer<Dtype>::sum_rows_of_H(const Blob<Dtype>* H) {
+  CHECK_EQ(H->count(), num_labels_*num_labels_)
+      << "H must be " << num_labels_ << "x" << num_labels_;
+  const Dtype* infogain_mat = H->cpu_data();
+  Dtype* sum = sum_rows_H_.mutable_cpu_data();
+  for ( int row = 0; row < num_labels_ ; row++ ) {
+    sum[row] = 0;
+    for ( int col = 0; col < num_labels_ ; col++ ) {
+      sum[row] += infogain_mat[row*num_labels_+col];
+    }
+  }
+}

 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
+  // The forward pass computes the softmax prob values.
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.cpu_data();
   const Dtype* bottom_label = bottom[1]->cpu_data();
   const Dtype* infogain_mat = NULL;
   if (bottom.size() < 3) {
     infogain_mat = infogain_.cpu_data();
   } else {
     infogain_mat = bottom[2]->cpu_data();
   }
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
+  int count = 0;
   Dtype loss = 0;
-  for (int i = 0; i < num; ++i) {
-    int label = static_cast<int>(bottom_label[i]);
-    for (int j = 0; j < dim; ++j) {
-      Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-      loss -= infogain_mat[label * dim + j] * log(prob);
+  for (int i = 0; i < outer_num_; ++i) {
+    for (int j = 0; j < inner_num_; j++) {
+      const int label_value =
+        static_cast<int>(bottom_label[i * inner_num_ + j]);
+      if (has_ignore_label_ && label_value == ignore_label_) {
+        continue;
+      }
+      DCHECK_GE(label_value, 0);
+      DCHECK_LT(label_value, num_labels_);
+      for (int l = 0; l < num_labels_; l++) {
+        loss -= infogain_mat[label_value * num_labels_ + l] *
+          log(std::max(
+                prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j],
+                Dtype(kLOG_THRESHOLD)));
+      }
+      ++count;
     }
   }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }

 template <typename Dtype>
@@ -80,25 +177,44 @@ void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         << " Layer cannot backpropagate to infogain inputs.";
   }
   if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* prob_data = prob_.cpu_data();
     const Dtype* bottom_label = bottom[1]->cpu_data();
     const Dtype* infogain_mat = NULL;
     if (bottom.size() < 3) {
       infogain_mat = infogain_.cpu_data();
    } else {
       infogain_mat = bottom[2]->cpu_data();
+      // H is provided as a "bottom" and might change. sum rows every time.
+      sum_rows_of_H(bottom[2]);
     }
+    const Dtype* sum_rows_H = sum_rows_H_.cpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    int num = bottom[0]->num();
-    int dim = bottom[0]->count() / bottom[0]->num();
-    const Dtype scale = - top[0]->cpu_diff()[0] / num;
-    for (int i = 0; i < num; ++i) {
-      const int label = static_cast<int>(bottom_label[i]);
-      for (int j = 0; j < dim; ++j) {
-        Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-        bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
+    const int dim = bottom[0]->count() / outer_num_;
+    int count = 0;
+    for (int i = 0; i < outer_num_; ++i) {
+      for (int j = 0; j < inner_num_; ++j) {
+        const int label_value =
+            static_cast<int>(bottom_label[i * inner_num_ + j]);
+        DCHECK_GE(label_value, 0);
+        DCHECK_LT(label_value, num_labels_);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] = 0;
+          }
+        } else {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] =
+               prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value]
+               - infogain_mat[label_value * num_labels_ + l];
+          }
+          ++count;
+        }
       }
     }
+    // Scale gradient
+    Dtype loss_weight = top[0]->cpu_diff()[0] /
+                        get_normalizer(normalization_, count);
+    caffe_scal(bottom[0]->count(), loss_weight, bottom_diff);
   }
 }

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 6900bb71..591e9647 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -794,6 +794,7 @@ message ImageDataParameter {
 message InfogainLossParameter {
   // Specify the infogain matrix source.
  optional string source = 1;
+  optional int32 axis = 2 [default = 1];  // axis of prob
 }

 message InnerProductParameter {

diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp
index a24ac683..34f21271 100644
--- a/src/caffe/test/test_infogain_loss_layer.cpp
+++ b/src/caffe/test/test_infogain_loss_layer.cpp
@@ -1,3 +1,4 @@
+#include <cmath>
 #include <vector>

 #include "gtest/gtest.h"
@@ -18,17 +19,22 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
 protected:
   InfogainLossLayerTest()
-      : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
-        blob_bottom_label_(new Blob<Dtype>(10, 1, 1, 1)),
+      : blob_bottom_data_(new Blob<Dtype>(4, 2, 5, 2)),
+        blob_bottom_label_(new Blob<Dtype>(4, 2, 1, 2)),
         blob_bottom_infogain_(new Blob<Dtype>(1, 1, 5, 5)),
-        blob_top_loss_(new Blob<Dtype>()) {
+        blob_top_loss_(new Blob<Dtype>()),
+        blob_top_prob_(new Blob<Dtype>()),
+        inner_(2), outer_(4*2), num_labels_(5) {
     Caffe::set_random_seed(1701);
     FillerParameter filler_param;
-    PositiveUnitballFiller<Dtype> filler(filler_param);
+    filler_param.set_min(-0.5);
+    filler_param.set_max(2.0);
+    UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_);
     blob_bottom_vec_.push_back(blob_bottom_data_);
     for (int i = 0; i < blob_bottom_label_->count(); ++i) {
-      blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5;
+      blob_bottom_label_->mutable_cpu_data()[i] =
+        caffe_rng_rand() % num_labels_;
     }
     blob_bottom_vec_.push_back(blob_bottom_label_);
     filler_param.set_min(0.1);
@@ -37,29 +43,94 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
     infogain_filler.Fill(this->blob_bottom_infogain_);
     blob_bottom_vec_.push_back(blob_bottom_infogain_);
     blob_top_vec_.push_back(blob_top_loss_);
+    blob_top_vec_.push_back(blob_top_prob_);
   }
   virtual ~InfogainLossLayerTest() {
     delete blob_bottom_data_;
     delete blob_bottom_label_;
     delete blob_bottom_infogain_;
     delete blob_top_loss_;
+    delete blob_top_prob_;
   }
   Blob<Dtype>* const blob_bottom_data_;
   Blob<Dtype>* const blob_bottom_label_;
   Blob<Dtype>* const blob_bottom_infogain_;
   Blob<Dtype>* const blob_top_loss_;
+  Blob<Dtype>* const blob_top_prob_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
+  int inner_, outer_, num_labels_;
 };

 TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices);

+TYPED_TEST(InfogainLossLayerTest, TestInfogainLoss) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
+  layer_param.clear_loss_weight();
+  layer_param.add_loss_weight(1);
+  layer_param.add_loss_weight(0);
+  /*vector* lw = layer_param.mutable_loss_weight();
+  lw->clear();
+  lw->push_back(1);
+  lw->push_back(1);*/
+  InfogainLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* data = this->blob_bottom_vec_[0]->cpu_data();
+  const Dtype* prob = this->blob_top_vec_[1]->cpu_data();
+  const Dtype* labels = this->blob_bottom_vec_[1]->cpu_data();
+  const Dtype* H = this->blob_bottom_vec_[2]->cpu_data();
+  // first, test the prob top
+  CHECK_EQ(this->blob_bottom_vec_[0]->num_axes(),
+           this->blob_top_vec_[1]->num_axes())
+      << "prob top shape not match bottom data";
+  for (int ai = 0 ; ai < this->blob_bottom_vec_[0]->num_axes(); ai++) {
+    CHECK_EQ(this->blob_bottom_vec_[0]->shape(ai),
+             this->blob_top_vec_[1]->shape(ai))
+        << "prob top shape not match bottom data";
+  }
+  vector<Dtype> est_prob(this->num_labels_, 0);
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      Dtype den = 0;
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        est_prob[l] = std::exp(
+            data[i*this->num_labels_*this->inner_ + l*this->inner_ + j]);
+        den += est_prob[l];
+      }
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        EXPECT_NEAR(prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+                    est_prob[l]/den, 1e-6);
+      }
+    }
+  }
+  Dtype loss = 0;  // loss from prob top
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      int gt = static_cast<int>(labels[i*this->inner_+j]);
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        loss -= H[gt*this->num_labels_ + l] *
+          log(std::max(
+              prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+              Dtype(kLOG_THRESHOLD)));
+      }
+    }
+  }
+  EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0],
+              loss/(this->outer_*this->inner_), 1e-6);
+}

 TYPED_TEST(InfogainLossLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
   InfogainLossLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701, 1, 0.01);
+  this->blob_top_vec_.clear();  // ignore prob top.
+  this->blob_top_vec_.push_back(this->blob_top_loss_);
+  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701);  // no "kink"
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
 }

From 8c041a7cf3e571b175cfd8859f1af5f067f8cd7a Mon Sep 17 00:00:00 2001
From: rscohn2
Date: Sat, 26 Mar 2016 10:00:26 -0400
Subject: [PATCH 008/183] Update info about MKL licensing

The instructions say that MKL is free for students, but as of 8/2015,
MKL is free for everyone with community licensing.

---
 docs/installation.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/installation.md b/docs/installation.md
index 89316458..e273034f 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -52,7 +52,7 @@ Caffe requires BLAS as the backend of its matrix and vector computations.
 There are several implementations of this library. The choice is yours:

 * [ATLAS](http://math-atlas.sourceforge.net/): free, open source, and so the default for Caffe.
-* [Intel MKL](http://software.intel.com/en-us/intel-mkl): commercial and optimized for Intel CPUs, with a free trial and [student](http://software.intel.com/en-us/intel-education-offerings) licenses.
+* [Intel MKL](http://software.intel.com/en-us/intel-mkl): commercial and optimized for Intel CPUs, with [free](https://registrationcenter.intel.com/en/forms/?productid=2558) licenses.
     1. Install MKL.
     2. Set up MKL environment (Details: [Linux](https://software.intel.com/en-us/node/528499), [OS X](https://software.intel.com/en-us/node/528659)). Example: *source /opt/intel/mkl/bin/mklvars.sh intel64*
     3.
Set `BLAS := mkl` in `Makefile.config` From a66bea30d6c0706f106b355c7cafc9e7ffae7bb5 Mon Sep 17 00:00:00 2001 From: An Tran Date: Wed, 30 Mar 2016 17:32:10 +0800 Subject: [PATCH 009/183] small bug in pooling_layer.cu --- src/caffe/layers/pooling_layer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 1ea46cc8..81ead1e8 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -138,7 +138,7 @@ __global__ void StoPoolForwardTest(const int nthreads, const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; + Dtype cumsum = 0.; Dtype cumvalues = 0.; const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; From d17fbea6aad122c3818d5ef3593487869948b4b7 Mon Sep 17 00:00:00 2001 From: An Tran Date: Thu, 31 Mar 2016 10:27:31 +0800 Subject: [PATCH 010/183] avoid divide by zeros, suggested by SeanBell --- src/caffe/layers/pooling_layer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 81ead1e8..46eddb94 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -149,7 +149,7 @@ __global__ void StoPoolForwardTest(const int nthreads, cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; } } - top_data[index] = cumvalues / cumsum; + top_data[index] = (cumsum > 0.) ? cumvalues / cumsum : 0.; } } From d4e7c93a6873f75a53d7618e82343e4b5b8a239e Mon Sep 17 00:00:00 2001 From: Aaron Schumacher Date: Thu, 19 May 2016 14:04:22 -0500 Subject: [PATCH 011/183] convert non-uint8 dtypes to float; refs #2391 As recommended by @longjon, this will allow `caffe.io.array_to_datum` to handle, for example, numpy.float32 arrays. It might be worth noting that `datum.float_data` is stored as protobuf type 2, which is float32, as opposed to protobuf type 1, which is float64. It is a little unintuitive that caffe currently requires data to be passed in as float64 but then writes float32 to LMDB. To demonstrate this: ```python datum = caffe.io.array_to_datum(np.array([[[0.9]]])) caffe.io.datum_to_array(datum) # array([[[ 0.9]]]) datum_str = datum.SerializeToString() new_datum = caffe.proto.caffe_pb2.Datum() new_datum.ParseFromString(datum_str) caffe.io.datum_to_array(new_datum) # array([[[ 0.89999998]]]) ``` This behavior is somewhat hidden because `datum_to_array` returns type float64, even though the data doesn't actually have that resolution if it has been stored as protobuf text anywhere (for example in LMDB). Alternative solutions: * Require and return float32, consistent with the protobuf representation. * Change the protobuf to allow float32 or float64 and update surrounding code to support this. 
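To make the new behavior concrete, a minimal sketch (assuming a pycaffe build that includes this patch; the shapes and label below are arbitrary):

```python
import numpy as np
import caffe

# A float32 array, which the message above notes array_to_datum could not
# previously handle.
arr = np.random.rand(3, 32, 32).astype(np.float32)
datum = caffe.io.array_to_datum(arr, label=7)

# datum_to_array returns float64 (see above), but the values round-trip
# through the float32 float_data field, so compare with a tolerance.
decoded = caffe.io.datum_to_array(datum)
np.testing.assert_allclose(decoded, arr, rtol=1e-6)
```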
--- python/caffe/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index e1759beb..966c164c 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -75,7 +75,7 @@ def array_to_datum(arr, label=None): if arr.dtype == np.uint8: datum.data = arr.tostring() else: - datum.float_data.extend(arr.flat) + datum.float_data.extend(arr.astype(float).flat) if label is not None: datum.label = label return datum From 5d7a71ae108f86c05bc03eb542155b30bd28ca74 Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:19:16 +0000 Subject: [PATCH 012/183] using GNUInstallDirs in root cmake file --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9..c765889e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) include(ExternalProject) +include(GNUInstallDirs) include(cmake/Utils.cmake) include(cmake/Targets.cmake) From 90b98ce76fe8613d345932f47a6250dc772f7b8f Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:21:27 +0000 Subject: [PATCH 013/183] fix install path with GNUInstallDir support --- src/caffe/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 8a80c940..5a1b73f7 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -29,9 +29,9 @@ set_target_properties(caffe PROPERTIES add_subdirectory(test) # ---[ Install -install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) -install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) +install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto) +install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) From 581650b18d7580df726d1d6d54d83c397d1379bb Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:22:42 +0000 Subject: [PATCH 014/183] fix install path with GNUInstallDir support --- tools/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 02fbd5ca..37894505 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -25,5 +25,6 @@ foreach(source ${srcs}) endif() # Install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + endforeach(source) From f710ef5e89d3ec22891b24099c66b7a6e9f06c45 Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:24:13 +0000 Subject: [PATCH 015/183] fix install path with GNUInstallDir support --- examples/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 663d7360..2a230033 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,7 +19,8 @@ foreach(source_file ${examples_srcs}) caffe_set_solution_folder(${name} examples) # install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(UNIX OR APPLE) # Funny command to make tutorials work From 918d9994e4b2e9d82bd7929b0ef1d90393f68b31 Mon Sep 17 00:00:00 
2001 From: Josh Klontz Date: Tue, 31 May 2016 18:08:04 -0600 Subject: [PATCH 016/183] Fix vecLib search order for clients with both the old vecLib framework and the new Accelerate framework --- cmake/Modules/FindvecLib.cmake | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index 9600da43..46043367 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -14,9 +14,10 @@ set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Header find_path(vecLib_INCLUDE_DIR vecLib.h DOC "vecLib include directory" - PATHS /System/Library/${__veclib_include_suffix} - /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} - /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/) + PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} + /System/Library/${__veclib_include_suffix} + /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ + NO_DEFAULT_PATH) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(vecLib DEFAULT_MSG vecLib_INCLUDE_DIR) From 9f1855273fa27d106b3675d32ec01acb658a80f0 Mon Sep 17 00:00:00 2001 From: Raffi Enficiaud Date: Tue, 21 Jun 2016 13:41:06 +0200 Subject: [PATCH 017/183] Fix glog upstream autoconf --- cmake/External/glog.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/External/glog.cmake b/cmake/External/glog.cmake index a44672f2..f9d0549c 100644 --- a/cmake/External/glog.cmake +++ b/cmake/External/glog.cmake @@ -37,6 +37,7 @@ if (NOT __GLOG_INCLUDED) GIT_TAG "v0.3.4" UPDATE_COMMAND "" INSTALL_DIR ${gflags_INSTALL} + PATCH_COMMAND autoreconf -i ${glog_PREFIX}/src/glog CONFIGURE_COMMAND env "CFLAGS=${GLOG_C_FLAGS}" "CXXFLAGS=${GLOG_CXX_FLAGS}" ${glog_PREFIX}/src/glog/configure --prefix=${glog_INSTALL} --enable-shared=no --enable-static=yes --with-gflags=${GFLAGS_LIBRARY_DIRS}/.. LOG_DOWNLOAD 1 LOG_CONFIGURE 1 From b29d271b8cd679588618d502add8a4eae2beb853 Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Tue, 21 Jun 2016 16:22:20 -0700 Subject: [PATCH 018/183] add layer_dict to the python interface --- python/caffe/pycaffe.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index ca6d050e..4f84605b 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -43,6 +43,16 @@ def _Net_blob_loss_weights(self): self._blob_loss_weights)) return self._blob_loss_weights_dict +@property +def _Net_layer_dict(self): + """ + An OrderedDict (bottom to top, i.e., input to output) of network + layers indexed by name + """ + if not hasattr(self, '_layer_dict'): + self._layer_dict = OrderedDict(zip(self._layer_names, self.layers)) + return self._layer_dict + @property def _Net_params(self): @@ -311,6 +321,7 @@ def __getitem__(self, name): # Attach methods to Net. 
Net.blobs = _Net_blobs Net.blob_loss_weights = _Net_blob_loss_weights +Net.layer_dict = _Net_layer_dict Net.params = _Net_params Net.forward = _Net_forward Net.backward = _Net_backward From 5417f106c14c782865e2a5484020b8e45a8b2b80 Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Tue, 21 Jun 2016 16:39:30 -0700 Subject: [PATCH 019/183] add tests for pycaffe's layer_dict --- python/caffe/test/test_net.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 4cacfcd0..546bd5fa 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -59,6 +59,13 @@ def test_memory(self): for bl in blobs: total += bl.data.sum() + bl.diff.sum() + def test_layer_dict(self): + layer_dict = self.net.layer_dict + self.assertEqual(list(layer_dict.keys()), list(self.net._layer_names)) + for i, name in enumerate(self.net._layer_names): + self.assertEqual(layer_dict[name].type, + self.net.layers[i].type) + def test_forward_backward(self): self.net.forward() self.net.backward() From bdb94577d97da5cf5b6ec046952dbe79e9c886bf Mon Sep 17 00:00:00 2001 From: Alican Bozkurt Date: Tue, 28 Jun 2016 16:28:33 -0400 Subject: [PATCH 020/183] add default value for rms_decay --- src/caffe/proto/caffe.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 1556781c..6940a705 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -219,7 +219,7 @@ message SolverParameter { // RMSProp decay value // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t) - optional float rms_decay = 38; + optional float rms_decay = 38 [default = 0.99]; // If true, print information about the state of the net that may help with // debugging learning problems. From 80f60dae071fca4457d7a439960385a4579f489d Mon Sep 17 00:00:00 2001 From: Alican Bozkurt Date: Tue, 28 Jun 2016 16:59:36 -0400 Subject: [PATCH 021/183] corrected rmsprop documentation --- docs/tutorial/solver.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/docs/tutorial/solver.md b/docs/tutorial/solver.md index b719f715..81c62638 100644 --- a/docs/tutorial/solver.md +++ b/docs/tutorial/solver.md @@ -209,18 +209,11 @@ What distinguishes the method from SGD is the weight setting $$ W $$ on which we The **RMSprop** (`type: "RMSProp"`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). The update formulas are $$ -(v_t)_i = -\begin{cases} -(v_{t-1})_i + \delta, &(\nabla L(W_t))_i(\nabla L(W_{t-1}))_i > 0\\ -(v_{t-1})_i \cdot (1-\delta), & \text{else} -\end{cases} +\operatorname{MS}((W_t)_i)= \delta\operatorname{MS}((W_{t-1})_i)+ (1-\delta)(\nabla L(W_t))_i^2 \\ +(W_{t+1})_i= (W_{t})_i -\alpha\frac{(\nabla L(W_t))_i}{\sqrt{\operatorname{MS}((W_t)_i)}} $$ -$$ -(W_{t+1})_i =(W_t)_i - \alpha (v_t)_i, -$$ - -If the gradient updates results in oscillations the gradient is reduced by times $$1-\delta$$. Otherwise it will be increased by $$\delta$$. The default value of $$\delta$$ (`rms_decay`) is set to $$\delta = 0.02$$. +The default value of $$\delta$$ (`rms_decay`) is set to $$\delta=0.99$$. [1] T. Tieleman, and G. Hinton. [RMSProp: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). 
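For readers who want to sanity-check the corrected formulas, a minimal NumPy sketch of the update they describe (illustrative only, not Caffe's solver code; Caffe additionally adds a small `delta` to the denominator to guard against division by zero, which the formulas above omit):

```python
import numpy as np

def rmsprop_step(W, grad, ms, lr=0.01, rms_decay=0.99, eps=1e-8):
    # MS(W_t) = rms_decay * MS(W_{t-1}) + (1 - rms_decay) * grad^2
    ms = rms_decay * ms + (1.0 - rms_decay) * grad ** 2
    # W_{t+1} = W_t - lr * grad / sqrt(MS(W_t)); eps guards the division
    return W - lr * grad / (np.sqrt(ms) + eps), ms

W, ms = np.zeros(3), np.zeros(3)
for _ in range(200):
    grad = 2.0 * (W - 0.5)            # gradient of sum((W - 0.5)^2)
    W, ms = rmsprop_step(W, grad, ms)
print(W)                              # converges toward [0.5, 0.5, 0.5]
```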
From 35a9a075cdc65c86021dde4d11e3b1c05e27971b Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Wed, 22 Jun 2016 15:13:54 -0700 Subject: [PATCH 022/183] add set_random_seed to the python interface --- python/caffe/__init__.py | 2 +- python/caffe/_caffe.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index e2881b89..35868a40 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver -from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list +from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 334088e8..3db55ea4 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -51,6 +51,8 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } +void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); } + // For convenience, check that input files can be opened, and raise an // exception that boost will send to Python if not (caffe could still crash // later if the input files are disturbed before they are actually used, but @@ -260,6 +262,7 @@ BOOST_PYTHON_MODULE(_caffe) { // Caffe utility functions bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); + bp::def("set_random_seed", &set_random_seed); bp::def("set_device", &Caffe::SetDevice); bp::def("layer_type_list", &LayerRegistry::LayerTypeList); From 12c74460d3e7c416b869e6b4afa0e5c2e84ec29b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20St=C3=A6r=20Nissen?= Date: Tue, 12 Jul 2016 13:17:52 +0200 Subject: [PATCH 023/183] Support for spaces in directories when downloading cifar10 --- data/cifar10/get_cifar10.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cifar10/get_cifar10.sh b/data/cifar10/get_cifar10.sh index 623c8485..423f1098 100755 --- a/data/cifar10/get_cifar10.sh +++ b/data/cifar10/get_cifar10.sh @@ -2,7 +2,7 @@ # This scripts downloads the CIFAR10 (binary version) data and unzips it. DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." From e14b7f7ea597afe532bf1c4d4013f2c63494d7a6 Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Tue, 21 Jun 2016 14:58:43 -0700 Subject: [PATCH 024/183] improve top_names and bottom_names in pycaffe --- python/caffe/pycaffe.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index ca6d050e..5bae18d9 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -292,21 +292,31 @@ def _Net_batch(self, blobs): padding]) yield padded_batch - -class _Net_IdNameWrapper: - """ - A simple wrapper that allows the ids propery to be accessed as a dict - indexed by names. Used for top and bottom names +def _Net_get_id_name(func, field): """ - def __init__(self, net, func): - self.net, self.func = net, func + Generic property that maps func to the layer names into an OrderedDict. + + Used for top_names and bottom_names. 
- def __getitem__(self, name): - # Map the layer name to id - ids = self.func(self.net, list(self.net._layer_names).index(name)) - # Map the blob id to name - id_to_name = list(self.net.blobs) - return [id_to_name[i] for i in ids] + Parameters + ---------- + func: function id -> [id] + field: implementation field name (cache) + + Returns + ------ + A one-parameter function that can be set as a property. + """ + @property + def get_id_name(self): + if not hasattr(self, field): + id_to_name = list(self.blobs) + res = OrderedDict([(self._layer_names[i], + [id_to_name[j] for j in func(self, i)]) + for i in range(len(self.layers))]) + setattr(self, field, res) + return getattr(self, field) + return get_id_name # Attach methods to Net. Net.blobs = _Net_blobs @@ -320,5 +330,5 @@ def __getitem__(self, name): Net._batch = _Net_batch Net.inputs = _Net_inputs Net.outputs = _Net_outputs -Net.top_names = property(lambda n: _Net_IdNameWrapper(n, Net._top_ids)) -Net.bottom_names = property(lambda n: _Net_IdNameWrapper(n, Net._bottom_ids)) +Net.top_names = _Net_get_id_name(Net._top_ids, "_top_names") +Net.bottom_names = _Net_get_id_name(Net._bottom_ids, "_bottom_names") From 7c50a2cb87c6b044f85ced87273d302fb21394f7 Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Tue, 21 Jun 2016 17:17:05 -0700 Subject: [PATCH 025/183] add test for top/bottom names --- python/caffe/test/test_net.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 4cacfcd0..96821e40 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -3,6 +3,7 @@ import os import numpy as np import six +from collections import OrderedDict import caffe @@ -67,6 +68,18 @@ def test_inputs_outputs(self): self.assertEqual(self.net.inputs, []) self.assertEqual(self.net.outputs, ['loss']) + def test_top_bottom_names(self): + self.assertEqual(self.net.top_names, + OrderedDict([('data', ['data', 'label']), + ('conv', ['conv']), + ('ip', ['ip']), + ('loss', ['loss'])])) + self.assertEqual(self.net.bottom_names, + OrderedDict([('data', []), + ('conv', ['data']), + ('ip', ['conv']), + ('loss', ['ip', 'label'])])) + def test_save_and_read(self): f = tempfile.NamedTemporaryFile(mode='w+', delete=False) f.close() From d9ad2ef90a1cbaa2b22b229539a14341efe59ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20St=C3=A6r=20Nissen?= Date: Wed, 13 Jul 2016 11:17:54 +0200 Subject: [PATCH 026/183] Support spaces in path when downloading ILSVRC12 and MNIST --- data/ilsvrc12/get_ilsvrc_aux.sh | 2 +- data/mnist/get_mnist.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh index 90935f25..dc0d0a72 100755 --- a/data/ilsvrc12/get_ilsvrc_aux.sh +++ b/data/ilsvrc12/get_ilsvrc_aux.sh @@ -8,7 +8,7 @@ # - the training splits with labels DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." diff --git a/data/mnist/get_mnist.sh b/data/mnist/get_mnist.sh index 6d875219..ecadffa4 100755 --- a/data/mnist/get_mnist.sh +++ b/data/mnist/get_mnist.sh @@ -2,7 +2,7 @@ # This scripts downloads the mnist data and unzips it. DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR +cd "$DIR" echo "Downloading..." 
From 93d321227f0681165b126d9ca47b211f5d2c1909 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Wed, 13 Jul 2016 15:58:29 -0700 Subject: [PATCH 027/183] Add "set -e" and $@ to example scripts --- examples/cifar10/create_cifar10.sh | 1 + examples/cifar10/train_full.sh | 7 ++++--- examples/cifar10/train_full_sigmoid.sh | 3 ++- examples/cifar10/train_full_sigmoid_bn.sh | 3 ++- examples/cifar10/train_quick.sh | 5 +++-- examples/imagenet/create_imagenet.sh | 1 + examples/imagenet/resume_training.sh | 4 +++- examples/imagenet/train_caffenet.sh | 3 ++- examples/mnist/create_mnist.sh | 1 + examples/mnist/train_lenet.sh | 3 ++- examples/mnist/train_lenet_adam.sh | 3 ++- examples/mnist/train_lenet_consolidated.sh | 3 ++- examples/mnist/train_lenet_rmsprop.sh | 4 +++- examples/mnist/train_mnist_autoencoder.sh | 3 ++- examples/mnist/train_mnist_autoencoder_adadelta.sh | 3 ++- examples/mnist/train_mnist_autoencoder_adagrad.sh | 3 ++- examples/mnist/train_mnist_autoencoder_nesterov.sh | 3 ++- examples/siamese/create_mnist_siamese.sh | 1 + examples/siamese/train_mnist_siamese.sh | 3 ++- 19 files changed, 39 insertions(+), 18 deletions(-) diff --git a/examples/cifar10/create_cifar10.sh b/examples/cifar10/create_cifar10.sh index a42725cb..7ee1d6ad 100755 --- a/examples/cifar10/create_cifar10.sh +++ b/examples/cifar10/create_cifar10.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh # This script converts the cifar data into leveldb format. +set -e EXAMPLE=examples/cifar10 DATA=data/cifar10 diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh index ef112e1f..06ecc2dc 100755 --- a/examples/cifar10/train_full.sh +++ b/examples/cifar10/train_full.sh @@ -1,16 +1,17 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_solver.prototxt + --solver=examples/cifar10/cifar10_full_solver.prototxt $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@ diff --git a/examples/cifar10/train_full_sigmoid.sh b/examples/cifar10/train_full_sigmoid.sh index 9cff06d3..9b5d5213 100755 --- a/examples/cifar10/train_full_sigmoid.sh +++ b/examples/cifar10/train_full_sigmoid.sh @@ -1,7 +1,8 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt + --solver=examples/cifar10/cifar10_full_sigmoid_solver.prototxt $@ diff --git a/examples/cifar10/train_full_sigmoid_bn.sh b/examples/cifar10/train_full_sigmoid_bn.sh index 011387c9..05547f3a 100755 --- a/examples/cifar10/train_full_sigmoid_bn.sh +++ b/examples/cifar10/train_full_sigmoid_bn.sh @@ -1,7 +1,8 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - --solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt + --solver=examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt $@ diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh index 6b7d2288..d2b87534 100755 --- a/examples/cifar10/train_quick.sh +++ b/examples/cifar10/train_quick.sh @@ -1,11 +1,12 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools $TOOLS/caffe train \ - 
--solver=examples/cifar10/cifar10_quick_solver.prototxt + --solver=examples/cifar10/cifar10_quick_solver.prototxt $@ # reduce learning rate by factor of 10 after 8 epochs $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 + --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@ diff --git a/examples/imagenet/create_imagenet.sh b/examples/imagenet/create_imagenet.sh index e912ac43..1bf08b1a 100755 --- a/examples/imagenet/create_imagenet.sh +++ b/examples/imagenet/create_imagenet.sh @@ -1,6 +1,7 @@ #!/usr/bin/env sh # Create the imagenet lmdb inputs # N.B. set the path to the imagenet train + val data dirs +set -e EXAMPLE=examples/imagenet DATA=data/ilsvrc12 diff --git a/examples/imagenet/resume_training.sh b/examples/imagenet/resume_training.sh index bf7945c0..4aef2043 100755 --- a/examples/imagenet/resume_training.sh +++ b/examples/imagenet/resume_training.sh @@ -1,5 +1,7 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ --solver=models/bvlc_reference_caffenet/solver.prototxt \ - --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5 + --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5 \ + $@ diff --git a/examples/imagenet/train_caffenet.sh b/examples/imagenet/train_caffenet.sh index 94558ec5..a5094d44 100755 --- a/examples/imagenet/train_caffenet.sh +++ b/examples/imagenet/train_caffenet.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=models/bvlc_reference_caffenet/solver.prototxt + --solver=models/bvlc_reference_caffenet/solver.prototxt $@ diff --git a/examples/mnist/create_mnist.sh b/examples/mnist/create_mnist.sh index 06ecc27d..f5e2e796 100755 --- a/examples/mnist/create_mnist.sh +++ b/examples/mnist/create_mnist.sh @@ -1,6 +1,7 @@ #!/usr/bin/env sh # This script converts the mnist data into lmdb/leveldb format, # depending on the value assigned to $BACKEND. 
+set -e EXAMPLE=examples/mnist DATA=data/mnist diff --git a/examples/mnist/train_lenet.sh b/examples/mnist/train_lenet.sh index 1b6bf7d9..f7f9b861 100755 --- a/examples/mnist/train_lenet.sh +++ b/examples/mnist/train_lenet.sh @@ -1,3 +1,4 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt +./build/tools/caffe train --solver=examples/mnist/lenet_solver.prototxt $@ diff --git a/examples/mnist/train_lenet_adam.sh b/examples/mnist/train_lenet_adam.sh index a32ecf2d..7b4e9056 100755 --- a/examples/mnist/train_lenet_adam.sh +++ b/examples/mnist/train_lenet_adam.sh @@ -1,3 +1,4 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt +./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt $@ diff --git a/examples/mnist/train_lenet_consolidated.sh b/examples/mnist/train_lenet_consolidated.sh index c8554678..c5f02666 100755 --- a/examples/mnist/train_lenet_consolidated.sh +++ b/examples/mnist/train_lenet_consolidated.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=examples/mnist/lenet_consolidated_solver.prototxt + --solver=examples/mnist/lenet_consolidated_solver.prototxt $@ diff --git a/examples/mnist/train_lenet_rmsprop.sh b/examples/mnist/train_lenet_rmsprop.sh index 621cab23..adfa7ab0 100755 --- a/examples/mnist/train_lenet_rmsprop.sh +++ b/examples/mnist/train_lenet_rmsprop.sh @@ -1,3 +1,5 @@ #!/usr/bin/env sh +set -e -./build/tools/caffe train --solver=examples/mnist/lenet_solver_rmsprop.prototxt +./build/tools/caffe train \ + --solver=examples/mnist/lenet_solver_rmsprop.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder.sh b/examples/mnist/train_mnist_autoencoder.sh index cfd67e82..724a0f14 100755 --- a/examples/mnist/train_mnist_autoencoder.sh +++ b/examples/mnist/train_mnist_autoencoder.sh @@ -1,4 +1,5 @@ #!/usr/bin/env sh +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver.prototxt + --solver=examples/mnist/mnist_autoencoder_solver.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_adadelta.sh b/examples/mnist/train_mnist_autoencoder_adadelta.sh index 4be0ebdd..a660dbb9 100755 --- a/examples/mnist/train_mnist_autoencoder_adadelta.sh +++ b/examples/mnist/train_mnist_autoencoder_adadelta.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_adagrad.sh b/examples/mnist/train_mnist_autoencoder_adagrad.sh index 95fe1b17..4c11dfa6 100755 --- a/examples/mnist/train_mnist_autoencoder_adagrad.sh +++ b/examples/mnist/train_mnist_autoencoder_adagrad.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_adagrad.prototxt $@ diff --git a/examples/mnist/train_mnist_autoencoder_nesterov.sh b/examples/mnist/train_mnist_autoencoder_nesterov.sh index cf19ea74..fd0559d2 100755 --- a/examples/mnist/train_mnist_autoencoder_nesterov.sh +++ b/examples/mnist/train_mnist_autoencoder_nesterov.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e ./build/tools/caffe train \ - --solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt + --solver=examples/mnist/mnist_autoencoder_solver_nesterov.prototxt $@ diff --git a/examples/siamese/create_mnist_siamese.sh 
b/examples/siamese/create_mnist_siamese.sh index 43ad6b18..03adce54 100755 --- a/examples/siamese/create_mnist_siamese.sh +++ b/examples/siamese/create_mnist_siamese.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh # This script converts the mnist data into leveldb format. +set -e EXAMPLES=./build/examples/siamese DATA=./data/mnist diff --git a/examples/siamese/train_mnist_siamese.sh b/examples/siamese/train_mnist_siamese.sh index 84a30a8a..e01ac2ce 100755 --- a/examples/siamese/train_mnist_siamese.sh +++ b/examples/siamese/train_mnist_siamese.sh @@ -1,5 +1,6 @@ #!/usr/bin/env sh +set -e TOOLS=./build/tools -$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt +$TOOLS/caffe train --solver=examples/siamese/mnist_siamese_solver.prototxt $@ From a110ac7c2ad9e0966a02ba360327907cd2646dd4 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 15 Jul 2016 14:12:01 -0700 Subject: [PATCH 028/183] Stop setting cache timeout in TravisCI It refers to the caching command timeout, not how long before the caches expire as I had thought. --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4849a7ac..32979547 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,6 @@ env: - BUILD_NAME="cudnn-cmake" WITH_CMAKE=true WITH_CUDA=true WITH_CUDNN=true cache: - timeout: 604800 # 1 week apt: true directories: - ~/protobuf3 From 9376bde1beba649e4c522b742064223ac9d2cab4 Mon Sep 17 00:00:00 2001 From: jasjuang Date: Thu, 21 Jul 2016 12:04:41 -0700 Subject: [PATCH 029/183] add in sudo make uninstall for cmake --- CMakeLists.txt | 11 +++++++++++ cmake/Uninstall.cmake.in | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 cmake/Uninstall.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9..7b8dab2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,8 +85,19 @@ if(BUILD_python) add_dependencies(pytest pycaffe) endif() +# ---[ uninstall target +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Uninstall.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake + IMMEDIATE @ONLY) + +add_custom_target(uninstall + COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake) + # ---[ Configuration summary caffe_print_configuration_summary() # ---[ Export configs generation caffe_generate_export_configs() + diff --git a/cmake/Uninstall.cmake.in b/cmake/Uninstall.cmake.in new file mode 100644 index 00000000..bb8e2964 --- /dev/null +++ b/cmake/Uninstall.cmake.in @@ -0,0 +1,26 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") +endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + +if (NOT DEFINED CMAKE_INSTALL_PREFIX) + set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@") +endif () + message(${CMAKE_INSTALL_PREFIX}) + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + exec_program( + "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval + ) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") + endif(NOT "${rm_retval}" STREQUAL 0) + else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + message(STATUS "File 
$ENV{DESTDIR}${file} does not exist.")
+  endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+endforeach(file)
\ No newline at end of file

From 0ad1284bf6af4ee59f782b72cdf4af0fd194af29 Mon Sep 17 00:00:00 2001
From: Ivan Shapovalov
Date: Mon, 25 Jul 2016 09:01:24 +0300
Subject: [PATCH 030/183] CMake: link with ${HDF5_HL_LIBRARIES}

Fixes issue #3224.

---
 cmake/Dependencies.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index c7b6a17a..d7eb59e3 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -26,7 +26,7 @@ include(cmake/ProtoBuf.cmake)
 # ---[ HDF5
 find_package(HDF5 COMPONENTS HL REQUIRED)
 include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
-list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
+list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})

 # ---[ LMDB
 if(USE_LMDB)

From c62e06bccafa57f5b21f90b49e81a988d50a4620 Mon Sep 17 00:00:00 2001
From: Hans Gaiser
Date: Tue, 26 Jul 2016 11:44:44 +0200
Subject: [PATCH 031/183] Fix search for Atlas on arch.

---
 cmake/Modules/FindAtlas.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake
index 6e156435..9c665a47 100644
--- a/cmake/Modules/FindAtlas.cmake
+++ b/cmake/Modules/FindAtlas.cmake
@@ -26,9 +26,9 @@ set(Atlas_LIB_SEARCH_PATHS
 find_path(Atlas_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS})
 find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS})

-find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS})
-find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
-find_library(Atlas_LAPACK_LIBRARY NAMES alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS})

 set(LOOKED_FOR
   Atlas_CBLAS_INCLUDE_DIR

From 61e01654d2054531133a6d154a69b872a4479099 Mon Sep 17 00:00:00 2001
From: Fisher Yu
Date: Sat, 6 Aug 2016 23:01:45 -0400
Subject: [PATCH 032/183] num in blob is deprecated

---
 src/caffe/layers/loss_layer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp
index c0b7a862..afb1ce94 100644
--- a/src/caffe/layers/loss_layer.cpp
+++ b/src/caffe/layers/loss_layer.cpp
@@ -16,8 +16,8 @@ void LossLayer<Dtype>::LayerSetUp(
 template <typename Dtype>
 void LossLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(bottom[0]->num(), bottom[1]->num())
-      << "The data and label should have the same number.";
+  CHECK_EQ(bottom[0]->shape(0), bottom[1]->shape(0))
+      << "The data and label should have the same first dimension.";
   vector<int> loss_shape(0);  // Loss layers output a scalar; 0 axes.
   top[0]->Reshape(loss_shape);
 }

From 5417f106c14c782865e2a5484020b8e45a8b2b80 Mon Sep 17 00:00:00 2001
From: Zhou Mo
Date: Tue, 9 Aug 2016 15:13:47 +0000
Subject: [PATCH 033/183] Fix more float comparison precision issues

With reference to this commit: f1a8470aa21e35a5b2bb83007f8fb7680a354815
This fix changes some EXPECT_EQ into EXPECT_FLOAT_EQ.
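A quick illustration of why exact equality on floats is brittle (gtest's `EXPECT_FLOAT_EQ` allows a difference of up to 4 ULPs; NumPy's `isclose` is only a rough analogue of that check):

```python
import numpy as np

x = np.float32(0.9)
print(x == 0.9)                       # False: 0.9 has no exact binary
                                      # representation, and the float32 and
                                      # float64 roundings of it differ
print(np.isclose(x, 0.9, rtol=1e-6))  # True: tolerance-based comparison
```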
--- src/caffe/test/test_convolution_layer.cpp | 2 +- src/caffe/test/test_gradient_based_solver.cpp | 8 ++++---- src/caffe/test/test_neuron_layer.cpp | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 9bb19d13..85c10a29 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -695,7 +695,7 @@ TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { } ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], + EXPECT_FLOAT_EQ(backward_result_2d.cpu_diff()[i], backward_result_nd.cpu_diff()[i]); } ASSERT_EQ(backward_weight_result_nd.count(), diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 975a8f0f..9395f4e9 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -538,9 +538,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) << "param " << i << " data differed at dim " << j; - EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) << "param " << i << " diff differed at dim " << j; } } @@ -549,9 +549,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector > >& history = solver_->history(); for (int i = 0; i < history.size(); ++i) { for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) << "history blob " << i << " data differed at dim " << j; - EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) << "history blob " << i << " diff differed at dim " << j; } } diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 342f825c..57bd47b3 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -791,16 +791,16 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); // Check numbers for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); } for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { - EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s], + EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s], prelu2.blobs()[0]->cpu_diff()[s]); } } From 42d20fe21eeb8067b09ef5e935bb4c235dbf9f3f Mon 
Sep 17 00:00:00 2001 From: Zhou Mo Date: Wed, 10 Aug 2016 14:36:33 +0000 Subject: [PATCH 034/183] Import bash completion script for caffe from Debian Package. Imported from Debian Package caffe (1.0.0~rc3+20160715-g42cd785-2). --- scripts/caffe | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 scripts/caffe diff --git a/scripts/caffe b/scripts/caffe new file mode 100644 index 00000000..8a0b22af --- /dev/null +++ b/scripts/caffe @@ -0,0 +1,73 @@ +# bash completion for Caffe's command line utility -*- shell-script -*- +# COPYRIGHT (C) 2015,2016 Zhou Mo +# License: BSD-2-Clause +# Originally appeard at https://github.com/BVLC/caffe/issues/3149 + +# Updated for caffe (1.0.0~rc3+20160715-g42cd785) +_caffe() +{ + local cur prev words cword + _init_completion -s || return + + local prototxts='@(prototxt)' + local caffemodels='@(caffemodel,binaryproto)' + local solverstates='@(solverstate)' + local caffefiles='@(prototxt|caffemodel|solverstate)' + + local flags='-gpu -iterations -model -snapshot -solver -weights -sighup_effect -sigint_effect -level -stage -phase' + + if [[ $cword -eq 1 ]]; then + COMPREPLY=( $( compgen -W 'train test time device_query' -- "$cur" ) ) + return 0 + fi + + if [[ $cword -eq 2 ]]; then + case ${words[1]} in + train|test|device_query|time) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + *) + return 0 + ;; + esac + fi + + case $prev in + -gpu|-iterations|-version|-level|-stage) + return 0 + ;; + -solver|-model) + _filedir $prototxts + return 0 + ;; + -weights) + _filedir $caffemodels + return 0 + ;; + -snapshot) + _filedir $solverstates + return 0 + ;; + -sighup_effect|-sigint_effect) + COMPREPLY=( $( compgen -W 'snapshot stop none' -- "$cur") ) + return 0 + ;; + -phase) + COMPREPLY=( $( compgen -W 'TRAIN TEST' -- "$cur") ) + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + esac + + # file completion on relevant files + _filedir "$caffefiles" + + return 0 +} +complete -F _caffe caffe + +# vim From 6382d67da1d2b5d9ebe92df8a20a8ac1947366ea Mon Sep 17 00:00:00 2001 From: An Tran Date: Fri, 12 Aug 2016 16:39:11 +0800 Subject: [PATCH 035/183] small improments in compute_image_mean --- tools/compute_image_mean.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp index 2035d515..417f5e4c 100644 --- a/tools/compute_image_mean.cpp +++ b/tools/compute_image_mean.cpp @@ -22,9 +22,11 @@ DEFINE_string(backend, "lmdb", "The backend {leveldb, lmdb} containing the images"); int main(int argc, char** argv) { +#ifdef USE_OPENCV ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; -#ifdef USE_OPENCV #ifndef GFLAGS_GFLAGS_H_ namespace gflags = google; #endif @@ -65,7 +67,7 @@ int main(int argc, char** argv) { for (int i = 0; i < size_in_datum; ++i) { sum_blob.add_data(0.); } - LOG(INFO) << "Starting Iteration"; + LOG(INFO) << "Starting iteration"; while (cursor->valid()) { Datum datum; datum.ParseFromString(cursor->value()); @@ -114,7 +116,7 @@ int main(int argc, char** argv) { for (int i = 0; i < dim; ++i) { mean_values[c] += sum_blob.data(dim * c + i); } - LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim; + LOG(INFO) << "mean_value channel [" << c << "]: " << mean_values[c] / dim; } #else LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV."; From d4a413cbf56f43a9d5a6ea3a5568447117cefff0 Mon 
Sep 17 00:00:00 2001 From: Sungjun HONG Date: Sun, 14 Aug 2016 17:51:56 +0900 Subject: [PATCH 036/183] Correct a mistake on math notation --- examples/net_surgery.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/net_surgery.ipynb b/examples/net_surgery.ipynb index d50d503b..217c2d1a 100644 --- a/examples/net_surgery.ipynb +++ b/examples/net_surgery.ipynb @@ -5479,7 +5479,7 @@ "\n", "Let's take the standard Caffe Reference ImageNet model \"CaffeNet\" and transform it into a fully convolutional net for efficient, dense inference on large inputs. This model generates a classification map that covers a given input size instead of a single classification. In particular a 8 $\\times$ 8 classification map on a 451 $\\times$ 451 input gives 64x the output in only 3x the time. The computation exploits a natural efficiency of convolutional network (convnet) structure by amortizing the computation of overlapping receptive fields.\n", "\n", - "To do so we translate the `InnerProduct` matrix multiplication layers of CaffeNet into `Convolutional` layers. This is the only change: the other layer types are agnostic to spatial size. Convolution is translation-invariant, activations are elementwise operations, and so on. The `fc6` inner product when carried out as convolution by `fc6-conv` turns into a 6 \\times 6 filter with stride 1 on `pool5`. Back in image space this gives a classification for each 227 $\\times$ 227 box with stride 32 in pixels. Remember the equation for output map / receptive field size, output = (input - kernel_size) / stride + 1, and work out the indexing details for a clear understanding." + "To do so we translate the `InnerProduct` matrix multiplication layers of CaffeNet into `Convolutional` layers. This is the only change: the other layer types are agnostic to spatial size. Convolution is translation-invariant, activations are elementwise operations, and so on. The `fc6` inner product when carried out as convolution by `fc6-conv` turns into a 6 $\\times$ 6 filter with stride 1 on `pool5`. Back in image space this gives a classification for each 227 $\\times$ 227 box with stride 32 in pixels. Remember the equation for output map / receptive field size, output = (input - kernel_size) / stride + 1, and work out the indexing details for a clear understanding." ] }, { From 5d594806aed7d44feb36cae12bacbaabfabf6fa8 Mon Sep 17 00:00:00 2001 From: Nitish Keskar Date: Mon, 15 Aug 2016 19:47:34 -0500 Subject: [PATCH 037/183] Fixing Typo In Sigmoid CIFAR-10 Examples There was a mismatch between the iterations interval in the comment and the actual code. 
--- examples/cifar10/cifar10_full_sigmoid_solver.prototxt | 2 +- examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cifar10/cifar10_full_sigmoid_solver.prototxt b/examples/cifar10/cifar10_full_sigmoid_solver.prototxt index 7dd3ecb9..a8e55399 100644 --- a/examples/cifar10/cifar10_full_sigmoid_solver.prototxt +++ b/examples/cifar10/cifar10_full_sigmoid_solver.prototxt @@ -17,7 +17,7 @@ momentum: 0.9 lr_policy: "step" gamma: 1 stepsize: 5000 -# Display every 200 iterations +# Display every 100 iterations display: 100 # The maximum number of iterations max_iter: 60000 diff --git a/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt b/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt index a57b280f..a4dabd67 100644 --- a/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt +++ b/examples/cifar10/cifar10_full_sigmoid_solver_bn.prototxt @@ -17,7 +17,7 @@ momentum: 0.9 lr_policy: "step" gamma: 1 stepsize: 5000 -# Display every 200 iterations +# Display every 100 iterations display: 100 # The maximum number of iterations max_iter: 60000 From 9029695ee358caa82116fc192cb4d505ea936274 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 18 Aug 2016 11:03:42 -0700 Subject: [PATCH 038/183] [build] set default BLAS include for OS X 10.11 the latest hunt for the ever-elusive vecLib/Accelerate --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 403e00a3..24894062 100644 --- a/Makefile +++ b/Makefile @@ -382,8 +382,11 @@ else LIBRARIES += cblas # 10.10 has accelerate while 10.9 has veclib XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep 'version' | sed 's/[^0-9]*\([0-9]\).*/\1/') + XCODE_CLT_GEQ_7 := $(shell [ $(XCODE_CLT_VER) -gt 6 ] && echo 1) XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1) - ifeq ($(XCODE_CLT_GEQ_6), 1) + ifeq ($(XCODE_CLT_GEQ_7), 1) + BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.11.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers + else ifeq ($(XCODE_CLT_GEQ_6), 1) BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ LDFLAGS += -framework Accelerate else From 1110d2ba7b52c35f898da8febdd53524761ecb97 Mon Sep 17 00:00:00 2001 From: Tianwei Shen Date: Tue, 26 Jul 2016 00:19:35 +0800 Subject: [PATCH 039/183] make cmake find cuDNN on Mac OS dylib instead of so on OS X --- cmake/Cuda.cmake | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 286a4280..eeeb7325 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -174,11 +174,18 @@ function(detect_cuDNN) PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE} DOC "Path to cuDNN include directory." 
) - get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} - DOC "Path to cuDNN library.") + # dynamic libs have different suffix in mac and linux + if(APPLE) + set(CUDNN_LIB_NAME "libcudnn.dylib") + else() + set(CUDNN_LIB_NAME "libcudnn.so") + endif() + get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} + PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} ${__libpath_hist}/../lib + DOC "Path to cuDNN library.") + if(CUDNN_INCLUDE AND CUDNN_LIBRARY) set(HAVE_CUDNN TRUE PARENT_SCOPE) set(CUDNN_FOUND TRUE PARENT_SCOPE) From 51c39b87738962c323c8bd05aa4c23ac97e1c030 Mon Sep 17 00:00:00 2001 From: Preston Parry Date: Sun, 28 Aug 2016 14:32:41 -0700 Subject: [PATCH 040/183] updates tense in docs "could" seems to imply for some reason that something is blocking one from calling the registered layers. "can" lays out more directly that a user can choose to do this. --- include/caffe/layer_factory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp index f385afcc..2369c132 100644 --- a/include/caffe/layer_factory.hpp +++ b/include/caffe/layer_factory.hpp @@ -1,6 +1,6 @@ /** * @brief A layer factory that allows one to register layers. - * During runtime, registered layers could be called by passing a LayerParameter + * During runtime, registered layers can be called by passing a LayerParameter * protobuffer to the CreateLayer function: * * LayerRegistry::CreateLayer(param); From 8797e7b3720d97afea24ad6f78b7811c57a3919d Mon Sep 17 00:00:00 2001 From: Preston Parry Date: Sun, 28 Aug 2016 14:34:42 -0700 Subject: [PATCH 041/183] fixes typo- duplicate "a a" --- include/caffe/solver.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 38259eda..eafcee32 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -12,7 +12,7 @@ namespace caffe { /** * @brief Enumeration of actions that a client of the Solver may request by * implementing the Solver's action request function, which a - * a client may optionally provide in order to request early termination + * client may optionally provide in order to request early termination * or saving a snapshot without exiting. In the executable caffe, this * mechanism is used to allow the snapshot to be saved when stopping * execution with a SIGINT (Ctrl-C). From cd54d9e0f96df65a4972306f29d042bc34c63077 Mon Sep 17 00:00:00 2001 From: Preston Parry Date: Sun, 28 Aug 2016 14:42:57 -0700 Subject: [PATCH 042/183] changes "c++" to "C++" for consistency --- include/caffe/solver_factory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/solver_factory.hpp b/include/caffe/solver_factory.hpp index cfff721a..a5b16073 100644 --- a/include/caffe/solver_factory.hpp +++ b/include/caffe/solver_factory.hpp @@ -15,7 +15,7 @@ * and its type is its C++ class name, but without the "Solver" at the end * ("MyAwesomeSolver" -> "MyAwesome"). 
* - * If the solver is going to be created simply by its constructor, in your c++ + * If the solver is going to be created simply by its constructor, in your C++ * file, add the following line: * * REGISTER_SOLVER_CLASS(MyAwesome); From 4024b82c7c8e9f12898becf7b3947e603a4dd0bb Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Mon, 29 Aug 2016 11:14:17 -0700 Subject: [PATCH 043/183] [TravisCI] - build protobuf3 GA --- scripts/travis/install-deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index ee16d36a..4e86ac73 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -56,7 +56,7 @@ else dh-autoreconf \ unzip - wget https://github.com/google/protobuf/archive/v3.0.0-beta-3.tar.gz -O protobuf3.tar.gz + wget https://github.com/google/protobuf/archive/3.0.0-GA.tar.gz -O protobuf3.tar.gz tar -xzf protobuf3.tar.gz -C $PROTOBUF3_DIR --strip 1 rm protobuf3.tar.gz cd $PROTOBUF3_DIR From b9c3c06c28dafce67c89603e8b73cf18057264eb Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sun, 14 Aug 2016 04:52:25 +0300 Subject: [PATCH 044/183] cmake: fix usage of INCLUDE_DIR/INCLUDE_DIRS in Dependencies.cmake --- cmake/Dependencies.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ae9ce8e4..bf882ce9 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -3,7 +3,7 @@ set(Caffe_LINKER_LIBS "") # ---[ Boost find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) -include_directories(SYSTEM ${Boost_INCLUDE_DIR}) +include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) # ---[ Threads @@ -25,7 +25,7 @@ include(cmake/ProtoBuf.cmake) # ---[ HDF5 find_package(HDF5 COMPONENTS HL REQUIRED) -include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) +include_directories(SYSTEM ${HDF5_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) # ---[ LMDB @@ -42,7 +42,7 @@ endif() # ---[ LevelDB if(USE_LEVELDB) find_package(LevelDB REQUIRED) - include_directories(SYSTEM ${LevelDB_INCLUDE}) + include_directories(SYSTEM ${LevelDB_INCLUDES}) list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES}) add_definitions(-DUSE_LEVELDB) endif() From a59e647117705236d8bcef46cc6d4e0c72b42804 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Mon, 15 Aug 2016 20:19:09 +0300 Subject: [PATCH 045/183] cmake/Templates: properly spell OpenCV CMake config file name --- cmake/Templates/CaffeConfig.cmake.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in index 73f57ac2..b58124aa 100644 --- a/cmake/Templates/CaffeConfig.cmake.in +++ b/cmake/Templates/CaffeConfig.cmake.in @@ -27,7 +27,7 @@ if(@USE_OPENCV@) if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core) message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}") - include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake) + include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVConfig.cmake) endif() else() From ba189d907d60b17cc24b54d1a22cb68ce6c11193 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sat, 20 Aug 2016 00:59:05 +0300 Subject: [PATCH 046/183] cmake: refactor deps detection, specify all dependencies in the exported caffe target This is the first step towards "modern" IMPORTED-targets-only CMake setup. 
The find_package modules still need to be rewritten and upstreamed in form of config exports where possible. --- CMakeLists.txt | 24 +++++++-- cmake/ConfigGen.cmake | 65 +--------------------- cmake/Cuda.cmake | 12 ++--- cmake/Dependencies.cmake | 81 +++++++++++++++------------- cmake/ProtoBuf.cmake | 4 +- cmake/Templates/CaffeConfig.cmake.in | 13 ++--- python/CMakeLists.txt | 6 +-- src/caffe/CMakeLists.txt | 13 +++-- src/gtest/CMakeLists.txt | 3 ++ 9 files changed, 94 insertions(+), 127 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9..cb25b43a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,8 +54,6 @@ if(USE_libstdcpp) message("-- Warning: forcing libstdc++ (controlled by USE_libstdcpp option in cmake)") endif() -add_definitions(-DGTEST_USE_OWN_TR1_TUPLE) - # ---[ Warnings caffe_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized) @@ -64,8 +62,26 @@ configure_file(cmake/Templates/caffe_config.h.in "${PROJECT_BINARY_DIR}/caffe_co # ---[ Includes set(Caffe_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) -include_directories(${Caffe_INCLUDE_DIR} ${PROJECT_BINARY_DIR}) -include_directories(BEFORE src) # This is needed for gtest. +set(Caffe_SRC_DIR ${PROJECT_SOURCE_DIR}/src) +include_directories(${PROJECT_BINARY_DIR}) + +# ---[ Includes & defines for CUDA + +# cuda_compile() does not have per-call dependencies or include pathes +# (cuda_compile() has per-call flags, but we set them here too for clarity) +# +# list(REMOVE_ITEM ...) invocations remove PRIVATE and PUBLIC keywords from collected definitions and include pathes +if(HAVE_CUDA) + # pass include pathes to cuda_include_directories() + set(Caffe_ALL_INCLUDE_DIRS ${Caffe_INCLUDE_DIRS}) + list(REMOVE_ITEM Caffe_ALL_INCLUDE_DIRS PRIVATE PUBLIC) + cuda_include_directories(${Caffe_INCLUDE_DIR} ${Caffe_SRC_DIR} ${Caffe_ALL_INCLUDE_DIRS}) + + # add definitions to nvcc flags directly + set(Caffe_ALL_DEFINITIONS ${Caffe_DEFINITIONS}) + list(REMOVE_ITEM Caffe_ALL_DEFINITIONS PRIVATE PUBLIC) + list(APPEND CUDA_NVCC_FLAGS ${Caffe_ALL_DEFINITIONS}) +endif() # ---[ Subdirectories add_subdirectory(src/gtest) diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 05637111..077d5b28 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -1,31 +1,4 @@ -################################################################################################ -# Helper function to fetch caffe includes which will be passed to dependent projects -# Usage: -# caffe_get_current_includes() -function(caffe_get_current_includes includes_variable) - get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) - caffe_convert_absolute_paths(current_includes) - - # remove at most one ${PROJECT_BINARY_DIR} include added for caffe_config.h - list(FIND current_includes ${PROJECT_BINARY_DIR} __index) - list(REMOVE_AT current_includes ${__index}) - - # removing numpy includes (since not required for client libs) - set(__toremove "") - foreach(__i ${current_includes}) - if(${__i} MATCHES "python") - list(APPEND __toremove ${__i}) - endif() - endforeach() - if(__toremove) - list(REMOVE_ITEM current_includes ${__toremove}) - endif() - - caffe_list_unique(current_includes) - set(${includes_variable} ${current_includes} PARENT_SCOPE) -endfunction() - ################################################################################################ # Helper function to get all list items that begin with given prefix # Usage: @@ -47,39 +20,15 @@ endfunction() function(caffe_generate_export_configs) 
set(install_cmake_suffix "share/Caffe") - # ---[ Configure build-tree CaffeConfig.cmake file ]--- - caffe_get_current_includes(Caffe_INCLUDE_DIRS) - - set(Caffe_DEFINITIONS "") if(NOT HAVE_CUDA) set(HAVE_CUDA FALSE) - list(APPEND Caffe_DEFINITIONS -DCPU_ONLY) - endif() - - if(USE_OPENCV) - list(APPEND Caffe_DEFINITIONS -DUSE_OPENCV) - endif() - - if(USE_LMDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LMDB) - if (ALLOW_LMDB_NOLOCK) - list(APPEND Caffe_DEFINITIONS -DALLOW_LMDB_NOLOCK) - endif() - endif() - - if(USE_LEVELDB) - list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB) endif() if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) - else() - list(APPEND DEFINITIONS -DUSE_CUDNN) endif() - if(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") - list(APPEND Caffe_DEFINITIONS -DUSE_MKL) - endif() + # ---[ Configure build-tree CaffeConfig.cmake file ]--- configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY) @@ -89,18 +38,6 @@ function(caffe_generate_export_configs) # ---[ Configure install-tree CaffeConfig.cmake file ]--- - # remove source and build dir includes - caffe_get_items_with_prefix(${PROJECT_SOURCE_DIR} Caffe_INCLUDE_DIRS __insource) - caffe_get_items_with_prefix(${PROJECT_BINARY_DIR} Caffe_INCLUDE_DIRS __inbinary) - list(REMOVE_ITEM Caffe_INCLUDE_DIRS ${__insource} ${__inbinary}) - - # add `install` include folder - set(lines - "get_filename_component(__caffe_include \"\${Caffe_CMAKE_DIR}/../../include\" ABSOLUTE)\n" - "list(APPEND Caffe_INCLUDE_DIRS \${__caffe_include})\n" - "unset(__caffe_include)\n") - string(REPLACE ";" "" Caffe_INSTALL_INCLUDE_DIR_APPEND_COMMAND ${lines}) - configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/cmake/CaffeConfig.cmake" @ONLY) # Install the CaffeConfig.cmake and export set to use with install-tree diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index eeeb7325..c6b0de8c 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -238,17 +238,17 @@ endif() set(HAVE_CUDA TRUE) message(STATUS "CUDA detected: " ${CUDA_VERSION}) -include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${CUDA_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) # cudnn detection if(USE_CUDNN) detect_cuDNN() if(HAVE_CUDNN) - add_definitions(-DUSE_CUDNN) - include_directories(SYSTEM ${CUDNN_INCLUDE}) - list(APPEND Caffe_LINKER_LIBS ${CUDNN_LIBRARY}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_CUDNN) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${CUDNN_INCLUDE}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${CUDNN_LIBRARY}) endif() endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index bf882ce9..6a127592 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,57 +1,67 @@ # This list is required for static linking and exported to CaffeConfig.cmake set(Caffe_LINKER_LIBS "") +set(Caffe_INCLUDE_DIRS "") +set(Caffe_DEFINITIONS "") # ---[ Boost find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) -include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES}) # ---[ Threads find_package(Threads REQUIRED) -list(APPEND Caffe_LINKER_LIBS ${CMAKE_THREAD_LIBS_INIT}) +list(APPEND Caffe_LINKER_LIBS PRIVATE ${CMAKE_THREAD_LIBS_INIT}) 
+ +# ---[ OpenMP +if(USE_OPENMP) + # TODO: use something exportable here + find_package(OpenMP REQUIRED) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() # ---[ Google-glog include("cmake/External/glog.cmake") -include_directories(SYSTEM ${GLOG_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${GLOG_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${GLOG_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${GLOG_LIBRARIES}) # ---[ Google-gflags include("cmake/External/gflags.cmake") -include_directories(SYSTEM ${GFLAGS_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${GFLAGS_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${GFLAGS_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${GFLAGS_LIBRARIES}) # ---[ Google-protobuf include(cmake/ProtoBuf.cmake) # ---[ HDF5 find_package(HDF5 COMPONENTS HL REQUIRED) -include_directories(SYSTEM ${HDF5_INCLUDE_DIRS}) -list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${HDF5_INCLUDE_DIRS}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) # ---[ LMDB if(USE_LMDB) find_package(LMDB REQUIRED) - include_directories(SYSTEM ${LMDB_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES}) - add_definitions(-DUSE_LMDB) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${LMDB_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${LMDB_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LMDB) if(ALLOW_LMDB_NOLOCK) - add_definitions(-DALLOW_LMDB_NOLOCK) + list(APPEND Caffe_DEFINITIONS PRIVATE -DALLOW_LMDB_NOLOCK) endif() endif() # ---[ LevelDB if(USE_LEVELDB) find_package(LevelDB REQUIRED) - include_directories(SYSTEM ${LevelDB_INCLUDES}) - list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES}) - add_definitions(-DUSE_LEVELDB) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${LevelDB_INCLUDES}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${LevelDB_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_LEVELDB) endif() # ---[ Snappy if(USE_LEVELDB) find_package(Snappy REQUIRED) - include_directories(SYSTEM ${Snappy_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) + list(APPEND Caffe_INCLUDE_DIRS PRIVATE ${Snappy_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PRIVATE ${Snappy_LIBRARIES}) endif() # ---[ CUDA @@ -63,8 +73,7 @@ if(NOT HAVE_CUDA) message(WARNING "-- CUDA is not detected by cmake. Building without it...") endif() - # TODO: remove this not cross platform define in future. Use caffe_config.h instead. 
- add_definitions(-DCPU_ONLY) + list(APPEND Caffe_DEFINITIONS PUBLIC -DCPU_ONLY) endif() # ---[ OpenCV @@ -73,10 +82,10 @@ if(USE_OPENCV) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) endif() - include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) - list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${OpenCV_INCLUDE_DIRS}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${OpenCV_LIBS}) message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") - add_definitions(-DUSE_OPENCV) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_OPENCV) endif() # ---[ BLAS @@ -86,26 +95,26 @@ if(NOT APPLE) if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") find_package(Atlas REQUIRED) - include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${Atlas_LIBRARIES}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES}) elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") find_package(OpenBLAS REQUIRED) - include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${OpenBLAS_LIB}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${OpenBLAS_LIB}) elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") find_package(MKL REQUIRED) - include_directories(SYSTEM ${MKL_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${MKL_LIBRARIES}) - add_definitions(-DUSE_MKL) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${MKL_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_MKL) endif() elseif(APPLE) find_package(vecLib REQUIRED) - include_directories(SYSTEM ${vecLib_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${vecLib_LINKER_LIBS}) + list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS}) if(VECLIB_FOUND) if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") - add_definitions(-DUSE_ACCELERATE) + list(APPEND Caffe_DEFINITIONS PUBLIC -DUSE_ACCELERATE) endif() endif() endif() @@ -149,9 +158,9 @@ if(BUILD_python) if(PYTHONLIBS_FOUND AND NUMPY_FOUND AND Boost_PYTHON_FOUND) set(HAVE_PYTHON TRUE) if(BUILD_python_layer) - add_definitions(-DWITH_PYTHON_LAYER) - include_directories(SYSTEM ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) - list(APPEND Caffe_LINKER_LIBS ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) + list(APPEND Caffe_DEFINITIONS PRIVATE -DWITH_PYTHON_LAYER) + list(APPEND Caffe_INCLUDE_DIRS PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} PUBLIC ${Boost_INCLUDE_DIRS}) + list(APPEND Caffe_LINKER_LIBS PRIVATE ${PYTHON_LIBRARIES} PUBLIC ${Boost_LIBRARIES}) endif() endif() endif() diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 73f647f5..8005b448 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -2,8 +2,8 @@ # the standard cmake script with version and python generation support find_package( Protobuf REQUIRED ) -include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) -list(APPEND Caffe_LINKER_LIBS ${PROTOBUF_LIBRARIES}) +list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${PROTOBUF_INCLUDE_DIR}) +list(APPEND Caffe_LINKER_LIBS PUBLIC ${PROTOBUF_LIBRARIES}) # As of Ubuntu 14.04 protoc is no longer a part of libprotobuf-dev package # and should be installed separately as in: sudo apt-get install protobuf-compiler diff --git a/cmake/Templates/CaffeConfig.cmake.in 
b/cmake/Templates/CaffeConfig.cmake.in index b58124aa..77c4059e 100644 --- a/cmake/Templates/CaffeConfig.cmake.in +++ b/cmake/Templates/CaffeConfig.cmake.in @@ -9,9 +9,9 @@ # After successful configuration the following variables # will be defined: # -# Caffe_INCLUDE_DIRS - Caffe include directories -# Caffe_LIBRARIES - libraries to link against -# Caffe_DEFINITIONS - a list of definitions to pass to compiler +# Caffe_LIBRARIES - IMPORTED targets to link against +# (There is no Caffe_INCLUDE_DIRS and Caffe_DEFINITIONS +# because they are specified in the IMPORTED target interface.) # # Caffe_HAVE_CUDA - signals about CUDA support # Caffe_HAVE_CUDNN - signals about cuDNN support @@ -39,9 +39,6 @@ endif() # Compute paths get_filename_component(Caffe_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) -set(Caffe_INCLUDE_DIRS "@Caffe_INCLUDE_DIRS@") - -@Caffe_INSTALL_INCLUDE_DIR_APPEND_COMMAND@ # Our library dependencies if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) @@ -49,11 +46,9 @@ if(NOT TARGET caffe AND NOT caffe_BINARY_DIR) endif() # List of IMPORTED libs created by CaffeTargets.cmake +# These targets already specify all needed definitions and include pathes set(Caffe_LIBRARIES caffe) -# Definitions -set(Caffe_DEFINITIONS "@Caffe_DEFINITIONS@") - # Cuda support variables set(Caffe_CPU_ONLY @CPU_ONLY@) set(Caffe_HAVE_CUDA @HAVE_CUDA@) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bf492a24..c53299d2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,13 +3,13 @@ if(NOT HAVE_PYTHON) return() endif() -include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${Boost_INCLUDE_DIRS}) file(GLOB_RECURSE python_srcs ${PROJECT_SOURCE_DIR}/python/*.cpp) add_library(pycaffe SHARED ${python_srcs}) -target_link_libraries(pycaffe ${Caffe_LINK} ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) -set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") caffe_default_properties(pycaffe) +set_target_properties(pycaffe PROPERTIES PREFIX "" OUTPUT_NAME "_caffe") +target_include_directories(pycaffe PUBLIC ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_link_libraries(pycaffe PUBLIC ${Caffe_LINK} ${PYTHON_LIBRARIES}) if(UNIX OR APPLE) set(__linkname "${PROJECT_SOURCE_DIR}/python/caffe/_caffe.so") diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 8a80c940..ed4d50be 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -4,8 +4,11 @@ caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_p # include python files either to force generation add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) -set(Caffe_LINKER_LIBS proto ${Caffe_LINKER_LIBS}) # note, crucial to prepend! caffe_default_properties(proto) +target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES}) +target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR}) + +list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend! 
# --[ Caffe library @@ -18,8 +21,13 @@ if(HAVE_CUDA) endif() add_library(caffe ${srcs}) -target_link_libraries(caffe proto ${Caffe_LINKER_LIBS}) caffe_default_properties(caffe) +target_link_libraries(caffe ${Caffe_LINKER_LIBS}) +target_include_directories(caffe ${Caffe_INCLUDE_DIRS} + PUBLIC + $ + $) +target_compile_definitions(caffe ${Caffe_DEFINITIONS}) set_target_properties(caffe PROPERTIES VERSION ${CAFFE_TARGET_VERSION} SOVERSION ${CAFFE_TARGET_SOVERSION} @@ -37,4 +45,3 @@ file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - diff --git a/src/gtest/CMakeLists.txt b/src/gtest/CMakeLists.txt index ef7ff7ed..e98254af 100644 --- a/src/gtest/CMakeLists.txt +++ b/src/gtest/CMakeLists.txt @@ -1,5 +1,8 @@ add_library(gtest STATIC EXCLUDE_FROM_ALL gtest.h gtest-all.cpp) caffe_default_properties(gtest) +target_include_directories(gtest PUBLIC ${Caffe_SRC_DIR}) +target_compile_definitions(gtest PUBLIC -DGTEST_USE_OWN_TR1_TUPLE) + #add_library(gtest_main gtest_main.cc) #target_link_libraries(gtest_main gtest) From 6200b915601e1f7b2ec6d4746dc143114722ec38 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sat, 20 Aug 2016 01:08:26 +0300 Subject: [PATCH 047/183] net.cpp: do not include test/test_caffe_main.hpp --- src/caffe/net.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 644cb7e9..a3408734 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -17,8 +17,6 @@ #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" -#include "caffe/test/test_caffe_main.hpp" - namespace caffe { template From f1b9da54598923c531e1a98c4f1546169165e441 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sun, 14 Aug 2016 04:57:22 +0300 Subject: [PATCH 048/183] cmake: add option to link with OpenMP Despite Caffe itself does not use OpenMP, explicitly linking to OpenMP should be done when one statically links to a BLAS library which uses OpenMP internally and does not provide proper CMake imported targets with proper dependencies (nobody this so far). 
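For illustration, here is how a hypothetical downstream project would consume
these flags (the project name and source file are made up; nothing below is
part of the patch). Linking against the exported target is enough to inherit
the PUBLIC include paths, definitions, and OpenMP options, so consumers need
no kludge of their own:

    # CMakeLists.txt of a hypothetical Caffe client
    cmake_minimum_required(VERSION 2.8.7)
    project(classifier CXX)

    # Loads CaffeConfig.cmake, which defines the IMPORTED caffe target.
    find_package(Caffe REQUIRED)

    add_executable(classifier main.cpp)
    # The target interface carries the PUBLIC definitions and link flags,
    # including the OpenMP flags when USE_OPENMP was enabled.
    target_link_libraries(classifier ${Caffe_LIBRARIES})
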
--- CMakeLists.txt | 1 + cmake/Dependencies.cmake | 17 +++++++++++++---- src/caffe/CMakeLists.txt | 3 +++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb25b43a..378b285c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ caffe_option(USE_OPENCV "Build with OpenCV support" ON) caffe_option(USE_LEVELDB "Build with levelDB" ON) caffe_option(USE_LMDB "Build with lmdb" ON) caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF) +caffe_option(USE_OPENMP "Link with OpenMP (when your BLAS wants OpenMP and you get linker errors)" OFF) # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6a127592..290c161b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,7 +1,8 @@ -# This list is required for static linking and exported to CaffeConfig.cmake +# These lists are later turned into target properties on main caffe library target set(Caffe_LINKER_LIBS "") set(Caffe_INCLUDE_DIRS "") set(Caffe_DEFINITIONS "") +set(Caffe_COMPILE_OPTIONS "") # ---[ Boost find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) @@ -14,10 +15,18 @@ list(APPEND Caffe_LINKER_LIBS PRIVATE ${CMAKE_THREAD_LIBS_INIT}) # ---[ OpenMP if(USE_OPENMP) - # TODO: use something exportable here + # Ideally, this should be provided by the BLAS library IMPORTED target. However, + # nobody does this, so we need to link to OpenMP explicitly and have the maintainer + # to flick the switch manually as needed. + # + # Moreover, OpenMP package does not provide an IMPORTED target as well, and the + # suggested way of linking to OpenMP is to append to CMAKE_{C,CXX}_FLAGS. + # However, this naïve method will force any user of Caffe to add the same kludge + # into their buildsystem again, so we put these options into per-target PUBLIC + # compile options and link flags, so that they will be exported properly. find_package(OpenMP REQUIRED) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + list(APPEND Caffe_LINKER_LIBS PRIVATE ${OpenMP_CXX_FLAGS}) + list(APPEND Caffe_COMPILE_OPTIONS PRIVATE ${OpenMP_CXX_FLAGS}) endif() # ---[ Google-glog diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index ed4d50be..7b25a98a 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -28,6 +28,9 @@ target_include_directories(caffe ${Caffe_INCLUDE_DIRS} $ $) target_compile_definitions(caffe ${Caffe_DEFINITIONS}) +if(Caffe_COMPILE_OPTIONS) + target_compile_options(caffe ${Caffe_COMPILE_OPTIONS}) +endif() set_target_properties(caffe PROPERTIES VERSION ${CAFFE_TARGET_VERSION} SOVERSION ${CAFFE_TARGET_SOVERSION} From 6ed799cb206c6b70bdd260d62e8ff3e077f5b635 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Wed, 24 Aug 2016 06:28:41 +0300 Subject: [PATCH 049/183] cmake/Templates: remove duplicated #cmakedefines from caffe_config.h.in Rationale: these are duplicated in CMakeLists code, and they cannot be removed from there because many definitions need to be exported to the library clients. See issue #4625. 
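For reference, configure_file() rewrites each #cmakedefine line according to
the variable's value at configure time, e.g.:

    #cmakedefine USE_LMDB   -->   #define USE_LMDB          (USE_LMDB is ON)
    #cmakedefine USE_LMDB   -->   /* #undef USE_LMDB */     (USE_LMDB is OFF)

With the same macros now attached to the caffe target as exported compile
definitions, keeping them in caffe_config.h as well would leave two places to
keep in sync, and clients of the installed library are expected to pick the
macros up from the exported flags.
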
--- cmake/Templates/caffe_config.h.in | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 8a31b43c..45465b98 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -4,16 +4,6 @@ /* Binaries directory */ #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" -/* NVIDA Cuda */ -#cmakedefine HAVE_CUDA - -/* NVIDA cuDNN */ -#cmakedefine HAVE_CUDNN -#cmakedefine USE_CUDNN - -/* NVIDA cuDNN */ -#cmakedefine CPU_ONLY - /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} @@ -27,12 +17,3 @@ #define EXAMPLES_SOURCE_DIR "examples/" #define CMAKE_EXT "" #endif - -/* Matlab */ -#cmakedefine HAVE_MATLAB - -/* IO libraries */ -#cmakedefine USE_OPENCV -#cmakedefine USE_LEVELDB -#cmakedefine USE_LMDB -#cmakedefine ALLOW_LMDB_NOLOCK From 9bc83e32b39e2c9bbf4bf20d69d4f215d73a414e Mon Sep 17 00:00:00 2001 From: Benedikt Wilbertz Date: Fri, 12 Aug 2016 22:33:06 +0200 Subject: [PATCH 050/183] fix layerSetUp of scale_layer to not add bias blob when already present --- src/caffe/layers/scale_layer.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp index ecdbb123..e652dad6 100644 --- a/src/caffe/layers/scale_layer.cpp +++ b/src/caffe/layers/scale_layer.cpp @@ -56,9 +56,17 @@ void ScaleLayer::LayerSetUp(const vector*>& bottom, bias_bottom_vec_.resize(1); bias_bottom_vec_[0] = bottom[0]; bias_layer_->SetUp(bias_bottom_vec_, top); - bias_param_id_ = this->blobs_.size(); - this->blobs_.resize(bias_param_id_ + 1); - this->blobs_[bias_param_id_] = bias_layer_->blobs()[0]; + if (this->blobs_.size() + bottom.size() < 3) { + // case: blobs.size == 1 && bottom.size == 1 + // or blobs.size == 0 && bottom.size == 2 + bias_param_id_ = this->blobs_.size(); + this->blobs_.resize(bias_param_id_ + 1); + this->blobs_[bias_param_id_] = bias_layer_->blobs()[0]; + } else { + // bias param already initialized + bias_param_id_ = this->blobs_.size() - 1; + bias_layer_->blobs()[0] = this->blobs_[bias_param_id_]; + } bias_propagate_down_.resize(1, false); } this->param_propagate_down_.resize(this->blobs_.size(), true); From cdcf2e07dba951774be7feb9d486b7f84ef0c0b1 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 9 Sep 2016 12:49:35 -0700 Subject: [PATCH 051/183] Benchmarking should not impact perf until timer is read --- src/caffe/util/benchmark.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/caffe/util/benchmark.cpp b/src/caffe/util/benchmark.cpp index 1d269c35..d994225f 100644 --- a/src/caffe/util/benchmark.cpp +++ b/src/caffe/util/benchmark.cpp @@ -44,7 +44,6 @@ void Timer::Stop() { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY CUDA_CHECK(cudaEventRecord(stop_gpu_, 0)); - CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); #else NO_GPU; #endif @@ -66,6 +65,7 @@ float Timer::MicroSeconds() { } if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY + CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); // Cuda only measure milliseconds @@ -89,6 +89,7 @@ float Timer::MilliSeconds() { } if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY + CUDA_CHECK(cudaEventSynchronize(stop_gpu_)); CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_, stop_gpu_)); #else From 50b5697a0e0b85921e3ea38e961984ea08f014c3 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 9 Sep 2016 12:57:09 -0700 Subject: [PATCH 
052/183] Avoids missing return values during build. --- src/caffe/layer_factory.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index e967bd61..f14253a5 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -67,6 +67,7 @@ shared_ptr > GetConvolutionLayer( #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -104,6 +105,7 @@ shared_ptr > GetPoolingLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -141,6 +143,7 @@ shared_ptr > GetLRNLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -164,6 +167,7 @@ shared_ptr > GetReLULayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -187,6 +191,7 @@ shared_ptr > GetSigmoidLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -210,6 +215,7 @@ shared_ptr > GetSoftmaxLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } @@ -233,6 +239,7 @@ shared_ptr > GetTanHLayer(const LayerParameter& param) { #endif } else { LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning } } From 04f9a77801af3233bacadcca178ee7d7a6406bd5 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sat, 27 Aug 2016 20:19:00 -0700 Subject: [PATCH 053/183] [docs] clarify handling of bias and scaling by BiasLayer, ScaleLayer A bias/scaling can be applied wherever desired by defining the respective layers, and `ScaleLayer` can handle both as a memory optimization. --- include/caffe/layers/batch_norm_layer.hpp | 8 +++----- include/caffe/layers/bias_layer.hpp | 10 +++++----- include/caffe/layers/scale_layer.hpp | 12 +++++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index 9b2d5126..c38c8410 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -27,11 +27,9 @@ namespace caffe { * param {lr_mult: 0} three times in the layer definition. * * Note that the original paper also included a per-channel learned bias and - * scaling factor. It is possible (though a bit cumbersome) to implement - * this in caffe using a single-channel DummyDataLayer filled with zeros, - * followed by a Convolution layer with output the same size as the current. - * This produces a channel-specific value that can be added or multiplied by - * the BatchNorm layer's output. + * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured + * with `bias_term: true` after each `BatchNormLayer` to handle both the bias + * and scaling factor. * * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network * Training by Reducing Internal Covariate Shift." 
arXiv preprint diff --git a/include/caffe/layers/bias_layer.hpp b/include/caffe/layers/bias_layer.hpp index eedc3aaa..9639c9cd 100644 --- a/include/caffe/layers/bias_layer.hpp +++ b/include/caffe/layers/bias_layer.hpp @@ -10,13 +10,13 @@ namespace caffe { /** - * @brief Computes a sum of two input Blobs, with the shape of the - * latter Blob "broadcast" to match the shape of the former. - * Equivalent to tiling the latter Blob, then computing the elementwise - * sum. + * @brief Computes a sum of two input Blobs, with the shape of the latter Blob + * "broadcast" to match the shape of the former. Equivalent to tiling + * the latter Blob, then computing the elementwise sum. * * The second input may be omitted, in which case it's learned as a parameter - * of the layer. + * of the layer. Note: in case bias and scaling are desired, both operations can + * be handled by `ScaleLayer` configured with `bias_term: true`. */ template class BiasLayer : public Layer { diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp index 924df2e5..45b714d4 100644 --- a/include/caffe/layers/scale_layer.hpp +++ b/include/caffe/layers/scale_layer.hpp @@ -12,13 +12,15 @@ namespace caffe { /** - * @brief Computes a product of two input Blobs, with the shape of the - * latter Blob "broadcast" to match the shape of the former. + * @brief Computes the elementwise product of two input Blobs, with the shape of + * the latter Blob "broadcast" to match the shape of the former. * Equivalent to tiling the latter Blob, then computing the elementwise - * product. + * product. Note: for efficiency and convenience, this layer can + * additionally perform a "broadcast" sum too when `bias_term: true` + * is set. * - * The second input may be omitted, in which case it's learned as a parameter - * of the layer. + * The latter, scale input may be omitted, in which case it's learned as + * parameter of the layer (as is the bias, if it is included). */ template class ScaleLayer: public Layer { From d195e605de5f6964eadeba467f5ad85d46841c87 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 9 Sep 2016 19:46:41 -0700 Subject: [PATCH 054/183] [docs] note CUDA 8 requirement for Ubuntu 16.04 --- docs/install_apt.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index 2976e3cd..3de5a494 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -9,14 +9,19 @@ title: Installation: Ubuntu sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler sudo apt-get install --no-install-recommends libboost-all-dev -**CUDA**: Install via the NVIDIA package instead of `apt-get` to be certain of the library and driver versions. -Install the library and latest driver separately; the driver bundled with the library is usually out-of-date. +**CUDA**: Install by `apt-get` or the NVIDIA `.run` package. +The NVIDIA package tends to follow more recent library and driver versions, but the installation is more manual. +If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date. This can be skipped for CPU-only installation. **BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance. 
**Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface. +**Compatibility notes, 16.04** + +CUDA 8 is required on Ubuntu 16.04. + **Remaining dependencies, 14.04** Everything is packaged in 14.04. From 3b6fd1d95b374b0484f32a4f86380714c456a293 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sat, 27 Aug 2016 20:23:13 -0700 Subject: [PATCH 055/183] [docs] identify batch norm layer blobs --- include/caffe/layers/batch_norm_layer.hpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index c38c8410..a26ad1a4 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -13,18 +13,19 @@ namespace caffe { * @brief Normalizes the input to have 0-mean and/or unit (1) variance across * the batch. * - * This layer computes Batch Normalization described in [1]. For - * each channel in the data (i.e. axis 1), it subtracts the mean and divides - * by the variance, where both statistics are computed across both spatial - * dimensions and across the different examples in the batch. + * This layer computes Batch Normalization as described in [1]. For each channel + * in the data (i.e. axis 1), it subtracts the mean and divides by the variance, + * where both statistics are computed across both spatial dimensions and across + * the different examples in the batch. * - * By default, during training time, the network is computing global mean/ - * variance statistics via a running average, which is then used at test - * time to allow deterministic outputs for each input. You can manually - * toggle whether the network is accumulating or using the statistics via the - * use_global_stats option. IMPORTANT: for this feature to work, you MUST - * set the learning rate to zero for all three parameter blobs, i.e., - * param {lr_mult: 0} three times in the layer definition. + * By default, during training time, the network is computing global + * mean/variance statistics via a running average, which is then used at test + * time to allow deterministic outputs for each input. You can manually toggle + * whether the network is accumulating or using the statistics via the + * use_global_stats option. IMPORTANT: for this feature to work, you MUST set + * the learning rate to zero for all three blobs, i.e., param {lr_mult: 0} three + * times in the layer definition. For reference, these three blobs are (0) + * mean, (1) variance, and (2) the moving average factor. * * Note that the original paper also included a per-channel learned bias and * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured From c8f446f640b12b0577063eca8fab004e73c0aefc Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Mon, 29 Aug 2016 23:42:58 -0700 Subject: [PATCH 056/183] batch norm: hide statistics from solver, simplifying layer definition batch norm statistics are not learnable parameters subject to solver updates, so they must be shielded from the solver. `BatchNorm` layer now masks its statistics for itself by zeroing parameter learning rates instead of relying on the layer definition. n.b. declaring `param`s for batch norm layers is no longer allowed. 
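Concretely, an old-style definition along these lines (a hypothetical
snippet; the layer and blob names are made up):

    layer {
      name: "bn1"
      type: "BatchNorm"
      bottom: "conv1"
      top: "conv1"
      param { lr_mult: 0 }  # (0) mean
      param { lr_mult: 0 }  # (1) variance
      param { lr_mult: 0 }  # (2) moving average factor
    }

can now simply be written without any param messages, since the layer zeroes
the learning rates on its statistics by itself:

    layer {
      name: "bn1"
      type: "BatchNorm"
      bottom: "conv1"
      top: "conv1"
    }
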
--- include/caffe/layers/batch_norm_layer.hpp | 6 ++---- src/caffe/layers/batch_norm_layer.cpp | 8 ++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/caffe/layers/batch_norm_layer.hpp b/include/caffe/layers/batch_norm_layer.hpp index a26ad1a4..43f7b28b 100644 --- a/include/caffe/layers/batch_norm_layer.hpp +++ b/include/caffe/layers/batch_norm_layer.hpp @@ -22,10 +22,8 @@ namespace caffe { * mean/variance statistics via a running average, which is then used at test * time to allow deterministic outputs for each input. You can manually toggle * whether the network is accumulating or using the statistics via the - * use_global_stats option. IMPORTANT: for this feature to work, you MUST set - * the learning rate to zero for all three blobs, i.e., param {lr_mult: 0} three - * times in the layer definition. For reference, these three blobs are (0) - * mean, (1) variance, and (2) the moving average factor. + * use_global_stats option. For reference, these statistics are kept in the + * layer's three blobs: (0) mean, (1) variance, and (2) moving average factor. * * Note that the original paper also included a per-channel learned bias and * scaling factor. To implement this in Caffe, define a `ScaleLayer` configured diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index a69d8f99..0b1037ed 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -34,6 +34,14 @@ void BatchNormLayer::LayerSetUp(const vector*>& bottom, this->blobs_[i]->mutable_cpu_data()); } } + // Mask statistics from optimization by setting local learning rates + // for mean, variance, and the bias correction to zero. + CHECK_EQ(this->layer_param_.param_size(), 0) + << "Cannot configure batch normalization statistics as layer parameters."; + for (int i = 0; i < this->blobs_.size(); ++i) { + ParamSpec* fixed_param_spec = this->layer_param_.add_param(); + fixed_param_spec->set_lr_mult(0.); + } } template From a8ec123c00723df0d0ad897e1eea32a29201c81b Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 9 Sep 2016 16:49:31 -0700 Subject: [PATCH 057/183] batch norm: auto-upgrade old layer definitions w/ param messages automatically strip old batch norm layer definitions including `param` messages. the batch norm layer used to require manually masking its state from the solver by setting `param { lr_mult: 0 }` messages for each of its statistics. this is now handled automatically by the layer. --- include/caffe/util/upgrade_proto.hpp | 6 +++++ src/caffe/util/upgrade_proto.cpp | 34 +++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/include/caffe/util/upgrade_proto.hpp b/include/caffe/util/upgrade_proto.hpp index 14e1936a..b145822a 100644 --- a/include/caffe/util/upgrade_proto.hpp +++ b/include/caffe/util/upgrade_proto.hpp @@ -65,6 +65,12 @@ bool NetNeedsInputUpgrade(const NetParameter& net_param); // Perform all necessary transformations to upgrade input fields into layers. void UpgradeNetInput(NetParameter* net_param); +// Return true iff the Net contains batch norm layers with manual local LRs. +bool NetNeedsBatchNormUpgrade(const NetParameter& net_param); + +// Perform all necessary transformations to upgrade batch norm layers. 
+void UpgradeNetBatchNorm(NetParameter* net_param); + // Return true iff the solver contains any old solver_type specified as enums bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param); diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index 9e186915..a0aacbe9 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -14,7 +14,8 @@ namespace caffe { bool NetNeedsUpgrade(const NetParameter& net_param) { return NetNeedsV0ToV1Upgrade(net_param) || NetNeedsV1ToV2Upgrade(net_param) - || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param); + || NetNeedsDataUpgrade(net_param) || NetNeedsInputUpgrade(net_param) + || NetNeedsBatchNormUpgrade(net_param); } bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { @@ -71,6 +72,14 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) { LOG(WARNING) << "Note that future Caffe releases will only support " << "input layers and not input fields."; } + // NetParameter uses old style batch norm layers; try to upgrade it. + if (NetNeedsBatchNormUpgrade(*param)) { + LOG(INFO) << "Attempting to upgrade batch norm layers using deprecated " + << "params: " << param_file; + UpgradeNetBatchNorm(param); + LOG(INFO) << "Successfully upgraded batch norm layers using deprecated " + << "params."; + } return success; } @@ -991,6 +1000,29 @@ void UpgradeNetInput(NetParameter* net_param) { net_param->clear_input_dim(); } +bool NetNeedsBatchNormUpgrade(const NetParameter& net_param) { + for (int i = 0; i < net_param.layer_size(); ++i) { + // Check if BatchNorm layers declare three parameters, as required by + // the previous BatchNorm layer definition. + if (net_param.layer(i).type() == "BatchNorm" + && net_param.layer(i).param_size() == 3) { + return true; + } + } + return false; +} + +void UpgradeNetBatchNorm(NetParameter* net_param) { + for (int i = 0; i < net_param->layer_size(); ++i) { + // Check if BatchNorm layers declare three parameters, as required by + // the previous BatchNorm layer definition. 
+ if (net_param->layer(i).type() == "BatchNorm" + && net_param->layer(i).param_size() == 3) { + net_param->mutable_layer(i)->clear_param(); + } + } +} + // Return true iff the solver contains any old solver_type specified as enums bool SolverNeedsTypeUpgrade(const SolverParameter& solver_param) { if (solver_param.has_solver_type()) { From fc8f3eba6fa06be2f55d1b576f46664e07f5d0a6 Mon Sep 17 00:00:00 2001 From: Youssef Kashef Date: Tue, 13 Sep 2016 15:52:39 +0200 Subject: [PATCH 058/183] fix comments in matlab classification demo --- matlab/demo/classification_demo.m | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/matlab/demo/classification_demo.m b/matlab/demo/classification_demo.m index 2b603329..435c0778 100644 --- a/matlab/demo/classification_demo.m +++ b/matlab/demo/classification_demo.m @@ -8,7 +8,7 @@ % % **************************************************************************** % For detailed documentation and usage on Caffe's Matlab interface, please -% refer to Caffe Interface Tutorial at +% refer to the Caffe Interface Tutorial at % http://caffe.berkeleyvision.org/tutorial/interfaces.html#matlab % **************************************************************************** % @@ -24,6 +24,7 @@ % $ export LD_LIBRARY_PATH=/opt/intel/mkl/lib/intel64:/usr/local/cuda-5.5/lib64 % $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 % Or the equivalent based on where things are installed on your system +% and what versions are installed. % % Usage: % im = imread('../../examples/images/cat.jpg'); @@ -39,7 +40,7 @@ % Data coming in from matlab needs to be in the order % [width, height, channels, images] % where width is the fastest dimension. -% Here is the rough matlab for putting image data into the correct +% Here is the rough matlab code for putting image data into the correct % format in W x H x C with BGR channels: % % permute channels from RGB to BGR % im_data = im(:, :, [3, 2, 1]); @@ -54,7 +55,7 @@ % If you have multiple images, cat them with cat(4, ...) -% Add caffe/matlab to you Matlab search PATH to use matcaffe +% Add caffe/matlab to your Matlab search PATH in order to use matcaffe if exist('../+caffe', 'dir') addpath('..'); else From eee3be15589e81b5385c7d0d02a151c789134905 Mon Sep 17 00:00:00 2001 From: Miguel Lloreda Date: Thu, 15 Sep 2016 17:28:02 -0400 Subject: [PATCH 059/183] Fixed typos in examples/cpp_classification/readme --- examples/cpp_classification/readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cpp_classification/readme.md b/examples/cpp_classification/readme.md index 0de2885b..4f683aa6 100644 --- a/examples/cpp_classification/readme.md +++ b/examples/cpp_classification/readme.md @@ -10,7 +10,7 @@ priority: 10 Caffe, at its core, is written in C++. It is possible to use the C++ API of Caffe to implement an image classification application similar -to the Python code presented in one of the Notebook example. To look +to the Python code presented in one of the Notebook examples. To look at a more general-purpose example of the Caffe C++ API, you should study the source code of the command line tool `caffe` in `tools/caffe.cpp`. @@ -19,7 +19,7 @@ study the source code of the command line tool `caffe` in `tools/caffe.cpp`. A simple C++ code is proposed in `examples/cpp_classification/classification.cpp`. For the sake of simplicity, this example does not support oversampling of a single -sample nor batching of multiple independant samples. 
This example is +sample nor batching of multiple independent samples. This example is not trying to reach the maximum possible classification throughput on a system, but special care was given to avoid unnecessary pessimization while keeping the code readable. From 2f55f42cff9147e69b1f5dff9232058d7b654eba Mon Sep 17 00:00:00 2001 From: Rok Mandeljc Date: Mon, 29 Jun 2015 15:48:43 +0200 Subject: [PATCH 060/183] matcaffe: allow destruction of individual networks and solvers --- matlab/+caffe/Net.m | 3 +++ matlab/+caffe/Solver.m | 3 +++ matlab/+caffe/private/caffe_.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m index e6295bba..349e060e 100644 --- a/matlab/+caffe/Net.m +++ b/matlab/+caffe/Net.m @@ -68,6 +68,9 @@ self.layer_names = self.attributes.layer_names; self.blob_names = self.attributes.blob_names; end + function delete (self) + caffe_('delete_net', self.hNet_self); + end function layer = layers(self, layer_name) CHECK(ischar(layer_name), 'layer_name must be a string'); layer = self.layer_vec(self.name2layer_index(layer_name)); diff --git a/matlab/+caffe/Solver.m b/matlab/+caffe/Solver.m index f8bdc4e2..2d3c98b2 100644 --- a/matlab/+caffe/Solver.m +++ b/matlab/+caffe/Solver.m @@ -36,6 +36,9 @@ self.test_nets(n) = caffe.Net(self.attributes.hNet_test_nets(n)); end end + function delete (self) + caffe_('delete_solver', self.hSolver_self); + end function iter = iter(self) iter = caffe_('solver_get_iter', self.hSolver_self); end diff --git a/matlab/+caffe/private/caffe_.cpp b/matlab/+caffe/private/caffe_.cpp index 1b1b2bff..bc04f417 100644 --- a/matlab/+caffe/private/caffe_.cpp +++ b/matlab/+caffe/private/caffe_.cpp @@ -197,6 +197,17 @@ static void get_solver(MEX_ARGS) { mxFree(solver_file); } +// Usage: caffe_('delete_solver', hSolver) +static void delete_solver(MEX_ARGS) { + mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), + "Usage: caffe_('delete_solver', hSolver)"); + Solver* solver = handle_to_ptr >(prhs[0]); + solvers_.erase(std::remove_if(solvers_.begin(), solvers_.end(), + [solver] (const shared_ptr< Solver > &solverPtr) { + return solverPtr.get() == solver; + }), solvers_.end()); +} + // Usage: caffe_('solver_get_attr', hSolver) static void solver_get_attr(MEX_ARGS) { mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), @@ -271,6 +282,17 @@ static void get_net(MEX_ARGS) { mxFree(phase_name); } +// Usage: caffe_('delete_solver', hSolver) +static void delete_net(MEX_ARGS) { + mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), + "Usage: caffe_('delete_solver', hNet)"); + Net* net = handle_to_ptr >(prhs[0]); + nets_.erase(std::remove_if(nets_.begin(), nets_.end(), + [net] (const shared_ptr< Net > &netPtr) { + return netPtr.get() == net; + }), nets_.end()); +} + // Usage: caffe_('net_get_attr', hNet) static void net_get_attr(MEX_ARGS) { mxCHECK(nrhs == 1 && mxIsStruct(prhs[0]), @@ -522,12 +544,14 @@ struct handler_registry { static handler_registry handlers[] = { // Public API functions { "get_solver", get_solver }, + { "delete_solver", delete_solver }, { "solver_get_attr", solver_get_attr }, { "solver_get_iter", solver_get_iter }, { "solver_restore", solver_restore }, { "solver_solve", solver_solve }, { "solver_step", solver_step }, { "get_net", get_net }, + { "delete_net", delete_net }, { "net_get_attr", net_get_attr }, { "net_forward", net_forward }, { "net_backward", net_backward }, From f96ccea124314d4ea1374e906fbd709d1dc43585 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 23 Sep 2016 11:22:48 -0700 Subject: [PATCH 061/183] 
[TravisCI] google/protobuf renamed the 3.0 branch --- scripts/travis/install-deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 4e86ac73..daef5c4a 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -56,7 +56,7 @@ else dh-autoreconf \ unzip - wget https://github.com/google/protobuf/archive/3.0.0-GA.tar.gz -O protobuf3.tar.gz + wget https://github.com/google/protobuf/archive/3.0.x.tar.gz -O protobuf3.tar.gz tar -xzf protobuf3.tar.gz -C $PROTOBUF3_DIR --strip 1 rm protobuf3.tar.gz cd $PROTOBUF3_DIR From 79a8c5210846f70108e5a2be1bedc95d9f8aea30 Mon Sep 17 00:00:00 2001 From: Ken Yu Date: Wed, 21 Sep 2016 16:19:17 +0800 Subject: [PATCH 062/183] Ignore Visual Studio Code files. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 53c1fb05..281ef326 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,9 @@ # PyCharm files .idea +# Visual Studio Code files +.vscode + # OSX dir files .DS_Store From ce6ac831b96725bd770eaec5c0f743e423e355fd Mon Sep 17 00:00:00 2001 From: Benedikt Wilbertz Date: Thu, 29 Sep 2016 21:55:58 +0200 Subject: [PATCH 063/183] slightly relax batch norm check --- src/caffe/layers/batch_norm_layer.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index 0b1037ed..e661abb1 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -36,11 +36,15 @@ void BatchNormLayer::LayerSetUp(const vector*>& bottom, } // Mask statistics from optimization by setting local learning rates // for mean, variance, and the bias correction to zero. 
- CHECK_EQ(this->layer_param_.param_size(), 0) - << "Cannot configure batch normalization statistics as layer parameters."; for (int i = 0; i < this->blobs_.size(); ++i) { - ParamSpec* fixed_param_spec = this->layer_param_.add_param(); - fixed_param_spec->set_lr_mult(0.); + if (this->layer_param_.param_size() == i) { + ParamSpec* fixed_param_spec = this->layer_param_.add_param(); + fixed_param_spec->set_lr_mult(0.f); + } else { + CHECK_EQ(this->layer_param_.param(i).lr_mult(), 0.f) + << "Cannot configure batch normalization statistics as layer " + << "parameters."; + } } } From 08ca70326966ad24b012ca8084c8baba5b1a23b5 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 30 Sep 2016 18:18:47 -0700 Subject: [PATCH 064/183] NV changed path to cudnn --- scripts/travis/install-deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index daef5c4a..1900b16d 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -84,7 +84,7 @@ if $WITH_CUDA ; then rm $CUDA_REPO_PKG if $WITH_CUDNN ; then - ML_REPO_PKG=nvidia-machine-learning-repo_4.0-2_amd64.deb + ML_REPO_PKG=nvidia-machine-learning-repo-ubuntu1404_4.0-2_amd64.deb wget http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/$ML_REPO_PKG dpkg -i $ML_REPO_PKG fi From c97a964a23f0ddd455c619537e208f117ae77743 Mon Sep 17 00:00:00 2001 From: Kun Wang Date: Wed, 5 Oct 2016 18:59:07 +0800 Subject: [PATCH 065/183] fix typo in pascal_multilabel_datalayers.py --- examples/pycaffe/layers/pascal_multilabel_datalayers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pycaffe/layers/pascal_multilabel_datalayers.py b/examples/pycaffe/layers/pascal_multilabel_datalayers.py index 68e4fa79..9420cb32 100644 --- a/examples/pycaffe/layers/pascal_multilabel_datalayers.py +++ b/examples/pycaffe/layers/pascal_multilabel_datalayers.py @@ -20,7 +20,7 @@ class PascalMultilabelDataLayerSync(caffe.Layer): """ - This is a simple syncronous datalayer for training a multilabel model on + This is a simple synchronous datalayer for training a multilabel model on PASCAL. """ @@ -33,7 +33,7 @@ def setup(self, bottom, top): # params is a python dictionary with layer parameters. params = eval(self.param_str) - # Check the paramameters for validity. + # Check the parameters for validity. check_params(params) # store input as class variables @@ -207,7 +207,7 @@ def check_params(params): def print_info(name, params): """ - Ouput some info regarding the class + Output some info regarding the class """ print "{} initialized for split: {}, with bs: {}, im_shape: {}.".format( name, From cdd2d0ee9ed42200b6ab8b52c0213bb5916b46c4 Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 5 Oct 2016 13:12:04 +0100 Subject: [PATCH 066/183] Fix: docs/yum_install.md glog broken link fixes the broken glog link in yum_install.md which is currently returning a 404. 
--- docs/install_yum.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install_yum.md b/docs/install_yum.md index 2104912e..38bf7255 100644 --- a/docs/install_yum.md +++ b/docs/install_yum.md @@ -15,7 +15,7 @@ title: Installation: RHEL / Fedora / CentOS **Remaining dependencies, if not found** # glog - wget https://google-glog.googlecode.com/files/glog-0.3.3.tar.gz + wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/google-glog/glog-0.3.3.tar.gz tar zxvf glog-0.3.3.tar.gz cd glog-0.3.3 ./configure From 553a645f1d6f950bf1a36284bb13b5fc7c3bacdc Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Oct 2016 22:29:56 -0400 Subject: [PATCH 067/183] pytest fix: Files created with NamedTemporary files cannot be opened on Windows --- python/caffe/test/test_net.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index e1090934..a0739fba 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -173,12 +173,12 @@ class TestLevels(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, blobs): net_blobs = [b for b in net.blobs.keys() if 'data' not in b] @@ -238,12 +238,12 @@ class TestStages(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, blobs): net_blobs = [b for b in net.blobs.keys() if 'data' not in b] @@ -320,12 +320,12 @@ class TestAllInOne(unittest.TestCase): """ def setUp(self): - self.f = tempfile.NamedTemporaryFile(mode='w+') + self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False) self.f.write(self.TEST_NET) - self.f.flush() + self.f.close() def tearDown(self): - self.f.close() + os.remove(self.f.name) def check_net(self, net, outputs): self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10]) From 95a436c601a04af620a0e166393d3ff695905bc4 Mon Sep 17 00:00:00 2001 From: max argus Date: Thu, 25 Aug 2016 09:20:24 +0000 Subject: [PATCH 068/183] Fix: made load_hd5 check blob dims by default. Size checks are needed for loading parameters to avoid strange bugs when loading data we continue to reshape. 
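
The semantics of the new `reshape` argument, sketched in NumPy rather than the
real C++ helper (the function and variable names below are illustrative only):

    import numpy as np

    def load_dataset(source, target_shape, reshape=False):
        # params must keep their shape; only data loading may reshape
        source = np.asarray(source)
        if not reshape and source.shape != tuple(target_shape):
            raise ValueError('Cannot load blob; shape mismatch: source %s, '
                             'target %s' % (source.shape, tuple(target_shape)))
        return source

    load_dataset(np.ones((2, 3)), (2, 3))                # ok: shapes agree
    load_dataset(np.ones((6, 5)), (2, 3), reshape=True)  # ok: data may reshape
    # load_dataset(np.ones((6, 5)), (2, 3))              # raises ValueError
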
--- include/caffe/util/hdf5.hpp | 4 +-- src/caffe/layers/hdf5_data_layer.cpp | 3 +- src/caffe/test/test_hdf5_output_layer.cpp | 10 ++++--- src/caffe/test/test_hdf5data_layer.cpp | 2 +- src/caffe/util/hdf5.cpp | 34 +++++++++++++++++++---- 5 files changed, 39 insertions(+), 14 deletions(-) diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp index ce568c5e..71549c1c 100644 --- a/include/caffe/util/hdf5.hpp +++ b/include/caffe/util/hdf5.hpp @@ -13,12 +13,12 @@ namespace caffe { template void hdf5_load_nd_dataset_helper( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + Blob* blob, bool reshape); template void hdf5_load_nd_dataset( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); + Blob* blob, bool reshape = false); template void hdf5_save_nd_dataset( diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2f13dc64..00991290 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -39,8 +39,9 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { for (int i = 0; i < top_size; ++i) { hdf_blobs_[i] = shared_ptr >(new Blob()); + // Allow reshape here, as we are loading data not params hdf5_load_nd_dataset(file_id, this->layer_param_.top(i).c_str(), - MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get()); + MIN_DATA_DIM, MAX_DATA_DIM, hdf_blobs_[i].get(), true); } herr_t status = H5Fclose(file_id); diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index 3833ebff..2bc2de1e 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -77,10 +77,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { H5P_DEFAULT); ASSERT_GE(file_id, 0)<< "Failed to open HDF5 file" << this->input_file_name_; + // Allow reshape here as we are loading data not params + bool reshape = true; hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - this->blob_data_); + this->blob_data_, reshape); hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - this->blob_label_); + this->blob_label_, reshape); herr_t status = H5Fclose(file_id); EXPECT_GE(status, 0)<< "Failed to close HDF5 file " << this->input_file_name_; @@ -105,12 +107,12 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { Blob* blob_data = new Blob(); hdf5_load_nd_dataset(file_id, HDF5_DATA_DATASET_NAME, 0, 4, - blob_data); + blob_data, reshape); this->CheckBlobEqual(*(this->blob_data_), *blob_data); Blob* blob_label = new Blob(); hdf5_load_nd_dataset(file_id, HDF5_DATA_LABEL_NAME, 0, 4, - blob_label); + blob_label, reshape); this->CheckBlobEqual(*(this->blob_label_), *blob_label); status = H5Fclose(file_id); diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 8884ce95..e0fd6213 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -70,7 +70,7 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { int height = 6; int width = 5; - // Test that the layer setup got the correct parameters. + // Test that the layer setup gives correct parameters. 
HDF5DataLayer layer(param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_data_->num(), batch_size); diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp index 7730e76a..0003f1b3 100644 --- a/src/caffe/util/hdf5.cpp +++ b/src/caffe/util/hdf5.cpp @@ -9,7 +9,7 @@ namespace caffe { template void hdf5_load_nd_dataset_helper( hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob) { + Blob* blob, bool reshape) { // Verify that the dataset exists. CHECK(H5LTfind_dataset(file_id, dataset_name_)) << "Failed to find HDF5 dataset " << dataset_name_; @@ -56,17 +56,38 @@ void hdf5_load_nd_dataset_helper( LOG(FATAL) << "Datatype class unknown"; } + vector blob_dims(dims.size()); for (int i = 0; i < dims.size(); ++i) { blob_dims[i] = dims[i]; } - blob->Reshape(blob_dims); + + if (reshape) { + blob->Reshape(blob_dims); + } else { + if (blob_dims != blob->shape()) { + // create shape string for error message + ostringstream stream; + int count = 1; + for (int i = 0; i < blob_dims.size(); ++i) { + stream << blob_dims[i] << " "; + count = count * blob_dims[i]; + } + stream << "(" << count << ")"; + string source_shape_string = stream.str(); + + CHECK(blob_dims == blob->shape()) << "Cannot load blob from hdf5; shape " + << "mismatch. Source shape is " << source_shape_string + << " target shape is " << blob->shape_string(); + } + } } template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + int min_dim, int max_dim, Blob* blob, bool reshape) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob, + reshape); herr_t status = H5LTread_dataset_float( file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_; @@ -74,8 +95,9 @@ void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, template <> void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, - int min_dim, int max_dim, Blob* blob) { - hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob); + int min_dim, int max_dim, Blob* blob, bool reshape) { + hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob, + reshape); herr_t status = H5LTread_dataset_double( file_id, dataset_name_, blob->mutable_cpu_data()); CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_; From 197d11a0e1be7ad35714eb38d9b391e1cd39af39 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 27 Oct 2016 00:41:03 -0700 Subject: [PATCH 069/183] sigmoid cross-entropy loss: add GPU forward for full GPU mode close #3004 --- .../sigmoid_cross_entropy_loss_layer.hpp | 2 ++ .../sigmoid_cross_entropy_loss_layer.cpp | 2 +- .../sigmoid_cross_entropy_loss_layer.cu | 36 +++++++++++++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index 598dca5f..6452ea51 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -59,6 +59,8 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { /// @copydoc SigmoidCrossEntropyLossLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); /** * @brief Computes the sigmoid 
cross-entropy loss error gradient w.r.t. the
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
index 10ac9470..eb77a9c2 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -68,7 +68,7 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
 }
 
 #ifdef CPU_ONLY
-STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward);
+STUB_GPU(SigmoidCrossEntropyLossLayer);
 #endif
 
 INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer);
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
index 046cb9d3..7cb982d2 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
@@ -5,6 +5,39 @@
 
 namespace caffe {
 
+template <typename Dtype>
+__global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads,
+          const Dtype* input_data, const Dtype* target, Dtype* loss) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
+    loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
+        log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+  }
+}
+
+template <typename Dtype>
+void SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  // The forward pass computes the sigmoid outputs.
+  sigmoid_bottom_vec_[0] = bottom[0];
+  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
+  // Compute the loss (negative log likelihood)
+  const int count = bottom[0]->count();
+  const int num = bottom[0]->num();
+  // Stable version of loss computation from input data
+  const Dtype* input_data = bottom[0]->gpu_data();
+  const Dtype* target = bottom[1]->gpu_data();
+  // Since this memory is not used for anything until it is overwritten
+  // on the backward pass, we use it here to avoid having to allocate new GPU
+  // memory to accumulate intermediate results in the kernel.
+  Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
+      CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data);
+  Dtype loss;
+  caffe_gpu_asum(count, loss_data, &loss);
+  top[0]->mutable_cpu_data()[0] = loss / num;
+}
+
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
     const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
@@ -28,7 +61,6 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
   }
 }
 
-INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer);
-
+INSTANTIATE_LAYER_GPU_FUNCS(SigmoidCrossEntropyLossLayer);
 
 }  // namespace caffe

From f59dc97b090259f54801d620b6b10ad1fb1542e2 Mon Sep 17 00:00:00 2001
From: nihui
Date: Tue, 1 Nov 2016 14:02:52 +0800
Subject: [PATCH 070/183] add the missing star in comment

a trivial commit which adds the missing star ;)
---
 src/caffe/layers/rnn_layer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp
index f62ae8c7..8c2fa22e 100644
--- a/src/caffe/layers/rnn_layer.cpp
+++ b/src/caffe/layers/rnn_layer.cpp
@@ -215,7 +215,7 @@ void RNNLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
   }
 
   // Add layers to compute
-  //     o_t := \tanh( W_ho h_t + b_o)
+  //     o_t := \tanh( W_ho * h_t + b_o)
   //            = \tanh( W_ho_h_t )
   {
     LayerParameter* o_neuron_param = net_param->add_layer();

From 0d20df51901550f1b7eb2d56e0a84df5d6e2f029 Mon Sep 17 00:00:00 2001
From: baecchi
Date: Tue, 1 Nov 2016 16:15:51 +0100
Subject: [PATCH 071/183] corrected typo in accuracy_layer.hpp: MaxTopBlos ->
 MaxTopBlobs
---
 include/caffe/layers/accuracy_layer.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp
index fe2adb93..a9ad3225 100644
--- a/include/caffe/layers/accuracy_layer.hpp
+++ b/include/caffe/layers/accuracy_layer.hpp
@@ -39,7 +39,7 @@ class AccuracyLayer : public Layer<Dtype> {
   // If there are two top blobs, then the second blob will contain
   // accuracies per class.
   virtual inline int MinTopBlobs() const { return 1; }
-  virtual inline int MaxTopBlos() const { return 2; }
+  virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
   /**

From 3b443eacb30d8f4b3e551707faeebeeb15e77960 Mon Sep 17 00:00:00 2001
From: "Jonathan R. Williford"
Date: Fri, 28 Oct 2016 10:39:44 +0200
Subject: [PATCH 072/183] Add Github issue template to curb misuse.

For information on Github issue templates, see:
https://github.com/blog/2111-issue-and-pull-request-templates

The template has been revised according to discussion with @shelhamer
and @willyd on pull request BVLC/caffe#4914.
---
 .github/ISSUE_TEMPLATE.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE.md

diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 00000000..d78a3dc3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,19 @@
+Please use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) for usage, installation, or modeling questions, or other requests for help.
+_Do not post such requests to Issues._ Doing so interferes with the development of Caffe.
+
+Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue.
+
+### Issue summary
+
+
+### Steps to reproduce
+
+If you are having difficulty building Caffe or training a model, please ask the caffe-users mailing list.
If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command. + +### Your system configuration +Operating system: +Compiler: +CUDA version (if applicable): +CUDNN version (if applicable): +BLAS: +Python or MATLAB version (for pycaffe and matcaffe respectively): From 20feab5771ae5cbb257cfec85e0b98da06269068 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Fri, 11 Nov 2016 07:38:14 +0000 Subject: [PATCH 073/183] Put quotes around titles in YAML front matter. The colon produces errors unless the title is in quotes. This causes the minor issue of the HTML title not being set. See: https://github.com/jekyll/jekyll/issues/549 --- docs/install_apt.md | 2 +- docs/install_osx.md | 2 +- docs/install_yum.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index 3de5a494..e95b0227 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -1,5 +1,5 @@ --- -title: Installation: Ubuntu +title: "Installation: Ubuntu" --- # Ubuntu Installation diff --git a/docs/install_osx.md b/docs/install_osx.md index 6405d8ad..a2da82f0 100644 --- a/docs/install_osx.md +++ b/docs/install_osx.md @@ -1,5 +1,5 @@ --- -title: Installation: OS X +title: "Installation: OS X" --- # OS X Installation diff --git a/docs/install_yum.md b/docs/install_yum.md index 38bf7255..842fbd64 100644 --- a/docs/install_yum.md +++ b/docs/install_yum.md @@ -1,5 +1,5 @@ --- -title: Installation: RHEL / Fedora / CentOS +title: "Installation: RHEL / Fedora / CentOS" --- # RHEL / Fedora / CentOS Installation From aaf7b6b17fdded6f6489eaf84a4d336b3344c356 Mon Sep 17 00:00:00 2001 From: davidbrai Date: Mon, 14 Nov 2016 22:10:27 +0200 Subject: [PATCH 074/183] support solver resumes in parse_log.py Currently parse_log.py skips all non timestamped lines only once. When resuming a solver and appending to the same log file, it creates more non timestamped log lines. This change allows the script to silently skip those lines. --- tools/extra/parse_log.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py index 375b0db7..017306b5 100755 --- a/tools/extra/parse_log.py +++ b/tools/extra/parse_log.py @@ -48,8 +48,13 @@ def parse_log(path_to_log): # iteration continue - time = extract_seconds.extract_datetime_from_line(line, - logfile_year) + try: + time = extract_seconds.extract_datetime_from_line(line, + logfile_year) + except ValueError: + # Skip lines with bad formatting, for example when resuming solver + continue + seconds = (time - start_time).total_seconds() learning_rate_match = regex_learning_rate.search(line) From c6ab96596d9eae01c2c403487dc8be8e3edc8fbb Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Tue, 15 Nov 2016 11:19:37 -0800 Subject: [PATCH 075/183] sigmoid cross-entropy loss: ignore selected targets by `ignore_label` sig-ce learns to ignore by zeroing out the loss/diff at targets equal to the configured `ignore_label`. n.b. as of now the loss/diff are not properly normalized when there are ignored targets. sig-ce loss should adopt the same normalization options as softmax loss. 
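
The masking is easy to state in NumPy (an illustrative re-implementation of the
layer's stable CPU formula, not the layer code itself; the division by batch
size reflects the normalization caveat above):

    import numpy as np

    def sig_ce_loss(x, t, ignore_label=-1):
        keep = t.astype(int) != ignore_label  # ignored targets drop out
        loss = -(x * (t - (x >= 0)) - np.log1p(np.exp(x - 2 * x * (x >= 0))))
        return np.where(keep, loss, 0).sum() / x.shape[0]

    x = np.array([[1.5, -0.3], [0.2, 4.0]])  # predictions (logits)
    t = np.array([[1.0, -1.0], [0.0, 1.0]])  # -1 marks ignored targets
    print(sig_ce_loss(x, t))
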
--- .../sigmoid_cross_entropy_loss_layer.hpp | 5 ++++ .../sigmoid_cross_entropy_loss_layer.cpp | 19 +++++++++++++ .../sigmoid_cross_entropy_loss_layer.cu | 23 +++++++++++++++ .../test_sigmoid_cross_entropy_loss_layer.cpp | 28 +++++++++++++++++++ 4 files changed, 75 insertions(+) diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index 6452ea51..a9fe33c8 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -105,6 +105,11 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { vector*> sigmoid_bottom_vec_; /// top vector holder to call the underlying SigmoidLayer::Forward vector*> sigmoid_top_vec_; + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; }; } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index eb77a9c2..21b64c28 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -14,6 +14,12 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( sigmoid_top_vec_.clear(); sigmoid_top_vec_.push_back(sigmoid_output_.get()); sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); + + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } } template @@ -39,6 +45,10 @@ void SigmoidCrossEntropyLossLayer::Forward_cpu( const Dtype* target = bottom[1]->cpu_data(); Dtype loss = 0; for (int i = 0; i < count; ++i) { + const int target_value = static_cast(target[i]); + if (has_ignore_label_ && target_value == ignore_label_) { + continue; + } loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); } @@ -64,6 +74,15 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( // Scale down gradient const Dtype loss_weight = top[0]->cpu_diff()[0]; caffe_scal(count, loss_weight / num, bottom_diff); + // Zero out gradient of ignored targets. + if (has_ignore_label_) { + for (int i = 0; i < count; ++i) { + const int target_value = static_cast(target[i]); + if (target_value == ignore_label_) { + bottom_diff[i] = 0; + } + } + } } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 7cb982d2..39eb0506 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -14,6 +14,17 @@ __global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads, } } +template +__global__ void SigmoidCrossEntropyLossIgnoreGPU(const int count, + const int ignore_label, const Dtype* target, Dtype* reference) { + CUDA_KERNEL_LOOP(index, count) { + const int target_value = static_cast(target[index]); + if (target_value == ignore_label) { + reference[index] = 0; + } + } +} + template void SigmoidCrossEntropyLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -33,6 +44,12 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( // NOLINT_NEXT_LINE(whitespace/operators) SigmoidCrossEntropyLossForwardGPU<<>>(count, input_data, target, loss_data); + // Zero out loss of ignored targets. 
+ if (has_ignore_label_) { + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidCrossEntropyLossIgnoreGPU<<>>(count, ignore_label_, target, loss_data); + } Dtype loss; caffe_gpu_asum(count, loss_data, &loss); top[0]->mutable_cpu_data()[0] = loss / num; @@ -58,6 +75,12 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( // Scale down gradient const Dtype loss_weight = top[0]->cpu_diff()[0]; caffe_gpu_scal(count, loss_weight / num, bottom_diff); + // Zero out gradient of ignored targets. + if (has_ignore_label_) { + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidCrossEntropyLossIgnoreGPU<<>>(count, ignore_label_, target, bottom_diff); + } } } diff --git a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp index 5dfd7656..1bd5f937 100644 --- a/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp @@ -116,5 +116,33 @@ TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestGradient) { this->blob_top_vec_, 0); } +TYPED_TEST(SigmoidCrossEntropyLossLayerTest, TestIgnoreGradient) { + typedef typename TypeParam::Dtype Dtype; + FillerParameter data_filler_param; + data_filler_param.set_std(1); + GaussianFiller data_filler(data_filler_param); + data_filler.Fill(this->blob_bottom_data_); + LayerParameter layer_param; + LossParameter* loss_param = layer_param.mutable_loss_param(); + loss_param->set_ignore_label(-1); + Dtype* target = this->blob_bottom_targets_->mutable_cpu_data(); + const int count = this->blob_bottom_targets_->count(); + // Ignore half of targets, then check that diff of this half is zero, + // while the other half is nonzero. + caffe_set(count / 2, Dtype(-1), target); + SigmoidCrossEntropyLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector propagate_down(2); + propagate_down[0] = true; + propagate_down[1] = false; + layer.Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); + const Dtype* diff = this->blob_bottom_data_->cpu_diff(); + for (int i = 0; i < count / 2; ++i) { + EXPECT_FLOAT_EQ(diff[i], 0.); + EXPECT_NE(diff[i + count / 2], 0.); + } +} + } // namespace caffe From 6486a7b795b70fa9a6597e975577d3ec9cc146bd Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 16 Nov 2016 13:15:06 -0800 Subject: [PATCH 076/183] docs: Guillaume Dumont is the Windows maintainer --- docs/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index 4aac7c42..edfddcfb 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -14,7 +14,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi - [Ubuntu installation](install_apt.html) *the standard platform* - [OS X installation](install_osx.html) - [RHEL / CentOS / Fedora installation](install_yum.html) -- [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Microsoft* +- [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont* - [OpenCL](https://github.com/BVLC/caffe/tree/opencl) *see the OpenCL branch led by Fabian Tschopp* **Overview**: From abcb973f5840c7b97ee3c7c910556ef0c5910baf Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 16 Nov 2016 13:26:37 -0800 Subject: [PATCH 077/183] docs: include AWS AMI pointer --- docs/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/installation.md 
b/docs/installation.md index edfddcfb..3254be3d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -16,6 +16,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi - [RHEL / CentOS / Fedora installation](install_yum.html) - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont* - [OpenCL](https://github.com/BVLC/caffe/tree/opencl) *see the OpenCL branch led by Fabian Tschopp* +- [AWS AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-caffe) *pre-configured for AWS* **Overview**: From 3d62e3cc9da66dbf3328567d0f30d5183b318d81 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 16 Nov 2016 20:39:42 -0800 Subject: [PATCH 078/183] sigmoid cross-entropy loss: normalize loss by different schemes sig-ce loss handles all the same normalizations as the softmax loss; refer to #3296 for more detail. this preserves the default normalization for sig-ce loss: batch size. --- .../sigmoid_cross_entropy_loss_layer.hpp | 11 ++++ .../sigmoid_cross_entropy_loss_layer.cpp | 60 ++++++++++++++++--- .../sigmoid_cross_entropy_loss_layer.cu | 57 +++++++++++------- src/caffe/proto/caffe.proto | 4 +- 4 files changed, 102 insertions(+), 30 deletions(-) diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index a9fe33c8..3d925244 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -97,6 +97,13 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. If normalization_mode is VALID, the count of valid + /// outputs will be read from valid_count, unless it is -1 in which case + /// all outputs are assumed to be valid. + virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count); + /// The internal SigmoidLayer used to map predictions to probabilities. shared_ptr > sigmoid_layer_; /// sigmoid_output stores the output of the SigmoidLayer. @@ -110,6 +117,10 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { bool has_ignore_label_; /// The label indicating that an instance should be ignored. int ignore_label_; + /// How to normalize the loss. + LossParameter_NormalizationMode normalization_; + Dtype normalizer_; + int outer_num_, inner_num_; }; } // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 21b64c28..99fa3eb6 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -1,3 +1,4 @@ +#include #include #include "caffe/layers/sigmoid_cross_entropy_loss_layer.hpp" @@ -20,17 +21,60 @@ void SigmoidCrossEntropyLossLayer::LayerSetUp( if (has_ignore_label_) { ignore_label_ = this->layer_param_.loss_param().ignore_label(); } + if (this->layer_param_.loss_param().has_normalization()) { + normalization_ = this->layer_param_.loss_param().normalization(); + } else if (this->layer_param_.loss_param().has_normalize()) { + normalization_ = this->layer_param_.loss_param().normalize() ? 
+        LossParameter_NormalizationMode_VALID :
+        LossParameter_NormalizationMode_BATCH_SIZE;
+  } else {
+    normalization_ = LossParameter_NormalizationMode_BATCH_SIZE;
+  }
 }
 
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::Reshape(bottom, top);
+  outer_num_ = bottom[0]->shape(0);  // batch size
+  inner_num_ = bottom[0]->count(1);  // instance size: |output| == |target|
   CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
       "SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
   sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
 }
 
+// TODO(shelhamer) loss normalization should be pulled up into LossLayer,
+// instead of duplicated here and in SoftMaxWithLossLayer
+template <typename Dtype>
+Dtype SigmoidCrossEntropyLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
+}
+
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -38,21 +82,22 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
   sigmoid_bottom_vec_[0] = bottom[0];
   sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
   // Compute the loss (negative log likelihood)
-  const int count = bottom[0]->count();
-  const int num = bottom[0]->num();
   // Stable version of loss computation from input data
   const Dtype* input_data = bottom[0]->cpu_data();
   const Dtype* target = bottom[1]->cpu_data();
+  int valid_count = 0;
   Dtype loss = 0;
-  for (int i = 0; i < count; ++i) {
+  for (int i = 0; i < bottom[0]->count(); ++i) {
     const int target_value = static_cast<int>(target[i]);
     if (has_ignore_label_ && target_value == ignore_label_) {
       continue;
     }
     loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
         log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+    ++valid_count;
   }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+  normalizer_ = get_normalizer(normalization_, valid_count);
+  top[0]->mutable_cpu_data()[0] = loss / normalizer_;
 }
 
 template <typename Dtype>
@@ -66,14 +111,10 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
   if (propagate_down[0]) {
     // First, compute the diff
     const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
     const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
     const Dtype* target = bottom[1]->cpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
     caffe_sub(count, sigmoid_output_data, target, bottom_diff);
-    // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_scal(count, loss_weight / num, bottom_diff);
     // Zero out gradient of ignored targets.
if (has_ignore_label_) { for (int i = 0; i < count; ++i) { @@ -83,6 +124,9 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } } } + // Scale down gradient + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_; + caffe_scal(count, loss_weight, bottom_diff); } } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 39eb0506..b9877e6a 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -5,26 +5,38 @@ namespace caffe { + template __global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads, - const Dtype* input_data, const Dtype* target, Dtype* loss) { + const Dtype* input_data, const Dtype* target, Dtype* loss, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { CUDA_KERNEL_LOOP(i, nthreads) { - loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); + const int target_value = static_cast(target[i]); + if (has_ignore_label_ && target_value == ignore_label_) { + loss[i] = 0; + counts[i] = 0; + } else { + loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) - + log(1 + exp(input_data[i] - 2 * input_data[i] * + (input_data[i] >= 0))); + counts[i] = 1; + } } } template -__global__ void SigmoidCrossEntropyLossIgnoreGPU(const int count, - const int ignore_label, const Dtype* target, Dtype* reference) { - CUDA_KERNEL_LOOP(index, count) { - const int target_value = static_cast(target[index]); +__global__ void SigmoidCrossEntropyLossIgnoreDiffGPU(const int count, + const int ignore_label, const Dtype* target, Dtype* diff) { + CUDA_KERNEL_LOOP(i, count) { + const int target_value = static_cast(target[i]); if (target_value == ignore_label) { - reference[index] = 0; + diff[i] = 0; } } } + template void SigmoidCrossEntropyLossLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { @@ -33,7 +45,6 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); // Compute the loss (negative log likelihood) const int count = bottom[0]->count(); - const int num = bottom[0]->num(); // Stable version of loss computation from input data const Dtype* input_data = bottom[0]->gpu_data(); const Dtype* target = bottom[1]->gpu_data(); @@ -41,18 +52,23 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( // on the backward pass, we use it here to avoid having to allocate new GPU // memory to accumulate intermediate results in the kernel. Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + Dtype* count_data = bottom[1]->mutable_gpu_diff(); + Dtype valid_count; // NOLINT_NEXT_LINE(whitespace/operators) SigmoidCrossEntropyLossForwardGPU<<>>(count, input_data, target, loss_data); - // Zero out loss of ignored targets. - if (has_ignore_label_) { - // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidCrossEntropyLossIgnoreGPU<<>>(count, ignore_label_, target, loss_data); + CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data, + has_ignore_label_, ignore_label_, count_data); + // Only launch another CUDA kernel if we actually need the valid count. 
+ if (normalization_ == LossParameter_NormalizationMode_VALID && + has_ignore_label_) { + caffe_gpu_asum(count, count_data, &valid_count); + } else { + valid_count = count; } Dtype loss; caffe_gpu_asum(count, loss_data, &loss); - top[0]->mutable_cpu_data()[0] = loss / num; + normalizer_ = get_normalizer(normalization_, valid_count); + top[0]->mutable_cpu_data()[0] = loss / normalizer_; } template @@ -66,21 +82,20 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( if (propagate_down[0]) { // First, compute the diff const int count = bottom[0]->count(); - const int num = bottom[0]->num(); const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); const Dtype* target = bottom[1]->gpu_data(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); caffe_copy(count, sigmoid_output_data, bottom_diff); caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff); - // Scale down gradient - const Dtype loss_weight = top[0]->cpu_diff()[0]; - caffe_gpu_scal(count, loss_weight / num, bottom_diff); // Zero out gradient of ignored targets. if (has_ignore_label_) { // NOLINT_NEXT_LINE(whitespace/operators) - SigmoidCrossEntropyLossIgnoreGPU<<<<>>(count, ignore_label_, target, bottom_diff); } + // Scale down gradient + Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_; + caffe_gpu_scal(count, loss_weight, bottom_diff); } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 6940a705..0b2768b7 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -434,7 +434,7 @@ message LossParameter { optional int32 ignore_label = 1; // How to normalize the loss for loss layers that aggregate across batches, // spatial dimensions, or other dimensions. Currently only implemented in - // SoftmaxWithLoss layer. + // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers. enum NormalizationMode { // Divide by the number of examples in the batch times spatial dimensions. // Outputs that receive the ignore label will NOT be ignored in computing @@ -448,6 +448,8 @@ message LossParameter { // Do not normalize the loss. NONE = 3; } + // For historical reasons, the default normalization for + // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. optional NormalizationMode normalization = 3 [default = VALID]; // Deprecated. Ignored if normalization is specified. 
If normalization // is not specified, then setting this to false will be equivalent to From 2cf9dd3750073ce8a119f4a71cc41eeef63e0748 Mon Sep 17 00:00:00 2001 From: chenzy Date: Fri, 18 Nov 2016 10:28:13 +0800 Subject: [PATCH 079/183] Add missing spaces besides equal signs in batch_norm_layer.cpp --- src/caffe/layers/batch_norm_layer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index e661abb1..0a08ed4c 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -27,7 +27,7 @@ void BatchNormLayer::LayerSetUp(const vector*>& bottom, sz.push_back(channels_); this->blobs_[0].reset(new Blob(sz)); this->blobs_[1].reset(new Blob(sz)); - sz[0]=1; + sz[0] = 1; this->blobs_[2].reset(new Blob(sz)); for (int i = 0; i < 3; ++i) { caffe_set(this->blobs_[i]->count(), Dtype(0), @@ -61,7 +61,7 @@ void BatchNormLayer::Reshape(const vector*>& bottom, variance_.Reshape(sz); temp_.ReshapeLike(*bottom[0]); x_norm_.ReshapeLike(*bottom[0]); - sz[0]=bottom[0]->shape(0); + sz[0] = bottom[0]->shape(0); batch_sum_multiplier_.Reshape(sz); int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0)); From e52451de914312b80a83459cb160c2f72a5b4fea Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Mon, 21 Nov 2016 09:35:57 -0800 Subject: [PATCH 080/183] solver: check and set type to reconcile class and proto the solver checks its proto type (SolverParameter.type) on instantiation: - if the proto type is unspecified it's set according to the class type `Solver::type()` - if the proto type and class type conflict, the solver dies loudly this helps avoid accidental instantiation of a different solver type than intended when the solver def and class differ. guaranteed type information in the SolverParameter will simplify multi-solver coordination too. --- include/caffe/solver.hpp | 2 ++ src/caffe/solver.cpp | 12 ++++++++++++ src/caffe/test/test_gradient_based_solver.cpp | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index eafcee32..ef38d6e4 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -108,6 +108,8 @@ class Solver { virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0; void DisplayOutputBlobs(const int net_id); void UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss); + /// Harmonize solver class type with configured proto type. + void CheckType(SolverParameter* param); SolverParameter param_; int iter_; diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ece3913e..ae6a5a36 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -38,9 +38,21 @@ Solver::Solver(const string& param_file, const Solver* root_solver) requested_early_exit_(false) { SolverParameter param; ReadSolverParamsFromTextFileOrDie(param_file, ¶m); + CheckType(¶m); Init(param); } +template +void Solver::CheckType(SolverParameter* param) { + // Harmonize solver class type with configured type to avoid confusion. 
+ if (param->has_type()) { + CHECK_EQ(param->type(), this->type()) + << "Solver type must agree with instantiated solver class."; + } else { + param->set_type(this->type()); + } +} + template void Solver::Init(const SolverParameter& param) { CHECK(Caffe::root_solver() || root_solver_) diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 975a8f0f..e81caea2 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -694,6 +694,11 @@ TYPED_TEST(SGDSolverTest, TestSnapshotShare) { } } +TYPED_TEST(SGDSolverTest, TestSolverType) { + this->TestLeastSquaresUpdate(); + EXPECT_NE(this->solver_->type(), string("")); + EXPECT_EQ(this->solver_->type(), this->solver_->param().type()); +} template class AdaGradSolverTest : public GradientBasedSolverTest { From 48e73c780295e56699ad71232a24c8b459c8fe01 Mon Sep 17 00:00:00 2001 From: Zylphrex Date: Mon, 21 Nov 2016 13:11:34 -0500 Subject: [PATCH 081/183] Checks inside Xcode for latest OSX SDK (#4840) OS X: build with latest SDK by default --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 24894062..ccc4d8b9 100644 --- a/Makefile +++ b/Makefile @@ -192,12 +192,12 @@ ifeq ($(USE_LMDB), 1) LIBRARIES += lmdb endif ifeq ($(USE_OPENCV), 1) - LIBRARIES += opencv_core opencv_highgui opencv_imgproc + LIBRARIES += opencv_core opencv_highgui opencv_imgproc ifeq ($(OPENCV_VERSION), 3) LIBRARIES += opencv_imgcodecs endif - + endif PYTHON_LIBRARIES ?= boost_python python2.7 WARNINGS := -Wall -Wno-sign-compare @@ -385,7 +385,7 @@ else XCODE_CLT_GEQ_7 := $(shell [ $(XCODE_CLT_VER) -gt 6 ] && echo 1) XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1) ifeq ($(XCODE_CLT_GEQ_7), 1) - BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.11.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers + BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/$(shell ls /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/ | sort | tail -1)/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers else ifeq ($(XCODE_CLT_GEQ_6), 1) BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ LDFLAGS += -framework Accelerate From db6cf0a728cad63c93b345f2203f3ad1f5d5c2f4 Mon Sep 17 00:00:00 2001 From: Nico Galoppo Date: Mon, 21 Nov 2016 11:03:52 -0800 Subject: [PATCH 082/183] Fix Python net drawing script --- python/caffe/draw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 9eecf6d7..e4fd7aac 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -104,11 +104,11 @@ def get_layer_label(layer, rankdir): pooling_types_dict[layer.pooling_param.pool], layer.type, separator, - layer.pooling_param.kernel_size, + layer.pooling_param.kernel_size[0] if len(layer.pooling_param.kernel_size._values) else 1, separator, - layer.pooling_param.stride, + layer.pooling_param.stride[0] if len(layer.pooling_param.stride._values) else 1, separator, - layer.pooling_param.pad) + layer.pooling_param.pad[0] if len(layer.pooling_param.pad._values) else 0) else: node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type) return node_label From 
2e59864d4f35bf60ddf859185f4e0d8fd940f238 Mon Sep 17 00:00:00 2001 From: hmybmny Date: Thu, 24 Nov 2016 18:17:13 +0800 Subject: [PATCH 083/183] fix error link --- docs/install_apt.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index e95b0227..bc1566b0 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -33,8 +33,8 @@ Everything is packaged in 14.04. These dependencies need manual installation in 12.04. # glog - wget https://google-glog.googlecode.com/files/glog-0.3.3.tar.gz - tar zxvf glog-0.3.3.tar.gz + wget https://github.com/google/glog/archive/v0.3.3.tar.gz + tar zxvf v0.3.3.tar.gz cd glog-0.3.3 ./configure make && make install From b644a87c842702de8291c97fa0e418797092fe41 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 25 Nov 2016 12:49:53 -0800 Subject: [PATCH 084/183] Revert "solver: check and set type to reconcile class and proto" as pointed out by #5028 this does not achieve what it intended, and furthermore causes trouble with direct solver instantiation. revert commit e52451de914312b80a83459cb160c2f72a5b4fea --- include/caffe/solver.hpp | 2 -- src/caffe/solver.cpp | 12 ------------ src/caffe/test/test_gradient_based_solver.cpp | 5 ----- 3 files changed, 19 deletions(-) diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index ef38d6e4..eafcee32 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -108,8 +108,6 @@ class Solver { virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0; void DisplayOutputBlobs(const int net_id); void UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss); - /// Harmonize solver class type with configured proto type. - void CheckType(SolverParameter* param); SolverParameter param_; int iter_; diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ae6a5a36..ece3913e 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -38,21 +38,9 @@ Solver::Solver(const string& param_file, const Solver* root_solver) requested_early_exit_(false) { SolverParameter param; ReadSolverParamsFromTextFileOrDie(param_file, ¶m); - CheckType(¶m); Init(param); } -template -void Solver::CheckType(SolverParameter* param) { - // Harmonize solver class type with configured type to avoid confusion. 
- if (param->has_type()) { - CHECK_EQ(param->type(), this->type()) - << "Solver type must agree with instantiated solver class."; - } else { - param->set_type(this->type()); - } -} - template void Solver::Init(const SolverParameter& param) { CHECK(Caffe::root_solver() || root_solver_) diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index e81caea2..975a8f0f 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -694,11 +694,6 @@ TYPED_TEST(SGDSolverTest, TestSnapshotShare) { } } -TYPED_TEST(SGDSolverTest, TestSolverType) { - this->TestLeastSquaresUpdate(); - EXPECT_NE(this->solver_->type(), string("")); - EXPECT_EQ(this->solver_->type(), this->solver_->param().type()); -} template class AdaGradSolverTest : public GradientBasedSolverTest { From db6643232cc95ba79f2a21ad98ef15725ee576d6 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Sun, 27 Nov 2016 09:13:42 +0000 Subject: [PATCH 085/183] fix many typos by using codespell --- cmake/Targets.cmake | 2 +- examples/02-fine-tuning.ipynb | 2 +- examples/mnist/train_lenet_docker.sh | 2 +- examples/pycaffe/tools.py | 4 ++-- matlab/+caffe/private/caffe_.cpp | 2 +- matlab/CMakeLists.txt | 2 +- scripts/cpp_lint.py | 6 +++--- src/caffe/layers/crop_layer.cpp | 2 +- src/caffe/layers/crop_layer.cu | 2 +- src/caffe/layers/hdf5_data_layer.cpp | 4 ++-- src/caffe/proto/caffe.proto | 4 ++-- src/caffe/test/CMakeLists.txt | 2 +- src/caffe/test/test_euclidean_loss_layer.cpp | 2 +- src/gtest/gtest-all.cpp | 4 ++-- src/gtest/gtest.h | 2 +- tools/extra/plot_log.gnuplot.example | 2 +- 16 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake index a796d005..2cb11584 100644 --- a/cmake/Targets.cmake +++ b/cmake/Targets.cmake @@ -94,7 +94,7 @@ function(caffe_pickup_caffe_sources root) caffe_convert_absolute_paths(test_srcs) caffe_convert_absolute_paths(test_cuda) - # propogate to parent scope + # propagate to parent scope set(srcs ${srcs} PARENT_SCOPE) set(cuda ${cuda} PARENT_SCOPE) set(test_srcs ${test_srcs} PARENT_SCOPE) diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb index 07ca8df4..f44eaf9a 100644 --- a/examples/02-fine-tuning.ipynb +++ b/examples/02-fine-tuning.ipynb @@ -1141,7 +1141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "So we did finetuning and it is awesome. Let's take a look at what kind of results we are able to get with a longer, more complete run of the style recognition dataset. Note: the below URL might be occassionally down because it is run on a research machine.\n", + "So we did finetuning and it is awesome. Let's take a look at what kind of results we are able to get with a longer, more complete run of the style recognition dataset. Note: the below URL might be occasionally down because it is run on a research machine.\n", "\n", "http://demo.vislab.berkeleyvision.org/" ] diff --git a/examples/mnist/train_lenet_docker.sh b/examples/mnist/train_lenet_docker.sh index 32cf1c8e..e946ba0f 100755 --- a/examples/mnist/train_lenet_docker.sh +++ b/examples/mnist/train_lenet_docker.sh @@ -25,7 +25,7 @@ set -e # executed. 
# # In order to provide additional flexibility, the following shell (environment) -# variables can be used to controll the execution of each of the phases: +# variables can be used to control the execution of each of the phases: # # DOWNLOAD_DATA: Enable (1) or disable (0) the downloading of the MNIST dataset # CREATE_LMDB: Enable (1) or disable (0) the creation of the LMDB database diff --git a/examples/pycaffe/tools.py b/examples/pycaffe/tools.py index 88b1834a..7f6c2d83 100644 --- a/examples/pycaffe/tools.py +++ b/examples/pycaffe/tools.py @@ -26,7 +26,7 @@ def set_scale(self, scale): def preprocess(self, im): """ - preprocess() emulate the pre-processing occuring in the vgg16 caffe + preprocess() emulate the pre-processing occurring in the vgg16 caffe prototxt. """ @@ -75,7 +75,7 @@ def __init__(self, testnet_prototxt_path="testnet.prototxt", # looks: self.sp['display'] = '25' self.sp['snapshot'] = '2500' - self.sp['snapshot_prefix'] = '"snapshot"' # string withing a string! + self.sp['snapshot_prefix'] = '"snapshot"' # string within a string! # learning rate policy self.sp['lr_policy'] = '"fixed"' diff --git a/matlab/+caffe/private/caffe_.cpp b/matlab/+caffe/private/caffe_.cpp index 1b1b2bff..4e466e66 100644 --- a/matlab/+caffe/private/caffe_.cpp +++ b/matlab/+caffe/private/caffe_.cpp @@ -44,7 +44,7 @@ void mxCHECK_FILE_EXIST(const char* file) { // The pointers to caffe::Solver and caffe::Net instances static vector > > solvers_; static vector > > nets_; -// init_key is generated at the beginning and everytime you call reset +// init_key is generated at the beginning and every time you call reset static double init_key = static_cast(caffe_rng_rand()); /** ----------------------------------------------------------------- diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index f420df8d..987730d9 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -20,7 +20,7 @@ if(NOT BUILD_SHARED_LIBS AND build_using MATCHES Matlab) message(FATAL_ERROR "Matlab MEX interface (with default mex options file) can only be built if caffe is compiled as shared library. Please enable 'BUILD_SHARED_LIBS' in CMake. Aternativelly you can switch to Octave compiler.") endif() -# helper function to set proper mex file extention +# helper function to set proper mex file extension function(caffe_fetch_and_set_proper_mexext mexfile_variable) execute_process(COMMAND ${Matlab_mexext} OUTPUT_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE res OUTPUT_VARIABLE ext) if(res MATCHES 0) diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index 14c76ecd..6ec4fb76 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -4460,7 +4460,7 @@ def UpdateIncludeState(filename, include_state, io=codecs): io: The io factory to use to read the file. Provided for testability. Returns: - True if a header was succesfully added. False otherwise. + True if a header was successfully added. False otherwise. """ headerfile = None try: @@ -4532,7 +4532,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # Let's copy the include_state so it is only messed up within this function. include_state = include_state.copy() - # Did we find the header for this file (if any) and succesfully load it? + # Did we find the header for this file (if any) and successfully load it? header_found = False # Use the absolute path so that matching works properly. 
@@ -4833,7 +4833,7 @@ def ParseArguments(args): try: _valid_extensions = set(val.split(',')) except ValueError: - PrintUsage('Extensions must be comma seperated list.') + PrintUsage('Extensions must be comma separated list.') if not filenames: PrintUsage('No files were specified.') diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index aecdcd63..d36b61ca 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -85,7 +85,7 @@ void CropLayer::crop_copy(const vector*>& bottom, src_data, dest_data, is_forward); } } else { - // We are at the last dimensions, which is stored continously in memory + // We are at the last dimensions, which is stored continuously in memory for (int i = 0; i < top[0]->shape(cur_dim); ++i) { // prepare index vector reduced(red) and with offsets(off) std::vector ind_red(cur_dim, 0); diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index f78cecbb..6ea32d21 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -39,7 +39,7 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, src_data, dest_data, is_forward); } } else { - // We are at the last two dimensions, which are stored continously in memory + // We are at the last two dimensions, which are stored continuously in memory // With (N,C,H,W) // (0,1,2,3) cur_dim -> H // cur_dim+1 -> W diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 2f13dc64..c957451a 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -61,10 +61,10 @@ void HDF5DataLayer::LoadHDF5FileData(const char* filename) { // Shuffle if needed. if (this->layer_param_.hdf5_data_param().shuffle()) { std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) + DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows (shuffled)"; } else { - DLOG(INFO) << "Successully loaded " << hdf_blobs_[0]->shape(0) << " rows"; + DLOG(INFO) << "Successfully loaded " << hdf_blobs_[0]->shape(0) << " rows"; } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 0b2768b7..430a0dea 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -418,7 +418,7 @@ message TransformationParameter { optional uint32 crop_size = 3 [default = 0]; // mean_file and mean_value cannot be specified at the same time optional string mean_file = 4; - // if specified can be repeated once (would substract it from all the channels) + // if specified can be repeated once (would subtract it from all the channels) // or can be repeated the same number of times as channels // (would subtract them from the corresponding channel) repeated float mean_value = 5; @@ -1396,6 +1396,6 @@ message PReLUParameter { // Initial value of a_i. Default is a_i=0.25 for all i. optional FillerParameter filler = 1; - // Whether or not slope paramters are shared across channels. + // Whether or not slope parameters are shared across channels. 
optional bool channel_shared = 2 [default = false]; } diff --git a/src/caffe/test/CMakeLists.txt b/src/caffe/test/CMakeLists.txt index 35a803f2..d8afc30b 100644 --- a/src/caffe/test/CMakeLists.txt +++ b/src/caffe/test/CMakeLists.txt @@ -1,7 +1,7 @@ # The option allows to include in build only selected test files and exclude all others # Usage example: # cmake -DBUILD_only_tests="common,net,blob,im2col_kernel" -set(BUILD_only_tests "" CACHE STRING "Blank or comma-separated list of test files to build without 'test_' prefix and extention") +set(BUILD_only_tests "" CACHE STRING "Blank or comma-separated list of test files to build without 'test_' prefix and extension") caffe_leave_only_selected_tests(test_srcs ${BUILD_only_tests}) caffe_leave_only_selected_tests(test_cuda ${BUILD_only_tests}) diff --git a/src/caffe/test/test_euclidean_loss_layer.cpp b/src/caffe/test/test_euclidean_loss_layer.cpp index f253f9fd..b026f5b2 100644 --- a/src/caffe/test/test_euclidean_loss_layer.cpp +++ b/src/caffe/test/test_euclidean_loss_layer.cpp @@ -39,7 +39,7 @@ class EuclideanLossLayerTest : public MultiDeviceTest { void TestForward() { // Get the loss without a specified objective weight -- should be - // equivalent to explicitly specifiying a weight of 1. + // equivalent to explicitly specifying a weight of 1. LayerParameter layer_param; EuclideanLossLayer layer_weight_1(layer_param); layer_weight_1.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); diff --git a/src/gtest/gtest-all.cpp b/src/gtest/gtest-all.cpp index 92619741..81cdb578 100644 --- a/src/gtest/gtest-all.cpp +++ b/src/gtest/gtest-all.cpp @@ -2697,7 +2697,7 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // Utility functions for encoding Unicode text (wide strings) in // UTF-8. -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 // like this: // // Code-point length Encoding @@ -7550,7 +7550,7 @@ FilePath FilePath::RemoveExtension(const char* extension) const { return *this; } -// Returns a pointer to the last occurence of a valid path separator in +// Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FilePath::FindLastPathSeparator() const { diff --git a/src/gtest/gtest.h b/src/gtest/gtest.h index 3143bd67..124fb232 100644 --- a/src/gtest/gtest.h +++ b/src/gtest/gtest.h @@ -3395,7 +3395,7 @@ class GTEST_API_ FilePath { void Normalize(); - // Returns a pointer to the last occurence of a valid path separator in + // Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FindLastPathSeparator() const; diff --git a/tools/extra/plot_log.gnuplot.example b/tools/extra/plot_log.gnuplot.example index 748b96e6..02c68e1d 100644 --- a/tools/extra/plot_log.gnuplot.example +++ b/tools/extra/plot_log.gnuplot.example @@ -4,7 +4,7 @@ # Be warned that the fields in the training log may change in the future. # You had better check the data files before designing your own plots. -# Please generate the neccessary data files with +# Please generate the necessary data files with # /path/to/caffe/tools/extra/parse_log.sh before plotting. 
# Example usage: # ./parse_log.sh mnist.log From fa7fda78661fa795e3f6d3bbe7040e5d5d02e732 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Mon, 28 Nov 2016 01:20:58 +0000 Subject: [PATCH 086/183] Make lint happy (> 80 characters) --- src/caffe/layers/crop_layer.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 6ea32d21..9ad40126 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -39,10 +39,10 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, src_data, dest_data, is_forward); } } else { - // We are at the last two dimensions, which are stored continuously in memory - // With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W + // We are at the last two dimensions, which are stored continuously in + // memory With (N,C,H,W) + // (0,1,2,3) cur_dim -> H + // cur_dim+1 -> W const int lines = top[0]->shape(cur_dim); const int height = top[0]->shape(cur_dim); const int width = top[0]->shape(cur_dim+1); From cd681ecdd9383a0f84b854e6fefeb05966babce0 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Mon, 28 Nov 2016 02:17:25 +0000 Subject: [PATCH 087/183] Add the missing period --- src/caffe/layers/crop_layer.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 9ad40126..1ea13253 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -40,9 +40,9 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, } } else { // We are at the last two dimensions, which are stored continuously in - // memory With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W + // memory. With (N,C,H,W) + // (0,1,2,3) cur_dim -> H + // cur_dim+1 -> W const int lines = top[0]->shape(cur_dim); const int height = top[0]->shape(cur_dim); const int width = top[0]->shape(cur_dim+1); From 8cd5c3df98734f4c43e1b7f43c05401fda0a94ac Mon Sep 17 00:00:00 2001 From: Max Ehrlich Date: Fri, 2 Dec 2016 10:13:50 -0500 Subject: [PATCH 088/183] Add Pascal to all cuda architectures The known gpu architectures were missing the Pascal sm_60 and sm_61 compute capabilities. When building for this GPU, but on a separate machine, like a CI server or inside a docker image, caffe would be built for at most capability sm_50 and crash when run on the Pascal GPU. --- cmake/Cuda.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index eeeb7325..7146a244 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -4,7 +4,7 @@ endif() # Known NVIDIA GPU achitectures Caffe can be compiled for. 
# This list will be used for CUDA_ARCH_NAME = All option -set(Caffe_known_gpu_archs "20 21(20) 30 35 50") +set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61") ################################################################################################ # A function for automatic detection of GPUs installed (if autodetection is enabled) @@ -56,7 +56,7 @@ endfunction() # caffe_select_nvcc_arch_flags(out_variable) function(caffe_select_nvcc_arch_flags out_variable) # List of arch names - set(__archs_names "Fermi" "Kepler" "Maxwell" "All" "Manual") + set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual") set(__archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND __archs_names "Auto") @@ -89,6 +89,8 @@ function(caffe_select_nvcc_arch_flags out_variable) set(__cuda_arch_bin "30 35") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(__cuda_arch_bin ${Caffe_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") From de3a12f46217dcac8aae467931e6d5ffb5fbc4e2 Mon Sep 17 00:00:00 2001 From: "Young H. Oh" Date: Thu, 8 Dec 2016 06:54:46 +0900 Subject: [PATCH 089/183] fix wrongly used marker hash --- tools/extra/plot_training_log.py.example | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tools/extra/plot_training_log.py.example b/tools/extra/plot_training_log.py.example index 79924ae5..8caca6b8 100755 --- a/tools/extra/plot_training_log.py.example +++ b/tools/extra/plot_training_log.py.example @@ -90,9 +90,9 @@ def load_data(data_file, field_idx0, field_idx1): def random_marker(): markers = mks.MarkerStyle.markers - num = len(markers.values()) + num = len(markers.keys()) idx = random.randint(0, num - 1) - return markers.values()[idx] + return markers.keys()[idx] def get_data_label(path_to_log): label = path_to_log[path_to_log.rfind('/')+1 : path_to_log.rfind( @@ -126,16 +126,9 @@ def plot_chart(chart_type, path_to_png, path_to_log_list): plt.plot(data[0], data[1], label = label, color = color, linewidth = linewidth) else: - ok = False - ## Some markers throw ValueError: Unrecognized marker style - while not ok: - try: - marker = random_marker() - plt.plot(data[0], data[1], label = label, color = color, - marker = marker, linewidth = linewidth) - ok = True - except: - pass + marker = random_marker() + plt.plot(data[0], data[1], label = label, color = color, + marker = marker, linewidth = linewidth) legend_loc = get_legend_loc(chart_type) plt.legend(loc = legend_loc, ncol = 1) # ajust ncol to fit the space plt.title(get_chart_type_description(chart_type)) From 57a5bbde4ede19c545c5932334782e3a755b2265 Mon Sep 17 00:00:00 2001 From: liyangguang Date: Fri, 16 Dec 2016 11:54:49 +0000 Subject: [PATCH 090/183] check leveldb iterator status for snappy format. 
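A note on the marker fix in PATCH 089 above: `matplotlib.markers.MarkerStyle.markers` is a dict whose keys (e.g. `'o'`, `'s'`, `'^'`) are the symbols accepted by the `marker=` argument of `plot`, while its values are human-readable descriptions such as `'circle'`. That is why the old `.values()`-based selection raised `ValueError: Unrecognized marker style` and needed the try/except. A minimal sketch of the corrected selection (editor's illustration, assuming only stock matplotlib):

```python
import random
import matplotlib.markers as mks

def random_marker():
    # The dict keys are the valid marker symbols; list() makes the dict
    # view indexable under Python 3 as well.
    return random.choice(list(mks.MarkerStyle.markers.keys()))
```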
--- include/caffe/util/db_leveldb.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index e9fa0d32..4cdb6db9 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -14,7 +14,10 @@ namespace caffe { namespace db {
 class LevelDBCursor : public Cursor {
  public:
   explicit LevelDBCursor(leveldb::Iterator* iter)
-    : iter_(iter) { SeekToFirst(); }
+    : iter_(iter) {
+    SeekToFirst();
+    CHECK(iter_->status().ok()) << iter_->status().ToString();
+  }
   ~LevelDBCursor() { delete iter_; }
   virtual void SeekToFirst() { iter_->SeekToFirst(); }
   virtual void Next() { iter_->Next(); }

From b55fe84ca13cb7d9971505ea4d160aa5d7b6be50 Mon Sep 17 00:00:00 2001
From: Zhou Mo
Date: Wed, 21 Dec 2016 09:00:15 +0000
Subject: [PATCH 091/183] docs: add debian installation guide

---
 docs/install_apt_debian.md | 105 +++++++++++++++++++++++++++++++++++++
 docs/installation.md | 1 +
 2 files changed, 106 insertions(+)
 create mode 100644 docs/install_apt_debian.md

diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
new file mode 100644
index 00000000..745a6f4f
--- /dev/null
+++ b/docs/install_apt_debian.md
@@ -0,0 +1,105 @@
+---
+title: "Installation: Debian"
+---
+
+# Debian Installation
+
+Caffe packages are available for `Debian/unstable`. Debian/stable users
+should take a look at the Ubuntu installation instructions.
+
+Debian/unstable (Sid) is recommended only for experienced Linux users.
+
+Last update: Dec.21 2016
+
+## Debian/unstable
+
+Apart from the installation methods based on source, Debian/unstable
+users can install pre-compiled Caffe packages via the official archive.
+
+### Binary installation
+
+Make sure that there is something like the following in your `/etc/apt/sources.list`:
+```
+deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+```
+Then we update the APT cache and directly install Caffe. Note that the CPU version
+and the CUDA version cannot be installed at the same time.
+```
+# apt update
+# apt install [ caffe-cpu | caffe-cuda ]
+```
+It should work out of the box.
+
+#### Customizing caffe packages
+
+Some users may need to customize the Caffe package. Here is a brief
+guide to producing customized `.deb` packages.
+
+Make sure that there is something like this in your `/etc/apt/sources.list`:
+```
+deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+deb-src http://ftp2.cn.debian.org/debian sid main contrib non-free
+```
+
+Then we build the caffe deb files with the following commands:
+```
+$ sudo apt update
+$ sudo apt install build-essential debhelper devscripts # standard package building tools
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ] # the most elegant way to pull caffe build dependencies
+$ apt source [ caffe-cpu | caffe-cuda ] # download the source tarball and extract
+$ cd caffe-XXXX
+[ ... optional, customize caffe code/build ... ]
+$ debuild -B -j4 # build caffe with 4 parallel jobs (similar to make -j4)
+[ ... building ... ]
+$ debc # optional, if you want to check the package contents
+$ sudo debi # optional, install the generated packages
+```
+The resulting deb packages can be found under the parent directory of the source tree.

+### Source installation
+
+Source installation under Debian/unstable is similar to that of Ubuntu, but
+here is a more elegant way to pull the caffe build dependencies:
+```
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ]
+```
+Note that this requires a `deb-src` entry in your `/etc/apt/sources.list`.
+
+### Notes
+
+* Consider re-compiling OpenBLAS locally with optimization flags for the sake of
+performance. This is highly recommended if you are writing a paper.
+
+* If you are installing `caffe-cuda`, APT will automatically pull in some of the
+CUDA packages and the NVIDIA driver packages. Please take care if you have
+manually installed or hacked the NVIDIA driver or the CUDA toolkit or any other
+related components, because the installation may fail in that case.
+
+* If you encounter any problem when installing `caffe-*`, please report the bug
+to Debian via Debian's bug tracking system. See https://www.debian.org/Bugs/ .
+
+* Additionally, a manpage (`man caffe`) and a bash completion script
+(`caffe <TAB>`, `caffe train <TAB>`) are provided.
+Neither of the two files has been merged into caffe master yet.
+
+* The python interface is the Python 3 version: `python3-caffe-{cpu,cuda}`.
+There is no plan to support Python 2.
+
+## FAQ
+
+* Where is caffe-cudnn?
+
+The cuDNN library does not currently appear to be redistributable. If you really want
+the caffe-cudnn deb packages, the workaround is to install cuDNN yourself,
+hack the packaging scripts, and then build your customized package.
+
+* I installed the CPU version. How can I switch to the CUDA version?
+
+`sudo apt install caffe-cuda`; apt's dependency resolver is smart enough to deal with this.
+
+* Where are the examples, the models and the other documentation?
+
+```
+sudo apt install caffe-doc
+dpkg -L caffe-doc
+```
diff --git a/docs/installation.md b/docs/installation.md
index 3254be3d..14ec4674 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -12,6 +12,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi
 - [Docker setup](https://github.com/BVLC/caffe/tree/master/docker) *out-of-the-box brewing*
 - [Ubuntu installation](install_apt.html) *the standard platform*
+- [Debian installation](install_apt_debian.html) *deploy caffe with a single command*
 - [OS X installation](install_osx.html)
 - [RHEL / CentOS / Fedora installation](install_yum.html)
 - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont*

From 2fac0d61afe290564f09067d3efa53d07ba0736f Mon Sep 17 00:00:00 2001
From: Tomasz Socha
Date: Thu, 8 Dec 2016 14:51:30 +0100
Subject: [PATCH 092/183] Use mkl_malloc when use mkl

---
 include/caffe/syncedmem.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 38ee4664..6474a696 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -3,6 +3,10 @@

 #include <cstdlib>

+#ifdef USE_MKL
+  #include "mkl.h"
+#endif
+
 #include "caffe/common.hpp"

 namespace caffe {
@@ -20,7 +24,11 @@ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
     return;
   }
 #endif
+#ifdef USE_MKL
+  *ptr = mkl_malloc(size ?
size:1, 64); +#else *ptr = malloc(size); +#endif *use_cuda = false; CHECK(*ptr) << "host allocation of size " << size << " failed"; } @@ -32,7 +40,11 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) { return; } #endif +#ifdef USE_MKL + mkl_free(ptr); +#else free(ptr); +#endif } From 775f5b05dba28867f609c0e2b097e62176b4904a Mon Sep 17 00:00:00 2001 From: Yagnesh Date: Wed, 21 Dec 2016 17:05:30 -0800 Subject: [PATCH 093/183] Fixed a typo --- examples/02-fine-tuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb index f44eaf9a..90803c98 100644 --- a/examples/02-fine-tuning.ipynb +++ b/examples/02-fine-tuning.ipynb @@ -70,7 +70,7 @@ "\n", "- `get_ilsvrc_aux.sh` to download the ImageNet data mean, labels, etc.\n", "- `download_model_binary.py` to download the pretrained reference model\n", - "- `finetune_flickr_style/assemble_data.py` downloadsd the style training and testing data\n", + "- `finetune_flickr_style/assemble_data.py` downloads the style training and testing data\n", "\n", "We'll download just a small subset of the full dataset for this exercise: just 2000 of the 80K images, from 5 of the 20 style categories. (To download the full dataset, set `full_dataset = True` in the cell below.)" ] From 5693f3149688a2cb035858a9a9efde567763ebe7 Mon Sep 17 00:00:00 2001 From: Yagnesh Date: Fri, 23 Dec 2016 15:31:21 -0800 Subject: [PATCH 094/183] Join path using "os.path.join" instead of "+" (Needless to say it's much clearer, less error prone, and portable) --- examples/02-fine-tuning.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb index 90803c98..422259de 100644 --- a/examples/02-fine-tuning.ipynb +++ b/examples/02-fine-tuning.ipynb @@ -146,7 +146,7 @@ "outputs": [], "source": [ "import os\n", - "weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'\n", + "weights = os.path.join(caffe_root, 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')\n", "assert os.path.exists(weights)" ] }, From 1fd8bd0b4a842aa5a9d7ea1ec88d4cdd7eaf3b99 Mon Sep 17 00:00:00 2001 From: Fyodor Tokarev Date: Fri, 30 Dec 2016 17:47:20 +0300 Subject: [PATCH 095/183] Typos in test_inner_product_layer.cpp --- src/caffe/test/test_inner_product_layer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index f1ec2333..6d84d292 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -60,9 +60,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { EXPECT_EQ(this->blob_top_->channels(), 10); } -/** @brief TestSetUp while toggling tranpose flag +/** @brief TestSetUp while toggling transpose flag */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeFalse) { typedef typename TypeParam::Dtype Dtype; this->blob_bottom_vec_.push_back(this->blob_bottom_); LayerParameter layer_param; @@ -82,9 +82,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { EXPECT_EQ(60, layer->blobs()[0]->shape(1)); } -/** @brief TestSetUp while toggling tranpose flag +/** @brief TestSetUp while toggling transpose flag */ -TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) { +TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeTrue) { typedef typename TypeParam::Dtype Dtype; 
this->blob_bottom_vec_.push_back(this->blob_bottom_); LayerParameter layer_param; @@ -339,7 +339,7 @@ TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) { // copy bottom diffs Blob* const bottom_diff = new Blob(); bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true); - // repeat original top with tranposed ip + // repeat original top with transposed ip this->blob_top_vec_.clear(); this->blob_top_vec_.push_back(new Blob()); inner_product_param->set_transpose(true); From 4f0eb52a7ecd1bfb2c2d5906d368823eb312693c Mon Sep 17 00:00:00 2001 From: Xiaojie Deng Date: Sat, 31 Dec 2016 20:22:17 +0800 Subject: [PATCH 096/183] Fix parse_log.py and parse_log.sh for negative time duration if datetime in log across year boundary --- tools/extra/extract_seconds.py | 8 ++++++++ tools/extra/parse_log.py | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/tools/extra/extract_seconds.py b/tools/extra/extract_seconds.py index 591a51f9..68af69a2 100755 --- a/tools/extra/extract_seconds.py +++ b/tools/extra/extract_seconds.py @@ -48,11 +48,19 @@ def extract_seconds(input_file, output_file): start_datetime = get_start_time(lines, log_created_year) assert start_datetime, 'Start time not found' + last_dt = start_datetime out = open(output_file, 'w') for line in lines: line = line.strip() if line.find('Iteration') != -1: dt = extract_datetime_from_line(line, log_created_year) + + # if it's another year + if dt.month < last_dt.month: + log_created_year += 1 + dt = extract_datetime_from_line(line, log_created_year) + last_dt = dt + elapsed_seconds = (dt - start_datetime).total_seconds() out.write('%f\n' % elapsed_seconds) out.close() diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py index 017306b5..b47ffd0d 100755 --- a/tools/extra/parse_log.py +++ b/tools/extra/parse_log.py @@ -38,6 +38,7 @@ def parse_log(path_to_log): logfile_year = extract_seconds.get_log_created_year(path_to_log) with open(path_to_log) as f: start_time = extract_seconds.get_start_time(f, logfile_year) + last_time = start_time for line in f: iteration_match = regex_iteration.search(line) @@ -55,6 +56,12 @@ def parse_log(path_to_log): # Skip lines with bad formatting, for example when resuming solver continue + # if it's another year + if time.month < last_time.month: + logfile_year += 1 + time = extract_seconds.extract_datetime_from_line(line, logfile_year) + last_time = time + seconds = (time - start_time).total_seconds() learning_rate_match = regex_learning_rate.search(line) From bae06073864dbe86970429d53e35335304626a70 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Sun, 1 Jan 2017 18:22:09 +0000 Subject: [PATCH 097/183] Overhaul layer catalogue documentation. Create scripts/split_caffe_proto.py file for splitting up the caffe.proto file, so that parts of the file can be included from the layer help pages. Create separate pages for each layer and link each page from layers.md. 
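The body of the new `scripts/split_caffe_proto.py` is not shown in this excerpt. As a rough sketch of the idea it implements (not the committed code, and with a made-up output directory), a splitter only has to walk `caffe.proto` and write each top-level `message` block to its own include file:

```python
import errno
import os
import re

def split_proto(proto_path, out_dir):
    """Write each top-level `message Foo { ... }` block of a .proto file
    to out_dir/Foo.txt so that doc pages can include just that part.
    (Brace counting is naive: braces inside comments would confuse it.)"""
    try:
        os.makedirs(out_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    name, depth, block = None, 0, []
    with open(proto_path) as f:
        for line in f:
            if name is None:
                m = re.match(r'\s*message\s+(\w+)\s*\{', line)
                if m:
                    name, depth, block = m.group(1), 0, []
            if name is not None:
                block.append(line)
                depth += line.count('{') - line.count('}')
                if depth == 0:  # the message block just closed
                    with open(os.path.join(out_dir, name + '.txt'), 'w') as out:
                        out.writelines(block)
                    name = None

if __name__ == '__main__':
    split_proto('src/caffe/proto/caffe.proto', 'docs/_includes/proto')
```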
--- docs/tutorial/layers.md | 562 +++--------------- docs/tutorial/layers/absval.md | 22 + docs/tutorial/layers/accuracy.md | 21 + docs/tutorial/layers/argmax.md | 19 + docs/tutorial/layers/batchnorm.md | 20 + docs/tutorial/layers/batchreindex.md | 16 + docs/tutorial/layers/bias.md | 19 + docs/tutorial/layers/bnll.md | 25 + docs/tutorial/layers/concat.md | 40 ++ docs/tutorial/layers/contrastiveloss.md | 20 + docs/tutorial/layers/convolution.md | 63 ++ docs/tutorial/layers/crop.md | 20 + docs/tutorial/layers/data.md | 29 + docs/tutorial/layers/deconvolution.md | 22 + docs/tutorial/layers/dropout.md | 20 + docs/tutorial/layers/dummydata.md | 20 + docs/tutorial/layers/eltwise.md | 20 + docs/tutorial/layers/elu.md | 25 + docs/tutorial/layers/embed.md | 20 + docs/tutorial/layers/euclideanloss.md | 16 + docs/tutorial/layers/exp.md | 24 + docs/tutorial/layers/filter.md | 15 + docs/tutorial/layers/flatten.md | 21 + docs/tutorial/layers/hdf5data.md | 20 + docs/tutorial/layers/hdf5output.md | 25 + docs/tutorial/layers/hingeloss.md | 19 + docs/tutorial/layers/im2col.md | 16 + docs/tutorial/layers/imagedata.md | 27 + docs/tutorial/layers/infogainloss.md | 24 + docs/tutorial/layers/innerproduct.md | 59 ++ docs/tutorial/layers/input.md | 19 + docs/tutorial/layers/log.md | 20 + docs/tutorial/layers/lrn.md | 28 + docs/tutorial/layers/lstm.md | 21 + docs/tutorial/layers/memorydata.md | 25 + .../layers/multinomiallogisticloss.md | 19 + docs/tutorial/layers/mvn.md | 20 + docs/tutorial/layers/parameter.md | 21 + docs/tutorial/layers/pooling.md | 47 ++ docs/tutorial/layers/power.md | 46 ++ docs/tutorial/layers/prelu.md | 20 + docs/tutorial/layers/python.md | 27 + docs/tutorial/layers/recurrent.md | 20 + docs/tutorial/layers/reduction.md | 20 + docs/tutorial/layers/relu.md | 32 + docs/tutorial/layers/reshape.md | 51 ++ docs/tutorial/layers/rnn.md | 19 + docs/tutorial/layers/scale.md | 20 + docs/tutorial/layers/sigmoid.md | 20 + .../layers/sigmoidcrossentropyloss.md | 13 + docs/tutorial/layers/silence.md | 23 + docs/tutorial/layers/slice.md | 42 ++ docs/tutorial/layers/softmax.md | 24 + docs/tutorial/layers/softmaxwithloss.md | 33 + docs/tutorial/layers/split.md | 17 + docs/tutorial/layers/spp.md | 20 + docs/tutorial/layers/tanh.md | 18 + docs/tutorial/layers/threshold.md | 18 + docs/tutorial/layers/tile.md | 20 + docs/tutorial/layers/windowdata.md | 19 + scripts/build_docs.sh | 3 + scripts/split_caffe_proto.py | 35 ++ 62 files changed, 1573 insertions(+), 476 deletions(-) create mode 100644 docs/tutorial/layers/absval.md create mode 100644 docs/tutorial/layers/accuracy.md create mode 100644 docs/tutorial/layers/argmax.md create mode 100644 docs/tutorial/layers/batchnorm.md create mode 100644 docs/tutorial/layers/batchreindex.md create mode 100644 docs/tutorial/layers/bias.md create mode 100644 docs/tutorial/layers/bnll.md create mode 100644 docs/tutorial/layers/concat.md create mode 100644 docs/tutorial/layers/contrastiveloss.md create mode 100644 docs/tutorial/layers/convolution.md create mode 100644 docs/tutorial/layers/crop.md create mode 100644 docs/tutorial/layers/data.md create mode 100644 docs/tutorial/layers/deconvolution.md create mode 100644 docs/tutorial/layers/dropout.md create mode 100644 docs/tutorial/layers/dummydata.md create mode 100644 docs/tutorial/layers/eltwise.md create mode 100644 docs/tutorial/layers/elu.md create mode 100644 docs/tutorial/layers/embed.md create mode 100644 docs/tutorial/layers/euclideanloss.md create mode 100644 docs/tutorial/layers/exp.md create mode 100644 
docs/tutorial/layers/filter.md create mode 100644 docs/tutorial/layers/flatten.md create mode 100644 docs/tutorial/layers/hdf5data.md create mode 100644 docs/tutorial/layers/hdf5output.md create mode 100644 docs/tutorial/layers/hingeloss.md create mode 100644 docs/tutorial/layers/im2col.md create mode 100644 docs/tutorial/layers/imagedata.md create mode 100644 docs/tutorial/layers/infogainloss.md create mode 100644 docs/tutorial/layers/innerproduct.md create mode 100644 docs/tutorial/layers/input.md create mode 100644 docs/tutorial/layers/log.md create mode 100644 docs/tutorial/layers/lrn.md create mode 100644 docs/tutorial/layers/lstm.md create mode 100644 docs/tutorial/layers/memorydata.md create mode 100644 docs/tutorial/layers/multinomiallogisticloss.md create mode 100644 docs/tutorial/layers/mvn.md create mode 100644 docs/tutorial/layers/parameter.md create mode 100644 docs/tutorial/layers/pooling.md create mode 100644 docs/tutorial/layers/power.md create mode 100644 docs/tutorial/layers/prelu.md create mode 100644 docs/tutorial/layers/python.md create mode 100644 docs/tutorial/layers/recurrent.md create mode 100644 docs/tutorial/layers/reduction.md create mode 100644 docs/tutorial/layers/relu.md create mode 100644 docs/tutorial/layers/reshape.md create mode 100644 docs/tutorial/layers/rnn.md create mode 100644 docs/tutorial/layers/scale.md create mode 100644 docs/tutorial/layers/sigmoid.md create mode 100644 docs/tutorial/layers/sigmoidcrossentropyloss.md create mode 100644 docs/tutorial/layers/silence.md create mode 100644 docs/tutorial/layers/slice.md create mode 100644 docs/tutorial/layers/softmax.md create mode 100644 docs/tutorial/layers/softmaxwithloss.md create mode 100644 docs/tutorial/layers/split.md create mode 100644 docs/tutorial/layers/spp.md create mode 100644 docs/tutorial/layers/tanh.md create mode 100644 docs/tutorial/layers/threshold.md create mode 100644 docs/tutorial/layers/tile.md create mode 100644 docs/tutorial/layers/windowdata.md create mode 100755 scripts/split_caffe_proto.py diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 7362aac2..a903d5ac 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -1,186 +1,77 @@ --- title: Layer Catalogue --- + # Layers To create a Caffe model you need to define the model architecture in a protocol buffer definition file (prototxt). Caffe layers and their parameters are defined in the protocol buffer definitions for the project in [caffe.proto](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto). -### Vision Layers - -* Header: `./include/caffe/vision_layers.hpp` - -Vision layers usually take *images* as input and produce other *images* as output. -A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image. -But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$. -This 2D geometry naturally lends itself to certain decisions about how to process the input. -In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output. -In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$. 
- - -#### Convolution - -* Layer type: `Convolution` -* CPU implementation: `./src/caffe/layers/convolution_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/convolution_layer.cu` -* Parameters (`ConvolutionParameter convolution_param`) - - Required - - `num_output` (`c_o`): the number of filters - - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter - - Strongly Recommended - - `weight_filler` [default `type: 'constant' value: 0`] - - Optional - - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs - - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input - - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input - - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th output group channels will be only connected to the $$i$$th input group channels. -* Input - - `n * c_i * h_i * w_i` -* Output - - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise. -* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - # learning rate and decay multipliers for the filters - param { lr_mult: 1 decay_mult: 1 } - # learning rate and decay multipliers for the biases - param { lr_mult: 2 decay_mult: 0 } - convolution_param { - num_output: 96 # learn 96 filters - kernel_size: 11 # each filter is 11x11 - stride: 4 # step 4 pixels between each filter application - weight_filler { - type: "gaussian" # initialize the filters from a Gaussian - std: 0.01 # distribution with stdev 0.01 (default mean: 0) - } - bias_filler { - type: "constant" # initialize the biases to zero (0) - value: 0 - } - } - } - -The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. - -#### Pooling - -* Layer type: `Pooling` -* CPU implementation: `./src/caffe/layers/pooling_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/pooling_layer.cu` -* Parameters (`PoolingParameter pooling_param`) - - Required - - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter - - Optional - - `pool` [default MAX]: the pooling method. Currently MAX, AVE, or STOCHASTIC - - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input - - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input -* Input - - `n * c * h_i * w_i` -* Output - - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution. 
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 # pool over a 3x3 region - stride: 2 # step two pixels (in the bottom blob) between pooling regions - } - } - -#### Local Response Normalization (LRN) - -* Layer type: `LRN` -* CPU Implementation: `./src/caffe/layers/lrn_layer.cpp` -* CUDA GPU Implementation: `./src/caffe/layers/lrn_layer.cu` -* Parameters (`LRNParameter lrn_param`) - - Optional - - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN) - - `alpha` [default 1]: the scaling parameter (see below) - - `beta` [default 5]: the exponent (see below) - - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`) +## Data Layers -The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary). - -#### im2col - -`Im2col` is a helper for doing the image-to-column transformation that you most likely do not need to know about. This is used in Caffe's original convolution to do matrix multiplication by laying out all patches into a matrix. - -### Loss Layers +Data enters Caffe through data layers: they lie at the bottom of nets. Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats. -Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient w.r.t. to the loss is computed by the backward pass. +Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s by some of the layers. +The [bias](layers/bias.html), [scale](layers/scale.html), and [crop](layers/crop.html) layers can be helpful with transforming the inputs, when `TransformationParameter` isn't available. -#### Softmax +Layers: -* Layer type: `SoftmaxWithLoss` +* [Image Data](layers/imagedata.html) - read raw images. +* [Database](layers/data.html) - read data from LEVELDB or LMDB. +* [HDF5 Input](layers/hdf5data.html) - read HDF5 data, allows data of arbitrary dimensions. +* [HDF5 Output](layers/hdf5output.html) - write data as HDF5. +* [Input](layers/input.html) - typically used for networks that are being deployed. +* [Window Data](layers/windowdata.html) - read window data file. +* [Memory Data](layers/memorydata.html) - read data directly from memory. +* [Dummy Data](layers/dummydata.html) - for static data and debugging. -The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. 
It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. +Note that the [Python](layers/python.html) Layer can be useful for create custom data layers. -#### Sum-of-Squares / Euclidean +## Vision Layers -* Layer type: `EuclideanLoss` +Vision layers usually take *images* as input and produce other *images* as output, although they can take data of other types and dimensions. +A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image. +But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$. +This 2D geometry naturally lends itself to certain decisions about how to process the input. +In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output. +In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$. -The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. +Layers: -#### Hinge / Margin +* [Convolution Layer](layers/convolution.html) - convolves the input image with a set of learnable filters, each producing one feature map in the output image. +* [Pooling Layer](layers/pooling.html) - max, average, or stochastic pooling. +* [Spatial Pyramid Pooling (SPP)](layers/spp.html) +* [Crop](layers/crop.html) - perform cropping transformation. +* [Deconvolution Layer](layers/deconvolution.html) - transposed convolution. -* Layer type: `HingeLoss` -* CPU implementation: `./src/caffe/layers/hinge_loss_layer.cpp` -* CUDA GPU implementation: none yet -* Parameters (`HingeLossParameter hinge_loss_param`) - - Optional - - `norm` [default L1]: the norm used. Currently L1, L2 -* Inputs - - `n * c * h * w` Predictions - - `n * 1 * 1 * 1` Labels -* Output - - `1 * 1 * 1 * 1` Computed Loss -* Samples +* [Im2Col](layers/im2col.html) - relic helper layer that is not used much anymore. - # L1 Norm - layer { - name: "loss" - type: "HingeLoss" - bottom: "pred" - bottom: "label" - } +## Recurrent Layers - # L2 Norm - layer { - name: "loss" - type: "HingeLoss" - bottom: "pred" - bottom: "label" - top: "loss" - hinge_loss_param { - norm: L2 - } - } +Layers: -The hinge loss layer computes a one-vs-all hinge or squared hinge loss. +* [Recurrent](layers/recurrent.html) +* [RNN](layers/rnn.html) +* [Long-Short Term Memory (LSTM)](layers/lstm.html) -#### Sigmoid Cross-Entropy +## Common Layers -`SigmoidCrossEntropyLoss` +Layers: -#### Infogain +* [Inner Product](layers/innerproduct.html) - fully connected layer. +* [Dropout](layers/dropout.html) +* [Embed](layers/embed.html) - for learning embeddings of one-hot encoded vector (takes index as input). -`InfogainLoss` +## Normalization Layers -#### Accuracy and Top-k +* [Local Response Normalization (LRN)](layers/lrn.html) - performs a kind of "lateral inhibition" by normalizing over local input regions. +* [Mean Variance Normalization (MVN)](layers/mvn.html) - performs contrast normalization / instance normalization. +* [Batch Normalization](layers/batchnorm.html) - performs normalization over mini-batches. 
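As a sketch of how these normalization layers are commonly wired together (an editor's illustration in pycaffe's `net_spec`, not part of the committed page): Caffe's BatchNorm does not learn an affine transform itself, so it is typically followed by a Scale layer with a bias term.

```python
from caffe import layers as L

def conv_bn_scale(bottom, nout):
    """Convolution -> BatchNorm -> Scale, a common normalization idiom."""
    conv = L.Convolution(bottom, num_output=nout, kernel_size=3, pad=1)
    # use_global_stats is typically False while training (use mini-batch
    # statistics) and True at test time (use the accumulated averages).
    bn = L.BatchNorm(conv, use_global_stats=False, in_place=True)
    scale = L.Scale(bn, bias_term=True, in_place=True)
    return conv, bn, scale
```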
-`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. +The [bias](layers/bias.html) and [scale](layers/scale.html) layers can be helpful in combination with normalization. -### Activation / Neuron Layers +## Activation / Neuron Layers In general, activation / Neuron layers are element-wise operators, taking one bottom blob and producing one top blob of the same size. In the layers below, we will ignore the input and out sizes as they are identical: @@ -189,337 +80,56 @@ In general, activation / Neuron layers are element-wise operators, taking one bo * Output - n * c * h * w -#### ReLU / Rectified-Linear and Leaky-ReLU - -* Layer type: `ReLU` -* CPU implementation: `./src/caffe/layers/relu_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/relu_layer.cu` -* Parameters (`ReLUParameter relu_param`) - - Optional - - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0. -* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`) - - layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" - } - -Given an input value x, The `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to preserve memory consumption. - -#### Sigmoid - -* Layer type: `Sigmoid` -* CPU implementation: `./src/caffe/layers/sigmoid_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/sigmoid_layer.cu` -* Sample (as seen in `./examples/mnist/mnist_autoencoder.prototxt`) - - layer { - name: "encode1neuron" - bottom: "encode1" - top: "encode1neuron" - type: "Sigmoid" - } - -The `Sigmoid` layer computes the output as sigmoid(x) for each input element x. - -#### TanH / Hyperbolic Tangent - -* Layer type: `TanH` -* CPU implementation: `./src/caffe/layers/tanh_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/tanh_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "TanH" - } - -The `TanH` layer computes the output as tanh(x) for each input element x. - -#### Absolute Value - -* Layer type: `AbsVal` -* CPU implementation: `./src/caffe/layers/absval_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/absval_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "AbsVal" - } - -The `AbsVal` layer computes the output as abs(x) for each input element x. - -#### Power - -* Layer type: `Power` -* CPU implementation: `./src/caffe/layers/power_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/power_layer.cu` -* Parameters (`PowerParameter power_param`) - - Optional - - `power` [default 1] - - `scale` [default 1] - - `shift` [default 0] -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: "Power" - power_param { - power: 1 - scale: 1 - shift: 0 - } - } - -The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x. 
- -#### BNLL - -* Layer type: `BNLL` -* CPU implementation: `./src/caffe/layers/bnll_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/bnll_layer.cu` -* Sample - - layer { - name: "layer" - bottom: "in" - top: "out" - type: BNLL - } - -The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x. - - -### Data Layers - -Data enters Caffe through data layers: they lie at the bottom of nets. Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats. - -Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s. - -#### Database +Layers: -* Layer type: `Data` -* Parameters - - Required - - `source`: the name of the directory containing the database - - `batch_size`: the number of inputs to process at one time - - Optional - - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous sgd - - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB` +* [ReLU / Rectified-Linear and Leaky-ReLU](layers/relu.html) - ReLU and Leaky-ReLU rectification. +* [PReLU](layers/prelu.html) - parametric ReLU. +* [ELU](layers/elu.html) - exponential linear rectification. +* [Sigmoid](layers/sigmoid.html) +* [TanH](layers/tanh.html) +* [Absolute Value](layers/abs.html) +* [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power. +* [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x). +* [Log](layers/log.html) - f(x) = log(x). +* [BNLL](layers/bnll.html) - f(x) = log(1 + exp(x)). +* [Threshold](layers/threshold.html) - performs step function at user defined threshold. +* [Bias](layers/bias.html) - adds a bias to a blob that can either be learned or fixed. +* [Scale](layers/scale.html) - scales a blob by an amount that can either be learned or fixed. +## Utility Layers +Layers: -#### In-Memory +* [Flatten](layers/flatten.html) +* [Reshape](layers/reshape.html) +* [Batch Reindex](layers/batchreindex.html) -* Layer type: `MemoryData` -* Parameters - - Required - - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory +* [Split](layers/split.html) +* [Concat](layers/concat.html) +* [Slicing](layers/slice.html) +* [Eltwise](layers/eltwise.html) - element-wise operations such as product or sum between two blobs. +* [Filter / Mask](layers/filter.html) - mask or select output using last blob. +* [Parameter](layers/parameter.html) - enable parameters to be shared between layers. +* [Reduction](layers/reduction.html) - reduce input blob to scalar blob using operations such as sum or mean. +* [Silence](layers/silence.html) - prevent top-level blobs from being printed during training. -The memory data layer reads data directly from memory, without copying it. In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) in order to specify a source of contiguous data (as 4D row major array), which is read one batch-sized chunk at a time. +* [ArgMax](layers/argmax.html) +* [Softmax](layers/softmax.html) -#### HDF5 Input +* [Python](layers/python.html) - allows custom Python layers. 
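Since the Python layer gets only a one-line mention in this list, a minimal sketch of the interface may help; the class below is a toy example by the editor, but `caffe.Layer` and the four hook methods are the standard pycaffe layer API:

```python
import caffe

class DoubleLayer(caffe.Layer):
    """Toy Python layer that outputs twice its input."""

    def setup(self, bottom, top):
        if len(bottom) != 1:
            raise Exception('DoubleLayer expects exactly one bottom blob.')

    def reshape(self, bottom, top):
        top[0].reshape(*bottom[0].data.shape)

    def forward(self, bottom, top):
        top[0].data[...] = 2 * bottom[0].data

    def backward(self, top, propagate_down, bottom):
        if propagate_down[0]:
            bottom[0].diff[...] = 2 * top[0].diff
```

Such a layer is referenced from a prototxt with `type: "Python"` and a `python_param` block naming its module and class.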
-* Layer type: `HDF5Data` -* Parameters - - Required - - `source`: the name of the file to read from - - `batch_size` +## Loss Layers -#### HDF5 Output - -* Layer type: `HDF5Output` -* Parameters - - Required - - `file_name`: name of file to write to - -The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk. - -#### Images - -* Layer type: `ImageData` -* Parameters - - Required - - `source`: name of a text file, with each line giving an image filename and label - - `batch_size`: number of images to batch together - - Optional - - `rand_skip` - - `shuffle` [default false] - - `new_height`, `new_width`: if provided, resize all images to this size - -#### Windows - -`WindowData` - -#### Dummy - -`DummyData` is for development and debugging. See `DummyDataParameter`. - -### Common Layers - -#### Inner Product - -* Layer type: `InnerProduct` -* CPU implementation: `./src/caffe/layers/inner_product_layer.cpp` -* CUDA GPU implementation: `./src/caffe/layers/inner_product_layer.cu` -* Parameters (`InnerProductParameter inner_product_param`) - - Required - - `num_output` (`c_o`): the number of filters - - Strongly recommended - - `weight_filler` [default `type: 'constant' value: 0`] - - Optional - - `bias_filler` [default `type: 'constant' value: 0`] - - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs -* Input - - `n * c_i * h_i * w_i` -* Output - - `n * c_o * 1 * 1` -* Sample - - layer { - name: "fc8" - type: "InnerProduct" - # learning rate and decay multipliers for the weights - param { lr_mult: 1 decay_mult: 1 } - # learning rate and decay multipliers for the biases - param { lr_mult: 2 decay_mult: 0 } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } - bottom: "fc7" - top: "fc8" - } - -The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). - -#### Splitting - -The `Split` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers. - -#### Flattening - -The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)` - -#### Reshape - -* Layer type: `Reshape` -* Implementation: `./src/caffe/layers/reshape_layer.cpp` -* Parameters (`ReshapeParameter reshape_param`) - - Optional: (also see detailed description below) - - `shape` - -* Input - - a single blob with arbitrary dimensions -* Output - - the same blob, with modified dimensions, as specified by `reshape_param` - -* Sample - - layer { - name: "reshape" - type: "Reshape" - bottom: "input" - top: "output" - reshape_param { - shape { - dim: 0 # copy the dimension from below - dim: 2 - dim: 3 - dim: -1 # infer it from the other dimensions - } - } - } - -The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process. - -Output dimensions are specified by the `ReshapeParam` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. 
In addition, two special values are accepted for any of the target dimension values:
-
-* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension.
-* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s or `[]` for *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation.
-
-As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer.
-
-#### Concatenation
-
-* Layer type: `Concat`
-* CPU implementation: `./src/caffe/layers/concat_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/concat_layer.cu`
-* Parameters (`ConcatParameter concat_param`)
-    - Optional
-        - `axis` [default 1]: 0 for concatenation along num and 1 for channels.
-* Input
-    - `n_i * c_i * h * w` for each input blob i from 1 to K.
-* Output
-    - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same.
-    - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same.
-* Sample
-
-      layer {
-        name: "concat"
-        bottom: "in1"
-        bottom: "in2"
-        top: "out"
-        type: "Concat"
-        concat_param {
-          axis: 1
-        }
-      }
-
-The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob.
-
-#### Slicing
-
-The `Slice` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices.
-
-* Sample
-
-      layer {
-        name: "slicer_label"
-        type: "Slice"
-        bottom: "label"
-        ## Example of label with a shape N x 3 x 1 x 1
-        top: "label1"
-        top: "label2"
-        top: "label3"
-        slice_param {
-          axis: 1
-          slice_point: 1
-          slice_point: 2
-        }
-      }
-
-`axis` indicates the target axis; `slice_point` indicates indexes in the selected dimension (the number of indices must be equal to the number of top blobs minus one).
-
-
-#### Elementwise Operations
-
-`Eltwise`
-
-#### Argmax
-
-`ArgMax`
-
-#### Softmax
+## Loss Layers

-`Softmax`
+Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient with respect to the loss is computed by the backward pass.

-#### Mean-Variance Normalization
+Layers:

+* [Multinomial Logistic Loss](layers/multinomiallogisticloss.html)
+* [Infogain Loss](layers/infogainloss.html) - a generalization of MultinomialLogisticLossLayer.
+* [Softmax with Loss](layers/softmaxwithloss.html) - computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient.
+* [Sum-of-Squares / Euclidean](layers/euclideanloss.html) - computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$.
+* [Hinge / Margin](layers/hingeloss.html) - computes a one-vs-all hinge (L1) or squared hinge (L2) loss.
+* [Sigmoid Cross-Entropy Loss](layers/sigmoidcrossentropyloss.html) - computes the cross-entropy (logistic) loss, often used for predicting targets interpreted as probabilities.
+* [Accuracy / Top-k layer](layers/accuracy.html) - scores the output as an accuracy with respect to target -- it is not actually a loss and has no backward step. +* [Contrastive Loss](layers/contrastiveloss.html) -`MVN` diff --git a/docs/tutorial/layers/absval.md b/docs/tutorial/layers/absval.md new file mode 100644 index 00000000..220c4118 --- /dev/null +++ b/docs/tutorial/layers/absval.md @@ -0,0 +1,22 @@ +--- +title: Absolute Value Layer +--- + +# Absolute Value Layer + +* Layer type: `AbsVal` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AbsValLayer.html) +* Header: [`./include/caffe/layers/absval_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/absval_layer.hpp) +* CPU implementation: [`./src/caffe/layers/absval_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/absval_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cu) + +* Sample + + layer { + name: "layer" + bottom: "in" + top: "out" + type: "AbsVal" + } + +The `AbsVal` layer computes the output as abs(x) for each input element x. diff --git a/docs/tutorial/layers/accuracy.md b/docs/tutorial/layers/accuracy.md new file mode 100644 index 00000000..ecf84090 --- /dev/null +++ b/docs/tutorial/layers/accuracy.md @@ -0,0 +1,21 @@ +--- +title: Accuracy and Top-k +--- + +# Accuracy and Top-k + +`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. + +* Layer type: `Accuracy` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AccuracyLayer.html) +* Header: [`./include/caffe/layers/accuracy_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/accuracy_layer.hpp) +* CPU implementation: [`./src/caffe/layers/accuracy_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/accuracy_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cu) + +## Parameters +* Parameters (`AccuracyParameter accuracy_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/AccuracyParameter.txt %} +{% endhighlight %} \ No newline at end of file diff --git a/docs/tutorial/layers/argmax.md b/docs/tutorial/layers/argmax.md new file mode 100644 index 00000000..f5f173ac --- /dev/null +++ b/docs/tutorial/layers/argmax.md @@ -0,0 +1,19 @@ +--- +title: ArgMax Layer +--- + +# ArgMax Layer + +* Layer type: `ArgMax` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ArgMaxLayer.html) +* Header: [`./include/caffe/layers/argmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/argmax_layer.hpp) +* CPU implementation: [`./src/caffe/layers/argmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/argmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cu) + +## Parameters +* Parameters (`ArgMaxParameter argmax_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ArgMaxParameter.txt %} +{% endhighlight %} \ No newline at end of file diff --git 
a/docs/tutorial/layers/batchnorm.md b/docs/tutorial/layers/batchnorm.md new file mode 100644 index 00000000..a5be5ce0 --- /dev/null +++ b/docs/tutorial/layers/batchnorm.md @@ -0,0 +1,20 @@ +--- +title: Batch Norm Layer +--- + +# Batch Norm Layer + +* Layer type: `BatchNorm` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchNormLayer.html) +* Header: [`./include/caffe/layers/batch_norm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_norm_layer.hpp) +* CPU implementation: [`./src/caffe/layers/batch_norm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/batch_norm_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cu) + +## Parameters + +* Parameters (`BatchNormParameter batch_norm_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/BatchNormParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/batchreindex.md b/docs/tutorial/layers/batchreindex.md new file mode 100644 index 00000000..21b36c39 --- /dev/null +++ b/docs/tutorial/layers/batchreindex.md @@ -0,0 +1,16 @@ +--- +title: Batch Reindex Layer +--- + +# Batch Reindex Layer + +* Layer type: `BatchReindex` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchReindexLayer.html) +* Header: [`./include/caffe/layers/batch_reindex_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_reindex_layer.hpp) +* CPU implementation: [`./src/caffe/layers/batch_reindex_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/batch_reindex_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cu) + + +## Parameters + +No parameters. 
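+
+## Example
+
+The layer takes two bottoms -- a data blob and a blob of indices into its first axis -- and copies the indexed items to the top, so items may be repeated, reordered, or dropped. A minimal NumPy sketch of the behavior (an illustration, not Caffe's implementation):
+
+    import numpy as np
+
+    bottom_data = np.arange(12, dtype=np.float32).reshape(4, 3)  # 4 items of 3 values
+    bottom_inds = np.array([2, 0, 0])  # repeat item 0, drop items 1 and 3
+    top = bottom_data[bottom_inds]     # shape (3, 3)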
diff --git a/docs/tutorial/layers/bias.md b/docs/tutorial/layers/bias.md
new file mode 100644
index 00000000..d3a00c2f
--- /dev/null
+++ b/docs/tutorial/layers/bias.md
@@ -0,0 +1,19 @@
+---
+title: Bias Layer
+---
+
+# Bias Layer
+
+* Layer type: `Bias`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BiasLayer.html)
+* Header: [`./include/caffe/layers/bias_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bias_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/bias_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/bias_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cu)
+
+## Parameters
+* Parameters (`BiasParameter bias_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/BiasParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/bnll.md b/docs/tutorial/layers/bnll.md
new file mode 100644
index 00000000..2b68b79f
--- /dev/null
+++ b/docs/tutorial/layers/bnll.md
@@ -0,0 +1,25 @@
+---
+title: BNLL Layer
+---
+
+# BNLL Layer
+
+* Layer type: `BNLL`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BNLLLayer.html)
+* Header: [`./include/caffe/layers/bnll_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bnll_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/bnll_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/bnll_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cu)
+
+The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x.
+
+## Parameters
+No parameters.
+
+## Sample
+
+      layer {
+        name: "layer"
+        bottom: "in"
+        top: "out"
+        type: "BNLL"
+      }
diff --git a/docs/tutorial/layers/concat.md b/docs/tutorial/layers/concat.md
new file mode 100644
index 00000000..c7b25395
--- /dev/null
+++ b/docs/tutorial/layers/concat.md
@@ -0,0 +1,40 @@
+---
+title: Concat Layer
+---
+
+# Concat Layer
+
+* Layer type: `Concat`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConcatLayer.html)
+* Header: [`./include/caffe/layers/concat_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/concat_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/concat_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/concat_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu)
+* Input
+    - `n_i * c_i * h * w` for each input blob i from 1 to K.
+* Output
+    - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same.
+    - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same.
+* Sample
+
+      layer {
+        name: "concat"
+        bottom: "in1"
+        bottom: "in2"
+        top: "out"
+        type: "Concat"
+        concat_param {
+          axis: 1
+        }
+      }
+
+The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob.
+
+## Parameters
+* Parameters (`ConcatParameter concat_param`)
+    - Optional
+        - `axis` [default 1]: 0 for concatenation along num and 1 for channels.
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ConcatParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/contrastiveloss.md b/docs/tutorial/layers/contrastiveloss.md new file mode 100644 index 00000000..bb1859d9 --- /dev/null +++ b/docs/tutorial/layers/contrastiveloss.md @@ -0,0 +1,20 @@ +--- +title: Contrastive Loss Layer +--- + +# Contrastive Loss Layer + +* Layer type: `ContrastiveLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ContrastiveLossLayer.html) +* Header: [`./include/caffe/layers/contrastive_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/contrastive_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cu) + +## Parameters + +* Parameters (`ContrastiveLossParameter contrastive_loss_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ContrastiveLossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/convolution.md b/docs/tutorial/layers/convolution.md new file mode 100644 index 00000000..cc9f4fd0 --- /dev/null +++ b/docs/tutorial/layers/convolution.md @@ -0,0 +1,63 @@ +--- +title: Convolution Layer +--- + +# Convolution Layer + +* Layer type: `Convolution` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConvolutionLayer.html) +* Header: [`./include/caffe/layers/conv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/conv_layer.hpp) +* CPU implementation: [`./src/caffe/layers/conv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/conv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) +* Input + - `n * c_i * h_i * w_i` +* Output + - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise. + +The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. 
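+
+For example, with a 227 x 227 input and the `kernel_size: 11`, `stride: 4`, zero-padding settings of the sample below, each output feature map is (227 + 2 * 0 - 11) / 4 + 1 = 55 pixels on a side.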
+ +## Sample + +Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)): + + layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + # learning rate and decay multipliers for the filters + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } + convolution_param { + num_output: 96 # learn 96 filters + kernel_size: 11 # each filter is 11x11 + stride: 4 # step 4 pixels between each filter application + weight_filler { + type: "gaussian" # initialize the filters from a Gaussian + std: 0.01 # distribution with stdev 0.01 (default mean: 0) + } + bias_filler { + type: "constant" # initialize the biases to zero (0) + value: 0 + } + } + } + +## Parameters +* Parameters (`ConvolutionParameter convolution_param`) + - Required + - `num_output` (`c_o`): the number of filters + - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter + - Strongly Recommended + - `weight_filler` [default `type: 'constant' value: 0`] + - Optional + - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs + - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input + - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input + - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th output group channels will be only connected to the $$i$$th input group channels. 
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ConvolutionParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/crop.md b/docs/tutorial/layers/crop.md new file mode 100644 index 00000000..28f91241 --- /dev/null +++ b/docs/tutorial/layers/crop.md @@ -0,0 +1,20 @@ +--- +title: Crop Layer +--- + +# Crop Layer + +* Layer type: `Crop` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1CropLayer.html) +* Header: [`./include/caffe/layers/crop_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/crop_layer.hpp) +* CPU implementation: [`./src/caffe/layers/crop_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/crop_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cu) + +## Parameters + +* Parameters (`CropParameter crop_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/CropParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/data.md b/docs/tutorial/layers/data.md new file mode 100644 index 00000000..58e0dcaa --- /dev/null +++ b/docs/tutorial/layers/data.md @@ -0,0 +1,29 @@ +--- +title: Database Layer +--- + +# Database Layer + +* Layer type: `Data` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DataLayer.html) +* Header: [`./include/caffe/layers/data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/data_layer.cpp) + + +## Parameters + +* Parameters (`DataParameter data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/DataParameter.txt %} +{% endhighlight %} + +* Parameters + - Required + - `source`: the name of the directory containing the database + - `batch_size`: the number of inputs to process at one time + - Optional + - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous sgd + - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB` + diff --git a/docs/tutorial/layers/deconvolution.md b/docs/tutorial/layers/deconvolution.md new file mode 100644 index 00000000..2eff967d --- /dev/null +++ b/docs/tutorial/layers/deconvolution.md @@ -0,0 +1,22 @@ +--- +title: Deconvolution Layer +--- + +# Deconvolution Layer + +* Layer type: `Deconvolution` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DeconvolutionLayer.html) +* Header: [`./include/caffe/layers/deconv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/deconv_layer.hpp) +* CPU implementation: [`./src/caffe/layers/deconv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/deconv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cu) + +## Parameters + +Uses the same parameters as the Convolution layer. 
+ +* Parameters (`ConvolutionParameter convolution_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/ConvolutionParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/dropout.md b/docs/tutorial/layers/dropout.md new file mode 100644 index 00000000..d8c6f955 --- /dev/null +++ b/docs/tutorial/layers/dropout.md @@ -0,0 +1,20 @@ +--- +title: Dropout Layer +--- + +# Dropout Layer + +* Layer type: `Dropout` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DropoutLayer.html) +* Header: [`./include/caffe/layers/dropout_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dropout_layer.hpp) +* CPU implementation: [`./src/caffe/layers/dropout_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/dropout_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cu) + +## Parameters + +* Parameters (`DropoutParameter dropout_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/DropoutParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/dummydata.md b/docs/tutorial/layers/dummydata.md new file mode 100644 index 00000000..d069f9c5 --- /dev/null +++ b/docs/tutorial/layers/dummydata.md @@ -0,0 +1,20 @@ +--- +title: Dummy Data Layer +--- + +# Dummy Data Layer + +* Layer type: `DummyData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DummyDataLayer.html) +* Header: [`./include/caffe/layers/dummy_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dummy_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/dummy_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dummy_data_layer.cpp) + + +## Parameters + +* Parameters (`DummyDataParameter dummy_data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/DummyDataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/eltwise.md b/docs/tutorial/layers/eltwise.md new file mode 100644 index 00000000..70fe7910 --- /dev/null +++ b/docs/tutorial/layers/eltwise.md @@ -0,0 +1,20 @@ +--- +title: Eltwise Layer +--- + +# Eltwise Layer + +* Layer type: `Eltwise` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EltwiseLayer.html) +* Header: [`./include/caffe/layers/eltwise_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/eltwise_layer.hpp) +* CPU implementation: [`./src/caffe/layers/eltwise_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/eltwise_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cu) + +## Parameters + +* Parameters (`EltwiseParameter eltwise_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/EltwiseParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/elu.md b/docs/tutorial/layers/elu.md new file mode 100644 index 00000000..11db0f0e --- /dev/null +++ b/docs/tutorial/layers/elu.md @@ -0,0 +1,25 @@ 
+--- +title: ELU Layer +--- + +# ELU Layer + +* Layer type: `ELU` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ELULayer.html) +* Header: [`./include/caffe/layers/elu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/elu_layer.hpp) +* CPU implementation: [`./src/caffe/layers/elu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/elu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cu) + +## References + +* Clevert, Djork-Arne, Thomas Unterthiner, and Sepp Hochreiter. + "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)" [arXiv:1511.07289](https://arxiv.org/abs/1511.07289). (2015). + +## Parameters + +* Parameters (`ELUParameter elu_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ELUParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/embed.md b/docs/tutorial/layers/embed.md new file mode 100644 index 00000000..271636d8 --- /dev/null +++ b/docs/tutorial/layers/embed.md @@ -0,0 +1,20 @@ +--- +title: Embed Layer +--- + +# Embed Layer + +* Layer type: `Embed` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EmbedLayer.html) +* Header: [`./include/caffe/layers/embed_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/embed_layer.hpp) +* CPU implementation: [`./src/caffe/layers/embed_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/embed_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cu) + +## Parameters + +* Parameters (`EmbedParameter embed_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/EmbedParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/euclideanloss.md b/docs/tutorial/layers/euclideanloss.md new file mode 100644 index 00000000..c1b72084 --- /dev/null +++ b/docs/tutorial/layers/euclideanloss.md @@ -0,0 +1,16 @@ +--- +title: Euclidean Loss Layer +--- +# Sum-of-Squares / Euclidean Loss Layer + +* Layer type: `EuclideanLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EuclideanLossLayer.html) +* Header: [`./include/caffe/layers/euclidean_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/euclidean_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cu) + +The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. + +## Parameters + +Does not take any parameters. 
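+
+## Example
+
+A minimal NumPy sketch of the formula above (an illustration, not Caffe's implementation):
+
+    import numpy as np
+
+    def euclidean_loss(x1, x2):
+        """Return the summed squared difference scaled by 1/(2N), N = batch size."""
+        n = x1.shape[0]
+        diff = (x1 - x2).reshape(n, -1)
+        return np.sum(diff ** 2) / (2.0 * n)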
diff --git a/docs/tutorial/layers/exp.md b/docs/tutorial/layers/exp.md new file mode 100644 index 00000000..ef2500ec --- /dev/null +++ b/docs/tutorial/layers/exp.md @@ -0,0 +1,24 @@ +--- +title: Exponential Layer +--- + +# Exponential Layer + +* Layer type: `Exp` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ExpLayer.html) +* Header: [`./include/caffe/layers/exp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/exp_layer.hpp) +* CPU implementation: [`./src/caffe/layers/exp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/exp_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cu) + +## Parameters + +* Parameters (`Parameter exp_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ExpParameter.txt %} +{% endhighlight %} + +## See also + +* [Power layer](power.html) diff --git a/docs/tutorial/layers/filter.md b/docs/tutorial/layers/filter.md new file mode 100644 index 00000000..aeda9ee6 --- /dev/null +++ b/docs/tutorial/layers/filter.md @@ -0,0 +1,15 @@ +--- +title: Filter Layer +--- + +# Filter Layer + +* Layer type: `Filter` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FilterLayer.html) +* Header: [`./include/caffe/layers/filter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/filter_layer.hpp) +* CPU implementation: [`./src/caffe/layers/filter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/filter_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cu) + +## Parameters + +Does not take any parameters. diff --git a/docs/tutorial/layers/flatten.md b/docs/tutorial/layers/flatten.md new file mode 100644 index 00000000..ecf08262 --- /dev/null +++ b/docs/tutorial/layers/flatten.md @@ -0,0 +1,21 @@ +--- +title: Flatten Layer +--- + +# Flatten Layer + +* Layer type: `Flatten` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FlattenLayer.html) +* Header: [`./include/caffe/layers/flatten_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/flatten_layer.hpp) +* CPU implementation: [`./src/caffe/layers/flatten_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/flatten_layer.cpp) + +The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)`. 
+ +## Parameters + +* Parameters (`FlattenParameter flatten_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/FlattenParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/hdf5data.md b/docs/tutorial/layers/hdf5data.md new file mode 100644 index 00000000..d6b7ea24 --- /dev/null +++ b/docs/tutorial/layers/hdf5data.md @@ -0,0 +1,20 @@ +--- +title: HDF5 Data Layer +--- + +# HDF5 Data Layer + +* Layer type: `HDF5Data` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5DataLayer.html) +* Header: [`./include/caffe/layers/hdf5_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/hdf5_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/hdf5_data_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cu) + +## Parameters + +* Parameters (`HDF5DataParameter hdf5_data_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/HDF5DataParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/hdf5output.md b/docs/tutorial/layers/hdf5output.md new file mode 100644 index 00000000..cfbe4ddb --- /dev/null +++ b/docs/tutorial/layers/hdf5output.md @@ -0,0 +1,25 @@ +--- +title: HDF5 Output Layer +--- + +# HDF5 Output Layer + +* Layer type: `HDF5Output` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5OutputLayer.html) +* Header: [`./include/caffe/layers/hdf5_output_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_output_layer.hpp) +* CPU implementation: [`./src/caffe/layers/hdf5_output_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/hdf5_output_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cu) + +The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk. 
+
+## Parameters
+
+* Parameters (`HDF5OutputParameter hdf5_output_param`)
+    - Required
+        - `file_name`: name of file to write to
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/HDF5OutputParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/hingeloss.md b/docs/tutorial/layers/hingeloss.md
new file mode 100644
index 00000000..ef4fd95e
--- /dev/null
+++ b/docs/tutorial/layers/hingeloss.md
@@ -0,0 +1,19 @@
+---
+title: Hinge Loss Layer
+---
+
+# Hinge (L1, L2) Loss Layer
+
+* Layer type: `HingeLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HingeLossLayer.html)
+* Header: [`./include/caffe/layers/hinge_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hinge_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/hinge_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hinge_loss_layer.cpp)
+
+## Parameters
+
+* Parameters (`HingeLossParameter hinge_loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/HingeLossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/im2col.md b/docs/tutorial/layers/im2col.md
new file mode 100644
index 00000000..0badc1cd
--- /dev/null
+++ b/docs/tutorial/layers/im2col.md
@@ -0,0 +1,16 @@
+---
+title: Im2col Layer
+---
+
+# im2col
+
+* Layer type: `Im2col`
+* Header: [`./include/caffe/layers/im2col_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/im2col_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/im2col_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/im2col_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cu)
+
+`Im2col` is a helper for doing the image-to-column transformation that you most
+likely do not need to know about. This is used in Caffe's original convolution
+to do matrix multiplication by laying out all patches into a matrix.
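+
+For intuition, a rough NumPy sketch of the transformation for a single-channel
+image, stride 1, and no padding (an illustration only, not Caffe's implementation):
+
+    import numpy as np
+
+    def im2col(img, k):
+        """Lay out every k x k patch of a 2-D image as one column."""
+        h, w = img.shape
+        cols = [img[i:i + k, j:j + k].ravel()
+                for i in range(h - k + 1)
+                for j in range(w - k + 1)]
+        return np.array(cols).T  # shape: (k * k, number of patches)
+
+Convolution with a bank of filters then reduces to a single matrix product,
+`filters.reshape(n_filters, -1).dot(im2col(img, k))`.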
+
+
diff --git a/docs/tutorial/layers/imagedata.md b/docs/tutorial/layers/imagedata.md
new file mode 100644
index 00000000..82c8a600
--- /dev/null
+++ b/docs/tutorial/layers/imagedata.md
@@ -0,0 +1,27 @@
+---
+title: ImageData Layer
+---
+
+# ImageData Layer
+
+* Layer type: `ImageData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ImageDataLayer.html)
+* Header: [`./include/caffe/layers/image_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/image_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/image_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/image_data_layer.cpp)
+
+## Parameters
+
+* Parameters (`ImageDataParameter image_data_param`)
+    - Required
+        - `source`: name of a text file, with each line giving an image filename and label
+        - `batch_size`: number of images to batch together
+    - Optional
+        - `rand_skip`
+        - `shuffle` [default false]
+        - `new_height`, `new_width`: if provided, resize all images to this size
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ImageDataParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/infogainloss.md b/docs/tutorial/layers/infogainloss.md
new file mode 100644
index 00000000..86140b6c
--- /dev/null
+++ b/docs/tutorial/layers/infogainloss.md
@@ -0,0 +1,24 @@
+---
+title: Infogain Loss Layer
+---
+
+# Infogain Loss Layer
+
+* Layer type: `InfogainLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InfogainLossLayer.html)
+* Header: [`./include/caffe/layers/infogain_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/infogain_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/infogain_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/infogain_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cu)
+
+A generalization of [MultinomialLogisticLossLayer](multinomiallogisticloss.html) that takes an "information gain" (infogain) matrix specifying the "value" of all label pairs.
+
+Equivalent to the [MultinomialLogisticLossLayer](multinomiallogisticloss.html) if the infogain matrix is the identity.
+ +## Parameters + +* Parameters (`Parameter infogain_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/InfogainLossParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/innerproduct.md b/docs/tutorial/layers/innerproduct.md new file mode 100644 index 00000000..98b9bea8 --- /dev/null +++ b/docs/tutorial/layers/innerproduct.md @@ -0,0 +1,59 @@ +--- +title: Inner Product / Fully Connected Layer +--- + +# Inner Product / Fully Connected Layer + +* Layer type: `InnerProduct` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InnerProductLayer.html) +* Header: [`./include/caffe/layers/inner_product_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/inner_product_layer.hpp) +* CPU implementation: [`./src/caffe/layers/inner_product_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/inner_product_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cu) + +* Input + - `n * c_i * h_i * w_i` +* Output + - `n * c_o * 1 * 1` +* Sample + + layer { + name: "fc8" + type: "InnerProduct" + # learning rate and decay multipliers for the weights + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } + bottom: "fc7" + top: "fc8" + } + +The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). 
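+
+A minimal NumPy sketch of the forward computation (an illustration with arbitrary shapes, not Caffe's implementation):
+
+    import numpy as np
+
+    n, c_i, h_i, w_i, c_o = 64, 256, 6, 6, 1000
+    x = np.random.randn(n, c_i, h_i, w_i).astype(np.float32)
+    W = np.random.randn(c_o, c_i * h_i * w_i).astype(np.float32)  # learned weights
+    b = np.zeros(c_o, dtype=np.float32)                           # learned biases
+    y = x.reshape(n, -1).dot(W.T) + b                             # shape (n, c_o)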
+ + +## Parameters + +* Parameters (`InnerProductParameter inner_product_param`) + - Required + - `num_output` (`c_o`): the number of filters + - Strongly recommended + - `weight_filler` [default `type: 'constant' value: 0`] + - Optional + - `bias_filler` [default `type: 'constant' value: 0`] + - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/InnerProductParameter.txt %} +{% endhighlight %} + diff --git a/docs/tutorial/layers/input.md b/docs/tutorial/layers/input.md new file mode 100644 index 00000000..b74c35d2 --- /dev/null +++ b/docs/tutorial/layers/input.md @@ -0,0 +1,19 @@ +--- +title: Input Layer +--- + +# Input Layer + +* Layer type: `Input` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InputLayer.html) +* Header: [`./include/caffe/layers/input_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/input_layer.hpp) +* CPU implementation: [`./src/caffe/layers/input_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/input_layer.cpp) + +## Parameters + +* Parameters (`InputParameter input_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto)): + +{% highlight Protobuf %} +{% include proto/InputParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/log.md b/docs/tutorial/layers/log.md new file mode 100644 index 00000000..df520374 --- /dev/null +++ b/docs/tutorial/layers/log.md @@ -0,0 +1,20 @@ +--- +title: Log Layer +--- + +# Log Layer + +* Layer type: `Log` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LogLayer.html) +* Header: [`./include/caffe/layers/log_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/log_layer.hpp) +* CPU implementation: [`./src/caffe/layers/log_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/log_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cu) + +## Parameters + +* Parameters (`Parameter log_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/LogParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md new file mode 100644 index 00000000..387311c2 --- /dev/null +++ b/docs/tutorial/layers/lrn.md @@ -0,0 +1,28 @@ +--- +title: Local Response Normalization (LRN) +--- + +# Local Response Normalization (LRN) + +* Layer type: `LRN` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LRNLayer.html) +* Header: [`./include/caffe/layers/lrn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lrn_layer.hpp) +* CPU Implementation: [`./src/caffe/layers/lrn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cpp) +* CUDA GPU Implementation: [`./src/caffe/layers/lrn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cu) +* Parameters (`LRNParameter lrn_param`) + - Optional + - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN) + - `alpha` 
[default 1]: the scaling parameter (see below)
+    - `beta` [default 0.75]: the exponent (see below)
+    - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`)
+
+The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary).
+
+## Parameters
+
+* Parameters (`LRNParameter lrn_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LRNParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/lstm.md b/docs/tutorial/layers/lstm.md
new file mode 100644
index 00000000..8e4095e9
--- /dev/null
+++ b/docs/tutorial/layers/lstm.md
@@ -0,0 +1,21 @@
+---
+title: LSTM Layer
+---
+
+# LSTM Layer
+
+* Layer type: `LSTM`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LSTMLayer.html)
+* Header: [`./include/caffe/layers/lstm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lstm_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/lstm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_layer.cpp)
+* CPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cpp)
+* CUDA GPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cu)
+
+## Parameters
+
+* Parameters (`RecurrentParameter recurrent_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/RecurrentParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/memorydata.md b/docs/tutorial/layers/memorydata.md
new file mode 100644
index 00000000..754e62ae
--- /dev/null
+++ b/docs/tutorial/layers/memorydata.md
@@ -0,0 +1,25 @@
+---
+title: Memory Data Layer
+---
+
+# Memory Data Layer
+
+* Layer type: `MemoryData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MemoryDataLayer.html)
+* Header: [`./include/caffe/layers/memory_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/memory_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/memory_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/memory_data_layer.cpp)
+
+The memory data layer reads data directly from memory, without copying it. In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) to specify a source of contiguous data (as a 4D row-major array), which is read one batch-sized chunk at a time.
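+
+For example, from Python (a sketch: the prototxt name is a placeholder, both arrays must be 4-D float32, and their first dimension should be a multiple of the layer's `batch_size`):
+
+    import numpy as np
+    import caffe
+
+    net = caffe.Net('memory_data_net.prototxt', caffe.TRAIN)
+    data = np.random.randn(100, 3, 32, 32).astype(np.float32)
+    labels = np.zeros((100, 1, 1, 1), dtype=np.float32)
+    net.set_input_arrays(data, labels)
+    out = net.forward()  # consumes one batch-sized chunk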
+
+## Parameters
+
+* Parameters (`MemoryDataParameter memory_data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/MemoryDataParameter.txt %}
+{% endhighlight %}
+
+* Parameters
+    - Required
+        - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory
diff --git a/docs/tutorial/layers/multinomiallogisticloss.md b/docs/tutorial/layers/multinomiallogisticloss.md
new file mode 100644
index 00000000..a28ab914
--- /dev/null
+++ b/docs/tutorial/layers/multinomiallogisticloss.md
@@ -0,0 +1,19 @@
+---
+title: Multinomial Logistic Loss Layer
+---
+
+# Multinomial Logistic Loss Layer
+
+* Layer type: `MultinomialLogisticLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MultinomialLogisticLossLayer.html)
+* Header: [`./include/caffe/layers/multinomial_logistic_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/multinomial_logistic_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/multinomial_logistic_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/multinomial_logistic_loss_layer.cpp)
+
+## Parameters
+
+* Parameters (`LossParameter loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/mvn.md b/docs/tutorial/layers/mvn.md
new file mode 100644
index 00000000..08e44887
--- /dev/null
+++ b/docs/tutorial/layers/mvn.md
@@ -0,0 +1,20 @@
+---
+title: Mean-Variance Normalization (MVN) Layer
+---
+
+# Mean-Variance Normalization (MVN) Layer
+
+* Layer type: `MVN`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MVNLayer.html)
+* Header: [`./include/caffe/layers/mvn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/mvn_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/mvn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/mvn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cu)
+
+## Parameters
+
+* Parameters (`MVNParameter mvn_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/MVNParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/parameter.md b/docs/tutorial/layers/parameter.md
new file mode 100644
index 00000000..b7e85ec5
--- /dev/null
+++ b/docs/tutorial/layers/parameter.md
@@ -0,0 +1,21 @@
+---
+title: Parameter Layer
+---
+
+# Parameter Layer
+
+* Layer type: `Parameter`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ParameterLayer.html)
+* Header: [`./include/caffe/layers/parameter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/parameter_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/parameter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/parameter_layer.cpp)
+
+See [https://github.com/BVLC/caffe/pull/2079](https://github.com/BVLC/caffe/pull/2079).
+ +## Parameters + +* Parameters (`ParameterParameter parameter_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ParameterParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/pooling.md b/docs/tutorial/layers/pooling.md new file mode 100644 index 00000000..12669ee8 --- /dev/null +++ b/docs/tutorial/layers/pooling.md @@ -0,0 +1,47 @@ +--- +title: Pooling Layer +--- +# Pooling + +* Layer type: `Pooling` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PoolingLayer.html) +* Header: [`./include/caffe/layers/pooling_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/pooling_layer.hpp) +* CPU implementation: [`./src/caffe/layers/pooling_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/pooling_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu) + +* Input + - `n * c * h_i * w_i` +* Output + - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution. + +## Parameters + +* Parameters (`PoolingParameter pooling_param`) + - Required + - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter + - Optional + - `pool` [default MAX]: the pooling method. Currently MAX, AVE, or STOCHASTIC + - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input + - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input + + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PoolingParameter.txt %} +{% endhighlight %} + +## Sample +* Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)) + + layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 # pool over a 3x3 region + stride: 2 # step two pixels (in the bottom blob) between pooling regions + } + } diff --git a/docs/tutorial/layers/power.md b/docs/tutorial/layers/power.md new file mode 100644 index 00000000..d6617529 --- /dev/null +++ b/docs/tutorial/layers/power.md @@ -0,0 +1,46 @@ +--- +title: Power Layer +--- + +# Power Layer + +* Layer type: `Power` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PowerLayer.html) +* Header: [`./include/caffe/layers/power_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/power_layer.hpp) +* CPU implementation: [`./src/caffe/layers/power_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/power_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cu) + +The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x. 
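+
+For example, with `shift: 1`, `scale: 0.5`, and `power: 2`, an input element x = 4 maps to (1 + 0.5 * 4) ^ 2 = 9.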
+ +## Parameters +* Parameters (`PowerParameter power_param`) + - Optional + - `power` [default 1] + - `scale` [default 1] + - `shift` [default 0] + +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PowerParameter.txt %} +{% endhighlight %} + + + +## Sample + + layer { + name: "layer" + bottom: "in" + top: "out" + type: "Power" + power_param { + power: 1 + scale: 1 + shift: 0 + } + } + +## See also + +* [Exponential layer](exp.html) diff --git a/docs/tutorial/layers/prelu.md b/docs/tutorial/layers/prelu.md new file mode 100644 index 00000000..e7b7b44a --- /dev/null +++ b/docs/tutorial/layers/prelu.md @@ -0,0 +1,20 @@ +--- +title: PReLU Layer +--- + +# PReLU Layer + +* Layer type: `PReLU` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PReLULayer.html) +* Header: [`./include/caffe/layers/prelu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/prelu_layer.hpp) +* CPU implementation: [`./src/caffe/layers/prelu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/prelu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cu) + +## Parameters + +* Parameters (`PReLUParameter prelu_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/PReLUParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/python.md b/docs/tutorial/layers/python.md new file mode 100644 index 00000000..2e30b3a7 --- /dev/null +++ b/docs/tutorial/layers/python.md @@ -0,0 +1,27 @@ +--- +title: Python Layer +--- + +# Python Layer + +* Layer type: `Python` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PythonLayer.html) +* Header: [`./include/caffe/layers/python_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/python_layer.hpp) + +The Python layer allows users to add customized layers without modifying the Caffe core code. 
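+
+A minimal skeleton of such a layer (a sketch: the class, coefficient, and module name are made up, and the module must be importable from the `PYTHONPATH` when the net is loaded):
+
+    import caffe
+
+    class ScaleByTwoLayer(caffe.Layer):
+        """Multiply the single bottom blob by a constant."""
+
+        def setup(self, bottom, top):
+            self.coeff = 2.0  # could instead be parsed from self.param_str
+
+        def reshape(self, bottom, top):
+            top[0].reshape(*bottom[0].data.shape)
+
+        def forward(self, bottom, top):
+            top[0].data[...] = self.coeff * bottom[0].data
+
+        def backward(self, top, propagate_down, bottom):
+            if propagate_down[0]:
+                bottom[0].diff[...] = self.coeff * top[0].diff
+
+The net would then reference it with `python_param { module: "scale_by_two_layer" layer: "ScaleByTwoLayer" }`.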
+
+## Parameters
+
+* Parameters (`PythonParameter python_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/PythonParameter.txt %}
+{% endhighlight %}
+
+## Examples and tutorials
+
+* Simple Euclidean loss example
+    * [Python code](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/layers/pyloss.py)
+    * [Prototxt](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/linreg.prototxt)
+* [Tutorial for writing Python layers with DIGITS](https://github.com/NVIDIA/DIGITS/tree/master/examples/python-layer)
diff --git a/docs/tutorial/layers/recurrent.md b/docs/tutorial/layers/recurrent.md
new file mode 100644
index 00000000..a882b722
--- /dev/null
+++ b/docs/tutorial/layers/recurrent.md
@@ -0,0 +1,20 @@
+---
+title: Recurrent Layer
+---
+
+# Recurrent Layer
+
+* Layer type: `Recurrent`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RecurrentLayer.html)
+* Header: [`./include/caffe/layers/recurrent_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/recurrent_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/recurrent_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/recurrent_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cu)
+
+## Parameters
+
+* Parameters (`RecurrentParameter recurrent_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/RecurrentParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/reduction.md b/docs/tutorial/layers/reduction.md
new file mode 100644
index 00000000..db55414b
--- /dev/null
+++ b/docs/tutorial/layers/reduction.md
@@ -0,0 +1,20 @@
+---
+title: Reduction Layer
+---
+
+# Reduction Layer
+
+* Layer type: `Reduction`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReductionLayer.html)
+* Header: [`./include/caffe/layers/reduction_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reduction_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/reduction_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/reduction_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cu)
+
+## Parameters
+
+* Parameters (`ReductionParameter reduction_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ReductionParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/relu.md b/docs/tutorial/layers/relu.md
new file mode 100644
index 00000000..01aab0af
--- /dev/null
+++ b/docs/tutorial/layers/relu.md
@@ -0,0 +1,32 @@
+---
+title: ReLU / Rectified-Linear and Leaky-ReLU Layer
+---
+
+# ReLU / Rectified-Linear and Leaky-ReLU Layer
+
+* Layer type: `ReLU`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReLULayer.html)
+* Header: [`./include/caffe/layers/relu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/relu_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/relu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/relu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cu)
+* Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt))
+
+      layer {
+        name: "relu1"
+        type: "ReLU"
+        bottom: "conv1"
+        top: "conv1"
+      }
+
+Given an input value x, the `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function max(x, 0). The layer also supports in-place computation, meaning that the bottom and top blobs may be the same, which reduces memory consumption.
+
+## Parameters
+
+* Parameters (`ReLUParameter relu_param`)
+    - Optional
+        - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0.
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ReLUParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/reshape.md b/docs/tutorial/layers/reshape.md
new file mode 100644
index 00000000..92d23f2c
--- /dev/null
+++ b/docs/tutorial/layers/reshape.md
@@ -0,0 +1,51 @@
+---
+title: Reshape Layer
+---
+
+# Reshape Layer
+* Layer type: `Reshape`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReshapeLayer.html)
+* Header: [`./include/caffe/layers/reshape_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reshape_layer.hpp)
+* Implementation: [`./src/caffe/layers/reshape_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reshape_layer.cpp)
+
+* Input
+    - a single blob with arbitrary dimensions
+* Output
+    - the same blob, with modified dimensions, as specified by `reshape_param`
+
+* Sample
+
+        layer {
+          name: "reshape"
+          type: "Reshape"
+          bottom: "input"
+          top: "output"
+          reshape_param {
+            shape {
+              dim: 0  # copy the dimension from below
+              dim: 2
+              dim: 3
+              dim: -1 # infer it from the other dimensions
+            }
+          }
+        }
+
+The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process.
+
+Output dimensions are specified by the `ReshapeParameter` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values:
+
+* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension.
+* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s reshape or `[]` in *MATLAB*'s reshape: the dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation.
+
+As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer.
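+
+The inference rule for the special values can be sketched in NumPy terms (an
+illustration of the semantics, not the layer implementation):
+
+      import numpy as np
+
+      def infer_shape(bottom_shape, target_dims):
+          # resolve 0 (copy from bottom) and -1 (infer) entries
+          dims = [bottom_shape[i] if d == 0 else d
+                  for i, d in enumerate(target_dims)]
+          if -1 in dims:
+              known = int(np.prod([d for d in dims if d != -1]))
+              dims[dims.index(-1)] = int(np.prod(bottom_shape)) // known
+          return dims
+
+      print(infer_shape((2, 8, 3, 4), (0, 2, 3, -1)))  # [2, 2, 3, 16]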
+ +## Parameters + +* Parameters (`ReshapeParameter reshape_param`) + - Optional: (also see detailed description below) + - `shape` +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ReshapeParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/rnn.md b/docs/tutorial/layers/rnn.md new file mode 100644 index 00000000..b6fcf471 --- /dev/null +++ b/docs/tutorial/layers/rnn.md @@ -0,0 +1,19 @@ +--- +title: RNN Layer +--- + +# RNN Layer + +* Layer type: `RNN` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RNNLayer.html) +* Header: [`./include/caffe/layers/rnn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/rnn_layer.hpp) +* CPU implementation: [`./src/caffe/layers/rnn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/rnn_layer.cpp) + +## Parameters + +* Parameters (`RecurrentParameter recurrent_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/RecurrentParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/scale.md b/docs/tutorial/layers/scale.md new file mode 100644 index 00000000..0e27549a --- /dev/null +++ b/docs/tutorial/layers/scale.md @@ -0,0 +1,20 @@ +--- +title: Scale Layer +--- + +# Scale Layer + +* Layer type: `Scale` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ScaleLayer.html) +* Header: [`./include/caffe/layers/scale_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/scale_layer.hpp) +* CPU implementation: [`./src/caffe/layers/scale_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/scale_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cu) + +## Parameters + +* Parameters (`ScaleParameter scale_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ScaleParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md new file mode 100644 index 00000000..50531835 --- /dev/null +++ b/docs/tutorial/layers/sigmoid.md @@ -0,0 +1,20 @@ +--- +title: Sigmoid Layer +--- + +# Sigmoid Layer + +* Layer type: `Sigmoid` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidLayer.html) +* Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp) +* CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu) + +## Parameters + +* Parameters (`SigmoidParameter sigmoid_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SigmoidParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/sigmoidcrossentropyloss.md b/docs/tutorial/layers/sigmoidcrossentropyloss.md new file mode 100644 index 00000000..a6e42cad --- /dev/null +++ b/docs/tutorial/layers/sigmoidcrossentropyloss.md 
@@ -0,0 +1,13 @@
+---
+title: Sigmoid Cross-Entropy Loss Layer
+---
+
+# Sigmoid Cross-Entropy Loss Layer
+
+* Layer type: `SigmoidCrossEntropyLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidCrossEntropyLossLayer.html)
+* Header: [`./include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu)
+
+Computes the cross-entropy (logistic) loss of the sigmoid of its inputs, combining the sigmoid and the cross-entropy in a single layer for better numerical stability, analogous to the way `SoftmaxWithLoss` combines the softmax and the multinomial logistic loss.
diff --git a/docs/tutorial/layers/silence.md b/docs/tutorial/layers/silence.md
new file mode 100644
index 00000000..2c37a9cd
--- /dev/null
+++ b/docs/tutorial/layers/silence.md
@@ -0,0 +1,23 @@
+---
+title: Silence Layer
+---
+
+# Silence Layer
+
+* Layer type: `Silence`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SilenceLayer.html)
+* Header: [`./include/caffe/layers/silence_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/silence_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/silence_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/silence_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cu)
+
+Silences a blob, so that it is not printed as a network output.
+
+## Parameters
+
+* Parameters (`SilenceParameter silence_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SilenceParameter.txt %}
+{% endhighlight %}
+
diff --git a/docs/tutorial/layers/slice.md b/docs/tutorial/layers/slice.md
new file mode 100644
index 00000000..a492f1e8
--- /dev/null
+++ b/docs/tutorial/layers/slice.md
@@ -0,0 +1,42 @@
+---
+title: Slice Layer
+---
+
+# Slice Layer
+
+* Layer type: `Slice`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SliceLayer.html)
+* Header: [`./include/caffe/layers/slice_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/slice_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/slice_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/slice_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cu)
+
+The `Slice` layer is a utility layer that slices an input blob into multiple output blobs along a given dimension (currently num or channel only) with given slice indices.
+
+* Sample
+
+      layer {
+        name: "slicer_label"
+        type: "Slice"
+        bottom: "label"
+        ## Example of label with a shape N x 3 x 1 x 1
+        top: "label1"
+        top: "label2"
+        top: "label3"
+        slice_param {
+          axis: 1
+          slice_point: 1
+          slice_point: 2
+        }
+      }
+
+`axis` indicates the target axis; `slice_point` indicates indices in the selected dimension (the number of indices must be equal to the number of top blobs minus one).
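+
+For intuition, the sample above corresponds to the following NumPy split (an
+illustration of the semantics only):
+
+      import numpy as np
+
+      label = np.zeros((32, 3, 1, 1))               # N x 3 x 1 x 1, with N = 32
+      l1, l2, l3 = np.split(label, [1, 2], axis=1)  # slice_point 1 and 2 on axis 1
+      print(l1.shape, l2.shape, l3.shape)           # three N x 1 x 1 x 1 outputs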
+ +## Parameters + +* Parameters (`SliceParameter slice_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SliceParameter.txt %} +{% endhighlight %} + diff --git a/docs/tutorial/layers/softmax.md b/docs/tutorial/layers/softmax.md new file mode 100644 index 00000000..e5d53425 --- /dev/null +++ b/docs/tutorial/layers/softmax.md @@ -0,0 +1,24 @@ +--- +title: Softmax Layer +--- + +# Softmax Layer + +* Layer type: `Softmax` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxLayer.html) +* Header: [`./include/caffe/layers/softmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_layer.hpp) +* CPU implementation: [`./src/caffe/layers/softmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/softmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cu) + +## Parameters + +* Parameters (`SoftmaxParameter softmax_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/SoftmaxParameter.txt %} +{% endhighlight %} + +## See also + +* [Softmax loss layer](softmaxwithloss.html) diff --git a/docs/tutorial/layers/softmaxwithloss.md b/docs/tutorial/layers/softmaxwithloss.md new file mode 100644 index 00000000..d9a6774a --- /dev/null +++ b/docs/tutorial/layers/softmaxwithloss.md @@ -0,0 +1,33 @@ +--- +title: Softmax with Loss Layer +--- + +# Softmax with Loss Layer + +* Layer type: `SoftmaxWithLoss` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxWithLossLayer.html) +* Header: [`./include/caffe/layers/softmax_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_loss_layer.hpp) +* CPU implementation: [`./src/caffe/layers/softmax_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/softmax_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cu) + +The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. 
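+
+A NumPy sketch of the forward computation for a single 1-D score vector (an
+illustration only; the hypothetical `softmax_loss` below uses the standard
+max-subtraction trick, and the fused layer additionally enjoys the simple
+gradient softmax(scores) - onehot(label)):
+
+      import numpy as np
+
+      def softmax_loss(scores, label):
+          # multinomial logistic loss of the softmax of the scores
+          shifted = scores - scores.max()  # guard against overflow in exp
+          log_probs = shifted - np.log(np.exp(shifted).sum())
+          return -log_probs[label]
+
+      print(softmax_loss(np.array([1.0, 2.0, 3.0]), label=2))  # ~0.4076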
+
+## Parameters
+
+* Parameters (`SoftmaxParameter softmax_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SoftmaxParameter.txt %}
+{% endhighlight %}
+
+* Parameters (`LossParameter loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LossParameter.txt %}
+{% endhighlight %}
+
+## See also
+
+* [Softmax layer](softmax.html)
diff --git a/docs/tutorial/layers/split.md b/docs/tutorial/layers/split.md
new file mode 100644
index 00000000..4fb71d1f
--- /dev/null
+++ b/docs/tutorial/layers/split.md
@@ -0,0 +1,17 @@
+---
+title: Split Layer
+---
+
+# Split Layer
+
+* Layer type: `Split`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SplitLayer.html)
+* Header: [`./include/caffe/layers/split_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/split_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/split_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/split_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cu)
+
+The `Split` layer is a utility layer that splits an input blob into multiple output blobs. This is used when a blob is fed into multiple subsequent layers.
+
+## Parameters
+
+Does not take any parameters.
diff --git a/docs/tutorial/layers/spp.md b/docs/tutorial/layers/spp.md
new file mode 100644
index 00000000..26e58620
--- /dev/null
+++ b/docs/tutorial/layers/spp.md
@@ -0,0 +1,20 @@
+---
+title: Spatial Pyramid Pooling Layer
+---
+
+# Spatial Pyramid Pooling Layer
+
+* Layer type: `SPP`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SPPLayer.html)
+* Header: [`./include/caffe/layers/spp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/spp_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/spp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/spp_layer.cpp)
+
+
+## Parameters
+
+* Parameters (`SPPParameter spp_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SPPParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/tanh.md b/docs/tutorial/layers/tanh.md
new file mode 100644
index 00000000..36063459
--- /dev/null
+++ b/docs/tutorial/layers/tanh.md
@@ -0,0 +1,18 @@
+---
+title: TanH Layer
+---
+
+# TanH Layer
+
+* Header: [`./include/caffe/layers/tanh_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tanh_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/tanh_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/tanh_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cu)
+
+## Parameters
+
+* Parameters (`TanHParameter tanh_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/TanHParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/threshold.md b/docs/tutorial/layers/threshold.md
new file mode 100644
index 00000000..819e9e6f
--- /dev/null
+++ b/docs/tutorial/layers/threshold.md
@@ -0,0 +1,18 @@
+--- +title: Threshold Layer +--- + +# Threshold Layer + +* Header: [`./include/caffe/layers/threshold_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/threshold_layer.hpp) +* CPU implementation: [`./src/caffe/layers/threshold_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/threshold_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cu) + +## Parameters + +* Parameters (`ThresholdParameter threshold_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ThresholdParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/tile.md b/docs/tutorial/layers/tile.md new file mode 100644 index 00000000..ea03aaa4 --- /dev/null +++ b/docs/tutorial/layers/tile.md @@ -0,0 +1,20 @@ +--- +title: Tile Layer +--- + +# Tile Layer + +* Layer type: `Tile` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1TileLayer.html) +* Header: [`./include/caffe/layers/tile_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tile_layer.hpp) +* CPU implementation: [`./src/caffe/layers/tile_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/tile_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cu) + +## Parameters + +* Parameters (`TileParameter tile_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/TileParameter.txt %} +{% endhighlight %} diff --git a/docs/tutorial/layers/windowdata.md b/docs/tutorial/layers/windowdata.md new file mode 100644 index 00000000..0cb4a8df --- /dev/null +++ b/docs/tutorial/layers/windowdata.md @@ -0,0 +1,19 @@ +--- +title: WindowData Layer +--- + +# WindowData Layer + +* Layer type: `WindowData` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1WindowDataLayer.html) +* Header: [`./include/caffe/layers/window_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/window_data_layer.hpp) +* CPU implementation: [`./src/caffe/layers/window_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/window_data_layer.cpp) + +## Parameters + +* Parameters (`WindowDataParameter`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/WindowDataParameter.txt %} +{% endhighlight %} diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh index 0e28bd71..4837587a 100755 --- a/scripts/build_docs.sh +++ b/scripts/build_docs.sh @@ -12,6 +12,9 @@ cd $ROOT_DIR # Gather docs. scripts/gather_examples.sh +# Split caffe.proto for inclusion by layer catalogue. +scripts/split_caffe_proto.py + # Generate developer docs. 
 make docs
diff --git a/scripts/split_caffe_proto.py b/scripts/split_caffe_proto.py
new file mode 100755
index 00000000..7e9dc3e7
--- /dev/null
+++ b/scripts/split_caffe_proto.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import re
+import os
+import errno
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+
+# a regex to match the parameter definitions in caffe.proto
+r = re.compile(r'(?://.*\n)*message ([^ ]*) \{\n(?:  .*\n|\n)*\}')
+
+# create directory to put caffe.proto fragments
+try:
+    os.mkdir(
+        os.path.join(script_path,
+                     '../docs/_includes/'))
+    os.mkdir(
+        os.path.join(script_path,
+                     '../docs/_includes/proto/'))
+except OSError as exception:
+    if exception.errno != errno.EEXIST:
+        raise
+
+caffe_proto_fn = os.path.join(
+    script_path,
+    '../src/caffe/proto/caffe.proto')
+
+with open(caffe_proto_fn, 'r') as fin:
+
+    # finditer's optional second argument is a start offset, not a flags
+    # value, so no flags are passed here
+    for m in r.finditer(fin.read()):
+        fn = os.path.join(
+            script_path,
+            '../docs/_includes/proto/%s.txt' % m.group(1))
+        with open(fn, 'w') as fout:
+            fout.write(m.group(0))
From fb52c7ccd2b21b26621f5abe35e776736aa9db91 Mon Sep 17 00:00:00 2001
From: "Jonathan R. Williford" 
Date: Wed, 21 Dec 2016 11:33:42 +0100
Subject: [PATCH 098/183] Add Debian codenames and make link.

Add the Debian codenames / versions, so it is easier to tell which
Debian version is which in the future when the releases are promoted.

Revise commit according to CDLuminate's comments. Removed rolling
release numbers. Mention that Debian/testing can install Caffe using
the packages.
---
 docs/install_apt_debian.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
index 745a6f4f..1580dc43 100644
--- a/docs/install_apt_debian.md
+++ b/docs/install_apt_debian.md
@@ -4,10 +4,13 @@ title: "Installation: Debian"
 
 # Debian Installation
 
-Caffe packages are available for `Debian/unstable`. Debian/stable users
-should take a look at Ubuntu installation instruction.
+Caffe packages are available for `Debian/unstable`. Debian/stable
+(jessie) users should take a look at [Ubuntu installation instruction](
+install_apt.html). Debian/testing (stretch) users may be able to get Caffe
+to work using the packages in Debian/unstable, but it is beyond the scope of
+this guide.
 
-Only experienced linux users are recommended to try Debian/unstable (Sid).
+Only experienced linux users are recommended to try Debian/unstable (Sid). 
 
-Last update: Dec.21 2016
+Last update: Dec.21 2016
 
 ## Debian/unstable

From 5c437b13d2afde8f8e961e1e8a50fda060cb4519 Mon Sep 17 00:00:00 2001
From: Zhou Mo 
Date: Wed, 4 Jan 2017 02:49:11 +0000
Subject: [PATCH 099/183] docs: update debian installation guide.

Thanks to @lukeyeager for comments.
---
 docs/install_apt_debian.md | 29 +++++++++++++++++++----------
 docs/installation.md       |  2 +-
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
index 1580dc43..3175f693 100644
--- a/docs/install_apt_debian.md
+++ b/docs/install_apt_debian.md
@@ -10,9 +10,9 @@ install_apt.html). Debian/testing (stretch) users may be able to get Caffe
 to work using the packages in Debian/unstable, but it is beyond the scope of
 this guide.
 
-Only experienced linux users are recommended to try Debian/unstable (Sid).
+Only experienced linux users are recommended to try Debian/unstable (Sid). 
-
-Last update: Dec.21 2016
+Last update: 2017-01-04
 
 ## Debian/unstable
 
@@ -52,6 +52,7 @@ $ sudo apt build-dep [ caffe-cpu | caffe-cuda ] # the most elegant wa
 $ apt source [ caffe-cpu | caffe-cuda ]  # download the source tarball and extract
 $ cd caffe-XXXX
 [ ... optional, customize caffe code/build ... ]
+$ dch -llocal "Modified XXX in order to XXX"  # write your one-line changelog
 $ debuild -B -j4  # build caffe with 4 parallel jobs (similar to make -j4)
 [ ... building ...]
 $ debc  # optional, if you want to check the package contents
@@ -59,6 +60,12 @@ $ sudo debi  # optional, install the ge
 ```
 The resulting deb packages can be found under the parent directory of the source tree.
 
+Note, the `dch ...` command line above is for bumping the package version number
+and adding an entry to the package changelog. If you would like to write
+more than one changelog entry, use subsequent `dch` commands (see `man 1 dch`)
+instead of manually modifying `debian/changelog`, unless you know how to keep its format correct.
+The changelog will be installed at e.g. `/usr/share/doc/caffe-cpu/changelog.Debian.gz`.
+
 ### Source installation
 
 Source installation under Debian/unstable is similar to that of Ubuntu, but
 here is a more elegant way to pull caffe build dependencies:
@@ -71,15 +78,13 @@ Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
 
 ### Notes
 
 * Consider re-compiling OpenBLAS locally with optimization flags for the sake of
-performance. This is highly recommended if you are writing a paper.
+performance. This is highly recommended for any kind of production use, including
+academic research.
 
 * If you are installing `caffe-cuda`, APT will automatically pull some of the
-CUDA packages and the nvidia driver packages. Please take care if you have
+CUDA packages and the nvidia driver packages. Please be careful if you have
 manually installed or hacked nvidia driver or CUDA toolkit or any other
-related stuff, because in this case it may fail.
-
-* If you encountered any problem when installing `caffe-*`, please report bug
-to Debian via Debian's bug tracking system. See https://www.debian.org/Bugs/ .
+related stuff, because in this case APT may fail.
 
 * Additionally, a manpage (`man caffe`) and a bash completion script
 (`caffe <TAB>`, `caffe train <TAB>`) are provided.
@@ -88,6 +93,10 @@ Both of the two files are still not merged into caffe master.
 * The python interface is Python 3 version: `python3-caffe-{cpu,cuda}`.
 No plan to support python2.
 
+* If you encounter any problem related to the packaging system (e.g. a failure to install `caffe-*`),
+please report a bug to Debian via Debian's bug tracking system. See https://www.debian.org/Bugs/ .
+Patches and suggestions are also welcome.
+
 ## FAQ
 
 * where is caffe-cudnn?
@@ -96,11 +105,11 @@ CUDNN library seems not redistributable currently. If you really want the
 caffe-cudnn deb packages, the workaround is to install cudnn by yourself,
 and hack the packaging scripts, then build your customized package.
 
-* I installed the CPU version, How can I switch to the CUDA version?
+* I installed the CPU version. How can I switch to the CUDA version?
 
 `sudo apt install caffe-cuda`, apt's dependency resolver is smart enough to deal with this.
 
-* Where is the examples, the models and other documentation stuff?
+* Where are the examples, the models and other documentation stuff?
 ```
 sudo apt install caffe-doc
 dpkg -L caffe-doc
 ```
diff --git a/docs/installation.md b/docs/installation.md
index 14ec4674..6b2cd3bd 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -12,7 +12,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi
 
 - [Docker setup](https://github.com/BVLC/caffe/tree/master/docker) *out-of-the-box brewing*
 - [Ubuntu installation](install_apt.html) *the standard platform*
-- [Debian installation](install_apt_debian.html) *deploy caffe with a single command*
+- [Debian installation](install_apt_debian.html) *install caffe with a single command*
 - [OS X installation](install_osx.html)
 - [RHEL / CentOS / Fedora installation](install_yum.html)
 - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont*

From 369a1f49fa7e40f39827c1dcaede224b78f6c10c Mon Sep 17 00:00:00 2001
From: Zhou Mo 
Date: Thu, 5 Jan 2017 05:00:37 +0000
Subject: [PATCH 100/183] docs: add some tables to debian install guide and
 misc update

docs: change UTF-8 characters
---
 docs/install_apt_debian.md | 70 +++++++++++++++++++++++++++++---------
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
index 3175f693..0d39e3ae 100644
--- a/docs/install_apt_debian.md
+++ b/docs/install_apt_debian.md
@@ -4,39 +4,51 @@ title: "Installation: Debian"
 
 # Debian Installation
 
-Caffe packages are available for `Debian/unstable`. Debian/stable
-(jessie) users should take a look at [Ubuntu installation instruction](
-install_apt.html). Debian/testing (stretch) users may be able to get Caffe
-to work using the packages in Debian/unstable, but it is beyond the scope of
-this guide.
+Caffe packages are available for several Debian versions, as shown in the
+following chart:
 
-Only experienced linux users are recommended to try Debian/unstable (Sid). 
+```
+Your Distro     | CPU_ONLY | CUDA | Alias
+----------------+----------+------+-------------------
+Debian/stable   | ✘        | ✘    | Debian Jessie
+Debian/testing  | ✔        | ☐    | Debian Stretch/Sid
+Debian/unstable | ✔        | ✔    | Debian Sid
+```
 
-Last update: 2017-01-04
+* `✘ ` You should take a look at the [Ubuntu installation instructions](install_apt.html).
+
+* `✔ ` You can install caffe with a single command by following this guide.
+
+* `☐ ` The same as `✔ `. However, it will no longer work once Debian/Stretch becomes the stable branch.
+
+Last update: 2017-01-05
+
+## Binary installation with APT
 
-## Debian/unstable
+Apart from the installation methods based on source, Debian/unstable
+and Debian/testing users can install pre-compiled Caffe packages via the official archive.
 
-Apart from the installation methods based on source, Debian/unstable
-users can install pre-compiled Caffe packages via the official archive.
-
-### Binary installation
 Make sure that there is something like the following in your
 `/etc/apt/sources.list`:
 ```
-deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+deb http://MIRROR/debian CODENAME main contrib non-free
```
+where `MIRROR` is your favorite Debian mirror, and `CODENAME ∈ {testing,stretch,sid}`.
+
 Then we update APT cache and directly install Caffe. Note, the cpu version and
 the cuda version cannot be installed at the same time.
 ```
 # apt update
 # apt install [ caffe-cpu | caffe-cuda ]
+# caffe                                              # command line interface working
+# python3 -c 'import caffe; print(caffe.__path__)'   # python3 interface working
 ```
 It should work out of the box.
 
 #### Customizing caffe packages
 
-Some users may need to customize the Caffe package. Here is a brief
-guide of producing the customized `.deb` packages.
+Some users may need to customize the Caffe package. A full treatment of
+Debian packaging is beyond this guide, so here is only a brief outline of
+producing customized `.deb` packages.
 
 Make sure that there is something like this in your `/etc/apt/sources.list`:
 ```
@@ -66,7 +78,7 @@ more than one changelog entry, use subsequent `dch` commands (see `man 1 dch`)
 instead of manually modifying `debian/changelog`, unless you know how to keep its format correct.
 The changelog will be installed at e.g. `/usr/share/doc/caffe-cpu/changelog.Debian.gz`.
 
-### Source installation
+## Source installation
 
 Source installation under Debian/unstable is similar to that of Ubuntu, but
 here is a more elegant way to pull caffe build dependencies:
@@ -75,7 +87,27 @@ $ sudo apt build-dep [ caffe-cpu | caffe-cuda ]
 ```
 Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
 
-### Notes
+#### Compiler Combinations
+
+Some users may find their favorite compiler doesn't work well with CUDA.
+```
+CXX compiler | CUDA 7.5 | CUDA 8.0 |
+-------------+----------+----------+
+GCC-7        | ?        | ?        |
+GCC-6        | ✘        | ✘        |
+GCC-5        | ✔ [1]    | ✔        |
+CLANG-4.0    | ?        | ?        |
+CLANG-3.9    | ✘        | ✘        |
+CLANG-3.8    | ?        | ✔        |
+```
+
+`[1]` CUDA 7.5's `host_config.h` must be patched before working with GCC-5.
+
+BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's.
+You may encounter failures when linking GCC-4.X object files against GCC-5 libraries.
+(See https://wiki.debian.org/GCC5 )
+
+## Notes
 
 * Consider re-compiling OpenBLAS locally with optimization flags for the sake of
 performance. This is highly recommended for any kind of production use, including
@@ -115,3 +147,9 @@ and hack the packaging scripts, then build your customized package.
 sudo apt install caffe-doc
 dpkg -L caffe-doc
 ```
+
+* Where can I find the Debian package status?
+
+https://tracker.debian.org/pkg/caffe          (for the CPU_ONLY version)
+
+https://tracker.debian.org/pkg/caffe-contrib  (for the CUDA version)

From 2317fa19d3f5a65cb22adcbd3792ea248996744e Mon Sep 17 00:00:00 2001
From: Cyprien Noel 
Date: Tue, 22 Nov 2016 13:14:45 -0800
Subject: [PATCH 101/183] Logging from python, e.g.
for lower log level on multi-GPU workers --- python/caffe/__init__.py | 2 +- python/caffe/_caffe.cpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 35868a40..5fc6ec9b 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver -from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index bdee75ac..0a86045b 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -51,6 +51,19 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } +void InitLog(int level) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = level; + ::google::InitGoogleLogging(""); + ::google::InstallFailureSignalHandler(); +} +void InitLogInfo() { + InitLog(google::INFO); +} +void Log(const string& s) { + LOG(INFO) << s; +} + void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); } // For convenience, check that input files can be opened, and raise an @@ -283,6 +296,9 @@ BOOST_PYTHON_MODULE(_caffe) { bp::scope().attr("__version__") = AS_STRING(CAFFE_VERSION); // Caffe utility functions + bp::def("init_log", &InitLog); + bp::def("init_log", &InitLogInfo); + bp::def("log", &Log); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); bp::def("set_random_seed", &set_random_seed); From 3ba20549b7f49a76cd023d19f781a6891b2c2122 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 6 Jan 2017 14:55:12 -0800 Subject: [PATCH 102/183] Switched multi-GPU to NCCL --- CMakeLists.txt | 1 + Makefile | 6 + Makefile.config.example | 4 + cmake/Dependencies.cmake | 15 +- cmake/Modules/FindNCCL.cmake | 26 + cmake/Summary.cmake | 1 + include/caffe/blob.hpp | 1 + include/caffe/common.hpp | 14 +- include/caffe/data_reader.hpp | 82 --- include/caffe/internal_thread.hpp | 4 +- include/caffe/layer.hpp | 43 +- include/caffe/layers/base_data_layer.hpp | 5 +- include/caffe/layers/data_layer.hpp | 7 +- include/caffe/layers/hdf5_data_layer.hpp | 6 +- include/caffe/layers/python_layer.hpp | 4 +- include/caffe/net.hpp | 40 +- include/caffe/parallel.hpp | 96 ++-- include/caffe/solver.hpp | 40 +- include/caffe/syncedmem.hpp | 14 +- include/caffe/util/math_functions.hpp | 5 + include/caffe/util/nccl.hpp | 37 ++ src/caffe/blob.cpp | 18 + src/caffe/common.cpp | 5 +- src/caffe/data_reader.cpp | 119 ---- src/caffe/internal_thread.cpp | 10 +- src/caffe/layer.cpp | 20 - src/caffe/layers/base_data_layer.cpp | 45 +- src/caffe/layers/base_data_layer.cu | 21 +- src/caffe/layers/data_layer.cpp | 82 ++- src/caffe/layers/hdf5_data_layer.cpp | 55 +- src/caffe/layers/hdf5_data_layer.cu | 22 +- src/caffe/layers/image_data_layer.cpp | 13 +- src/caffe/layers/window_data_layer.cpp | 8 +- src/caffe/net.cpp | 47 +- src/caffe/parallel.cpp | 514 ++++++++---------- src/caffe/proto/caffe.proto | 9 +- src/caffe/solver.cpp | 44 +- src/caffe/solvers/adagrad_solver.cpp | 1 - src/caffe/solvers/nesterov_solver.cpp | 1 - src/caffe/solvers/sgd_solver.cpp | 4 +- 
src/caffe/syncedmem.cpp | 59 +- src/caffe/test/test_data_layer.cpp | 36 ++ src/caffe/test/test_gradient_based_solver.cpp | 34 +- src/caffe/test/test_hdf5data_layer.cpp | 30 + src/caffe/util/blocking_queue.cpp | 5 - src/caffe/util/db_lmdb.cpp | 2 +- src/caffe/util/math_functions.cu | 20 + tools/caffe.cpp | 11 +- 48 files changed, 813 insertions(+), 873 deletions(-) create mode 100644 cmake/Modules/FindNCCL.cmake delete mode 100644 include/caffe/data_reader.hpp create mode 100644 include/caffe/util/nccl.hpp delete mode 100644 src/caffe/data_reader.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9..3af394f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ include(cmake/ConfigGen.cmake) # ---[ Options caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY) +caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) set(python_version "2" CACHE STRING "Specify which Python version to use") diff --git a/Makefile b/Makefile index ccc4d8b9..65d08f7d 100644 --- a/Makefile +++ b/Makefile @@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1) COMMON_FLAGS += -DUSE_CUDNN endif +# NCCL acceleration configuration +ifeq ($(USE_NCCL), 1) + LIBRARIES += nccl + COMMON_FLAGS += -DUSE_NCCL +endif + # configure IO libraries ifeq ($(USE_OPENCV), 1) COMMON_FLAGS += -DUSE_OPENCV diff --git a/Makefile.config.example b/Makefile.config.example index 07bed63a..541cf807 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -94,6 +94,10 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib # INCLUDE_DIRS += $(shell brew --prefix)/include # LIBRARY_DIRS += $(shell brew --prefix)/lib +# NCCL acceleration switch (uncomment to build with NCCL) +# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0) +# USE_NCCL := 1 + # Uncomment to use `pkg-config` to specify OpenCV library paths. # (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.) 
# USE_PKG_CONFIG := 1 diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ae9ce8e4..ba28a128 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -67,6 +67,13 @@ if(NOT HAVE_CUDA) add_definitions(-DCPU_ONLY) endif() +if(USE_NCCL) + find_package(NCCL REQUIRED) + include_directories(SYSTEM ${NCCL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES}) + add_definitions(-DUSE_NCCL) +endif() + # ---[ OpenCV if(USE_OPENCV) find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) @@ -119,18 +126,18 @@ if(BUILD_python) find_package(NumPy 1.7.1) # Find the matching boost python implementation set(version ${PYTHONLIBS_VERSION_STRING}) - + STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND) STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} ) - + STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} ) if("${has_more_version}" STREQUAL "") break() diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 00000000..c8845934 --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,26 @@ +set(NCCL_INC_PATHS + /usr/include + /usr/local/include + $ENV{NCCL_DIR}/include + ) + +set(NCCL_LIB_PATHS + /lib + /lib64 + /usr/lib + /usr/lib64 + /usr/local/lib + /usr/local/lib64 + $ENV{NCCL_DIR}/lib + ) + +find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS}) +find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES) + +if (NCCL_FOUND) + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES) +endif () diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index ba025cf8..ed8c2526 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -117,6 +117,7 @@ function(caffe_print_configuration_summary) caffe_status(" USE_OPENCV : ${USE_OPENCV}") caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") caffe_status(" USE_LMDB : ${USE_LMDB}") + caffe_status(" USE_NCCL : ${USE_NCCL}") caffe_status(" ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}") caffe_status("") caffe_status("Dependencies:") diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp index af360ac2..2f59471c 100644 --- a/include/caffe/blob.hpp +++ b/include/caffe/blob.hpp @@ -220,6 +220,7 @@ class Blob { void set_cpu_data(Dtype* data); const int* gpu_shape() const; const Dtype* gpu_data() const; + void set_gpu_data(Dtype* data); const Dtype* cpu_diff() const; const Dtype* gpu_diff() const; Dtype* mutable_cpu_data(); diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp index 3c6a076e..4904d1d8 100644 --- a/include/caffe/common.hpp +++ b/include/caffe/common.hpp @@ -158,11 +158,14 @@ class Caffe { // Search from start_id to the highest possible device ordinal, // return the ordinal of the first available device. 
static int FindDevice(const int start_id = 0); - // Parallel training info + // Parallel training inline static int solver_count() { return Get().solver_count_; } inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static bool root_solver() { return Get().root_solver_; } - inline static void set_root_solver(bool val) { Get().root_solver_ = val; } + inline static int solver_rank() { return Get().solver_rank_; } + inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } + inline static bool multiprocess() { return Get().multiprocess_; } + inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } + inline static bool root_solver() { return Get().solver_rank_ == 0; } protected: #ifndef CPU_ONLY @@ -172,8 +175,11 @@ class Caffe { shared_ptr random_generator_; Brew mode_; + + // Parallel training int solver_count_; - bool root_solver_; + int solver_rank_; + bool multiprocess_; private: // The private constructor to avoid duplicate instantiation. diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp deleted file mode 100644 index 8ed5542c..00000000 --- a/include/caffe/data_reader.hpp +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef CAFFE_DATA_READER_HPP_ -#define CAFFE_DATA_READER_HPP_ - -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/internal_thread.hpp" -#include "caffe/util/blocking_queue.hpp" -#include "caffe/util/db.hpp" - -namespace caffe { - -/** - * @brief Reads data from a source to queues available to data layers. - * A single reading thread is created per source, even if multiple solvers - * are running in parallel, e.g. for multi-GPU training. This makes sure - * databases are read sequentially, and that each solver accesses a different - * subset of the database. Data is distributed to solvers in a round-robin - * way to keep parallel training deterministic. - */ -class DataReader { - public: - explicit DataReader(const LayerParameter& param); - ~DataReader(); - - inline BlockingQueue& free() const { - return queue_pair_->free_; - } - inline BlockingQueue& full() const { - return queue_pair_->full_; - } - - protected: - // Queue pairs are shared between a body and its readers - class QueuePair { - public: - explicit QueuePair(int size); - ~QueuePair(); - - BlockingQueue free_; - BlockingQueue full_; - - DISABLE_COPY_AND_ASSIGN(QueuePair); - }; - - // A single body is created per source - class Body : public InternalThread { - public: - explicit Body(const LayerParameter& param); - virtual ~Body(); - - protected: - void InternalThreadEntry(); - void read_one(db::Cursor* cursor, QueuePair* qp); - - const LayerParameter param_; - BlockingQueue > new_queue_pairs_; - - friend class DataReader; - - DISABLE_COPY_AND_ASSIGN(Body); - }; - - // A source is uniquely identified by its layer name + path, in case - // the same database is read from two different locations in the net. 
- static inline string source_key(const LayerParameter& param) { - return param.name() + ":" + param.data_param().source(); - } - - const shared_ptr queue_pair_; - shared_ptr body_; - - static map > bodies_; - -DISABLE_COPY_AND_ASSIGN(DataReader); -}; - -} // namespace caffe - -#endif // CAFFE_DATA_READER_HPP_ diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp index 6a8c5a02..0ba67665 100644 --- a/include/caffe/internal_thread.hpp +++ b/include/caffe/internal_thread.hpp @@ -42,8 +42,8 @@ class InternalThread { bool must_stop(); private: - void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count, - bool root_solver); + void entry(int device, Caffe::Brew mode, int rand_seed, + int solver_count, int solver_rank, bool multiprocess); shared_ptr thread_; }; diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 10f353f9..30dbfd53 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -38,7 +38,7 @@ class Layer { * layer. */ explicit Layer(const LayerParameter& param) - : layer_param_(param), is_shared_(false) { + : layer_param_(param) { // Set phase and copy blobs (if there are any). phase_ = param.phase(); if (layer_param_.blobs_size() > 0) { @@ -66,7 +66,6 @@ class Layer { */ void SetUp(const vector*>& bottom, const vector*>& top) { - InitMutex(); CheckBlobCounts(bottom, top); LayerSetUp(bottom, top); Reshape(bottom, top); @@ -92,30 +91,6 @@ class Layer { virtual void LayerSetUp(const vector*>& bottom, const vector*>& top) {} - /** - * @brief Whether a layer should be shared by multiple nets during data - * parallelism. By default, all layers except for data layers should - * not be shared. data layers should be shared to ensure each worker - * solver access data sequentially during data parallelism. - */ - virtual inline bool ShareInParallel() const { return false; } - - /** @brief Return whether this layer is actually shared by other nets. - * If ShareInParallel() is true and using more than one GPU and the - * net has TRAIN phase, then this function is expected return true. - */ - inline bool IsShared() const { return is_shared_; } - - /** @brief Set whether this layer is actually shared by other nets - * If ShareInParallel() is true and using more than one GPU and the - * net has TRAIN phase, then is_shared should be set true. - */ - inline void SetShared(bool is_shared) { - CHECK(ShareInParallel() || !is_shared) - << type() << "Layer does not support sharing."; - is_shared_ = is_shared; - } - /** * @brief Adjust the shapes of top blobs and internal buffers to accommodate * the shapes of the bottom blobs. 
@@ -428,19 +403,6 @@ class Layer { } private: - /** Whether this layer is actually shared by other nets*/ - bool is_shared_; - - /** The mutex for sequential forward if this layer is shared */ - shared_ptr forward_mutex_; - - /** Initialize forward_mutex_ */ - void InitMutex(); - /** Lock forward_mutex_ if this layer is shared */ - void Lock(); - /** Unlock forward_mutex_ if this layer is shared */ - void Unlock(); - DISABLE_COPY_AND_ASSIGN(Layer); }; // class Layer @@ -450,8 +412,6 @@ class Layer { template inline Dtype Layer::Forward(const vector*>& bottom, const vector*>& top) { - // Lock during forward to ensure sequential forward - Lock(); Dtype loss = 0; Reshape(bottom, top); switch (Caffe::mode()) { @@ -482,7 +442,6 @@ inline Dtype Layer::Forward(const vector*>& bottom, default: LOG(FATAL) << "Unknown caffe mode."; } - Unlock(); return loss; } diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 2c49b731..925b019d 100644 --- a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -68,15 +68,16 @@ class BasePrefetchingDataLayer : const vector*>& top); // Prefetches batches (asynchronously if to GPU memory) - static const int PREFETCH_COUNT = 3; + static const int PREFETCH_COUNT = 4; // same as proto protected: virtual void InternalThreadEntry(); virtual void load_batch(Batch* batch) = 0; - Batch prefetch_[PREFETCH_COUNT]; + vector > > prefetch_; BlockingQueue*> prefetch_free_; BlockingQueue*> prefetch_full_; + Batch* prefetch_current_; Blob transformed_data_; }; diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp index 6c361791..dec58180 100644 --- a/include/caffe/layers/data_layer.hpp +++ b/include/caffe/layers/data_layer.hpp @@ -4,7 +4,6 @@ #include #include "caffe/blob.hpp" -#include "caffe/data_reader.hpp" #include "caffe/data_transformer.hpp" #include "caffe/internal_thread.hpp" #include "caffe/layer.hpp" @@ -29,9 +28,13 @@ class DataLayer : public BasePrefetchingDataLayer { virtual inline int MaxTopBlobs() const { return 2; } protected: + void Next(); + bool Skip(); virtual void load_batch(Batch* batch); - DataReader reader_; + shared_ptr db_; + shared_ptr cursor_; + uint64_t offset_; }; } // namespace caffe diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp index b04cf8e1..650a3fb0 100644 --- a/include/caffe/layers/hdf5_data_layer.hpp +++ b/include/caffe/layers/hdf5_data_layer.hpp @@ -23,7 +23,7 @@ template class HDF5DataLayer : public Layer { public: explicit HDF5DataLayer(const LayerParameter& param) - : Layer(param) {} + : Layer(param), offset_() {} virtual ~HDF5DataLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); @@ -38,6 +38,9 @@ class HDF5DataLayer : public Layer { virtual inline int MinTopBlobs() const { return 1; } protected: + void Next(); + bool Skip(); + virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); virtual void Forward_gpu(const vector*>& bottom, @@ -55,6 +58,7 @@ class HDF5DataLayer : public Layer { std::vector > > hdf_blobs_; std::vector data_permutation_; std::vector file_permutation_; + uint64_t offset_; }; } // namespace caffe diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp index 66dbbdf1..529b09cb 100644 --- a/include/caffe/layers/python_layer.hpp +++ b/include/caffe/layers/python_layer.hpp @@ -21,8 +21,8 @@ class PythonLayer : public Layer { // Disallow PythonLayer in MultiGPU training stage, due 
to GIL issues // Details: https://github.com/BVLC/caffe/issues/2936 if (this->phase_ == TRAIN && Caffe::solver_count() > 1 - && !ShareInParallel()) { - LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training"; + && !Caffe::root_solver() && !Caffe::multiprocess()) { + LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py"; } self_.attr("param_str") = bp::str( this->layer_param_.python_param().param_str()); diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 493bdf29..d3c9306e 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -23,10 +23,9 @@ namespace caffe { template class Net { public: - explicit Net(const NetParameter& param, const Net* root_net = NULL); + explicit Net(const NetParameter& param); explicit Net(const string& param_file, Phase phase, - const int level = 0, const vector* stages = NULL, - const Net* root_net = NULL); + const int level = 0, const vector* stages = NULL); virtual ~Net() {} /// @brief Initialize a network with a NetParameter. @@ -228,6 +227,31 @@ class Net { static bool StateMeetsRule(const NetState& state, const NetStateRule& rule, const string& layer_name); + // Invoked at specific points during an iteration + class Callback { + protected: + virtual void run(int layer) = 0; + + template + friend class Net; + }; + const vector& before_forward() const { return before_forward_; } + void add_before_forward(Callback* value) { + before_forward_.push_back(value); + } + const vector& after_forward() const { return after_forward_; } + void add_after_forward(Callback* value) { + after_forward_.push_back(value); + } + const vector& before_backward() const { return before_backward_; } + void add_before_backward(Callback* value) { + before_backward_.push_back(value); + } + const vector& after_backward() const { return after_backward_; } + void add_after_backward(Callback* value) { + after_backward_.push_back(value); + } + protected: // Helpers for Init. /// @brief Append a new top blob to the net. @@ -306,9 +330,13 @@ class Net { size_t memory_used_; /// Whether to compute and display debug info for the net. 
bool debug_info_; - /// The root net that actually holds the shared layers in data parallelism - const Net* const root_net_; - DISABLE_COPY_AND_ASSIGN(Net); + // Callbacks + vector before_forward_; + vector after_forward_; + vector before_backward_; + vector after_backward_; + +DISABLE_COPY_AND_ASSIGN(Net); }; diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp index 6c496c88..64bb48e6 100644 --- a/include/caffe/parallel.hpp +++ b/include/caffe/parallel.hpp @@ -1,8 +1,11 @@ #ifndef CAFFE_PARALLEL_HPP_ #define CAFFE_PARALLEL_HPP_ -#include +#ifdef USE_NCCL +#include + +#include #include #include "caffe/blob.hpp" @@ -13,6 +16,7 @@ #include "caffe/solver.hpp" #include "caffe/syncedmem.hpp" #include "caffe/util/blocking_queue.hpp" +#include "caffe/util/nccl.hpp" namespace caffe { @@ -51,7 +55,7 @@ class GPUParams : public Params { GPUParams(shared_ptr > root_solver, int device); virtual ~GPUParams(); - void configure(Solver* solver) const; + void Configure(Solver* solver) const; protected: using Params::size_; @@ -59,58 +63,55 @@ class GPUParams : public Params { using Params::diff_; }; -class DevicePair { - public: - DevicePair(int parent, int device) - : parent_(parent), - device_(device) { - } - inline int parent() { - return parent_; - } - inline int device() { - return device_; - } - - // Group GPUs in pairs, by proximity depending on machine's topology - static void compute(const vector devices, vector* pairs); - - protected: - int parent_; - int device_; -}; - -// Synchronous data parallelism using map-reduce between local GPUs. template -class P2PSync : public GPUParams, public Solver::Callback, - public InternalThread { +class NCCL : public GPUParams, + public Solver::Callback, + public Net::Callback { public: - explicit P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param); - virtual ~P2PSync(); - - inline const shared_ptr >& solver() const { - return solver_; - } - - void Run(const vector& gpus); - void Prepare(const vector& gpus, - vector > >* syncs); - inline const int initial_iter() const { return initial_iter_; } + /** + * Single process version. + */ + explicit NCCL(shared_ptr > solver); + /** + * In multi-process settings, first create a NCCL id (new_uid), then + * pass it to each process to create connected instances. + */ + NCCL(shared_ptr > solver, const string& uid); + ~NCCL(); + + boost::barrier* barrier(); + void set_barrier(boost::barrier* value); + + /** + * In single process settings, create instances without uids and + * call this to connect them. + */ + static void InitSingleProcess(vector*>* nccls); + + static string new_uid(); + + /** + * Broadcast weights from rank 0 other solvers. + */ + void Broadcast(); + + /** + * Single process multi-GPU. 
+ */ + void Run(const vector& gpus, const char* restore); protected: - void on_start(); + void Init(); + void on_start() {} + void run(int layer); // Net callback void on_gradients_ready(); - void InternalThreadEntry(); + ncclComm_t comm_; + cudaStream_t stream_; - P2PSync* parent_; - vector*> children_; - BlockingQueue*> queue_; - const int initial_iter_; - Dtype* parent_grads_; shared_ptr > solver_; - + // Should not be necessary, https://github.com/NVIDIA/nccl/issues/37 + boost::barrier* barrier_; using Params::size_; using Params::data_; using Params::diff_; @@ -118,4 +119,5 @@ class P2PSync : public GPUParams, public Solver::Callback, } // namespace caffe -#endif +#endif // USE_NCCL +#endif // header diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index eafcee32..a28d8cb8 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -6,6 +6,7 @@ #include "caffe/net.hpp" #include "caffe/solver_factory.hpp" +#include "caffe/util/benchmark.hpp" namespace caffe { @@ -40,9 +41,8 @@ typedef boost::function ActionCallback; template class Solver { public: - explicit Solver(const SolverParameter& param, - const Solver* root_solver = NULL); - explicit Solver(const string& param_file, const Solver* root_solver = NULL); + explicit Solver(const SolverParameter& param); + explicit Solver(const string& param_file); void Init(const SolverParameter& param); void InitTrainNet(); void InitTestNets(); @@ -72,7 +72,7 @@ class Solver { inline const vector > >& test_nets() { return test_nets_; } - int iter() { return iter_; } + int iter() const { return iter_; } // Invoked at specific points during an iteration class Callback { @@ -118,10 +118,6 @@ class Solver { vector losses_; Dtype smoothed_loss_; - // The root solver that holds root nets (actually containing shared layers) - // in data parallelism - const Solver* const root_solver_; - // A function that can be set by a client of the Solver to provide indication // that it wants a snapshot saved and/or to exit early. ActionCallback action_request_function_; @@ -129,31 +125,11 @@ class Solver { // True iff a request to stop early was received. bool requested_early_exit_; - DISABLE_COPY_AND_ASSIGN(Solver); -}; + // Timing information, handy to tune e.g. nbr of GPUs + Timer iteration_timer_; + float iterations_last_; -/** - * @brief Solver that only computes gradients, used as worker - * for multi-GPU training. 
- */ -template -class WorkerSolver : public Solver { - public: - explicit WorkerSolver(const SolverParameter& param, - const Solver* root_solver = NULL) - : Solver(param, root_solver) {} - - protected: - void ApplyUpdate() {} - void SnapshotSolverState(const string& model_filename) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromBinaryProto(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } - void RestoreSolverStateFromHDF5(const string& state_file) { - LOG(FATAL) << "Should not be called on worker solver."; - } + DISABLE_COPY_AND_ASSIGN(Solver); }; } // namespace caffe diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 38ee4664..a41066ba 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -44,14 +44,8 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) { */ class SyncedMemory { public: - SyncedMemory() - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), - own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false), - gpu_device_(-1) {} - explicit SyncedMemory(size_t size) - : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), - own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false), - gpu_device_(-1) {} + SyncedMemory(); + explicit SyncedMemory(size_t size); ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); @@ -68,6 +62,8 @@ class SyncedMemory { #endif private: + void check_device(); + void to_cpu(); void to_gpu(); void* cpu_ptr_; @@ -77,7 +73,7 @@ class SyncedMemory { bool own_cpu_data_; bool cpu_malloc_use_cuda_; bool own_gpu_data_; - int gpu_device_; + int device_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 6f6d3fee..51068fe2 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -185,6 +185,11 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); template void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); +#ifndef CPU_ONLY +template +void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cudaStream_t str); +#endif + template void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/include/caffe/util/nccl.hpp b/include/caffe/util/nccl.hpp new file mode 100644 index 00000000..e01fb745 --- /dev/null +++ b/include/caffe/util/nccl.hpp @@ -0,0 +1,37 @@ +#ifndef CAFFE_UTIL_NCCL_H_ +#define CAFFE_UTIL_NCCL_H_ +#ifdef USE_NCCL + +#include + +#include "caffe/common.hpp" + +#define NCCL_CHECK(condition) \ +{ \ + ncclResult_t result = condition; \ + CHECK_EQ(result, ncclSuccess) << " " \ + << ncclGetErrorString(result); \ +} + +namespace caffe { + +namespace nccl { + +template class dataType; + +template<> class dataType { + public: + static const ncclDataType_t type = ncclFloat; +}; +template<> class dataType { + public: + static const ncclDataType_t type = ncclDouble; +}; + +} // namespace nccl + +} // namespace caffe + +#endif // end USE_NCCL + +#endif // CAFFE_UTIL_NCCL_H_ diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 4a34e4c5..603e52f7 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -89,6 +89,12 @@ const Dtype* Blob::cpu_data() const { template void Blob::set_cpu_data(Dtype* data) { CHECK(data); + // Make sure CPU and GPU sizes remain equal + size_t size = count_ * sizeof(Dtype); + if (data_->size() != size) { + 
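+    // Editorial note (hedged): data_ and diff_ are re-allocated together so
+    // the two SyncedMemory objects can never disagree about size; the new
+    // zero-copy prefetch path later in this patch leans on this when it swaps
+    // whole batch buffers in through set_cpu_data()/set_gpu_data().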
data_.reset(new SyncedMemory(size)); + diff_.reset(new SyncedMemory(size)); + } data_->set_cpu_data(data); } @@ -98,6 +104,18 @@ const Dtype* Blob::gpu_data() const { return (const Dtype*)data_->gpu_data(); } +template +void Blob::set_gpu_data(Dtype* data) { + CHECK(data); + // Make sure CPU and GPU sizes remain equal + size_t size = count_ * sizeof(Dtype); + if (data_->size() != size) { + data_.reset(new SyncedMemory(size)); + diff_.reset(new SyncedMemory(size)); + } + data_->set_gpu_data(data); +} + template const Dtype* Blob::cpu_diff() const { CHECK(diff_); diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp index dee68165..4f6f9bcc 100644 --- a/src/caffe/common.cpp +++ b/src/caffe/common.cpp @@ -53,7 +53,7 @@ void GlobalInit(int* pargc, char*** pargv) { Caffe::Caffe() : random_generator_(), mode_(Caffe::CPU), - solver_count_(1), root_solver_(true) { } + solver_count_(1), solver_rank_(0), multiprocess_(false) { } Caffe::~Caffe() { } @@ -106,7 +106,8 @@ void* Caffe::RNG::generator() { Caffe::Caffe() : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(), - mode_(Caffe::CPU), solver_count_(1), root_solver_(true) { + mode_(Caffe::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { // Try to create a cublas handler, and report an error if failed (but we will // keep the program running as one might just want to run CPU code). if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp deleted file mode 100644 index 9f019bbf..00000000 --- a/src/caffe/data_reader.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include -#include -#include -#include - -#include "caffe/common.hpp" -#include "caffe/data_reader.hpp" -#include "caffe/layers/data_layer.hpp" -#include "caffe/proto/caffe.pb.h" - -namespace caffe { - -using boost::weak_ptr; - -map > DataReader::bodies_; -static boost::mutex bodies_mutex_; - -DataReader::DataReader(const LayerParameter& param) - : queue_pair_(new QueuePair( // - param.data_param().prefetch() * param.data_param().batch_size())) { - // Get or create a body - boost::mutex::scoped_lock lock(bodies_mutex_); - string key = source_key(param); - weak_ptr& weak = bodies_[key]; - body_ = weak.lock(); - if (!body_) { - body_.reset(new Body(param)); - bodies_[key] = weak_ptr(body_); - } - body_->new_queue_pairs_.push(queue_pair_); -} - -DataReader::~DataReader() { - string key = source_key(body_->param_); - body_.reset(); - boost::mutex::scoped_lock lock(bodies_mutex_); - if (bodies_[key].expired()) { - bodies_.erase(key); - } -} - -// - -DataReader::QueuePair::QueuePair(int size) { - // Initialize the free queue with requested number of datums - for (int i = 0; i < size; ++i) { - free_.push(new Datum()); - } -} - -DataReader::QueuePair::~QueuePair() { - Datum* datum; - while (free_.try_pop(&datum)) { - delete datum; - } - while (full_.try_pop(&datum)) { - delete datum; - } -} - -// - -DataReader::Body::Body(const LayerParameter& param) - : param_(param), - new_queue_pairs_() { - StartInternalThread(); -} - -DataReader::Body::~Body() { - StopInternalThread(); -} - -void DataReader::Body::InternalThreadEntry() { - shared_ptr db(db::GetDB(param_.data_param().backend())); - db->Open(param_.data_param().source(), db::READ); - shared_ptr cursor(db->NewCursor()); - vector > qps; - try { - int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1; - - // To ensure deterministic runs, only start running once all solvers - // are ready. 
But solvers need to peek on one item during initialization, - // so read one item, then wait for the next solver. - for (int i = 0; i < solver_count; ++i) { - shared_ptr qp(new_queue_pairs_.pop()); - read_one(cursor.get(), qp.get()); - qps.push_back(qp); - } - // Main loop - while (!must_stop()) { - for (int i = 0; i < solver_count; ++i) { - read_one(cursor.get(), qps[i].get()); - } - // Check no additional readers have been created. This can happen if - // more than one net is trained at a time per process, whether single - // or multi solver. It might also happen if two data layers have same - // name and same source. - CHECK_EQ(new_queue_pairs_.size(), 0); - } - } catch (boost::thread_interrupted&) { - // Interrupted exception is expected on shutdown - } -} - -void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) { - Datum* datum = qp->free_.pop(); - // TODO deserialize in-place instead of copy? - datum->ParseFromString(cursor->value()); - qp->full_.push(datum); - - // go to the next iter - cursor->Next(); - if (!cursor->valid()) { - DLOG(INFO) << "Restarting data prefetching from start."; - cursor->SeekToFirst(); - } -} - -} // namespace caffe diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp index 104884e0..11de4979 100644 --- a/src/caffe/internal_thread.cpp +++ b/src/caffe/internal_thread.cpp @@ -28,25 +28,27 @@ void InternalThread::StartInternalThread() { Caffe::Brew mode = Caffe::mode(); int rand_seed = caffe_rng_rand(); int solver_count = Caffe::solver_count(); - bool root_solver = Caffe::root_solver(); + int solver_rank = Caffe::solver_rank(); + bool multiprocess = Caffe::multiprocess(); try { thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode, - rand_seed, solver_count, root_solver)); + rand_seed, solver_count, solver_rank, multiprocess)); } catch (std::exception& e) { LOG(FATAL) << "Thread exception: " << e.what(); } } void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed, - int solver_count, bool root_solver) { + int solver_count, int solver_rank, bool multiprocess) { #ifndef CPU_ONLY CUDA_CHECK(cudaSetDevice(device)); #endif Caffe::set_mode(mode); Caffe::set_random_seed(rand_seed); Caffe::set_solver_count(solver_count); - Caffe::set_root_solver(root_solver); + Caffe::set_solver_rank(solver_rank); + Caffe::set_multiprocess(multiprocess); InternalThreadEntry(); } diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index 3b912898..684ae88b 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -1,27 +1,7 @@ -#include #include "caffe/layer.hpp" namespace caffe { -template -void Layer::InitMutex() { - forward_mutex_.reset(new boost::mutex()); -} - -template -void Layer::Lock() { - if (IsShared()) { - forward_mutex_->lock(); - } -} - -template -void Layer::Unlock() { - if (IsShared()) { - forward_mutex_->unlock(); - } -} - INSTANTIATE_CLASS(Layer); } // namespace caffe diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 989319f1..9414f6f9 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -36,9 +36,12 @@ template BasePrefetchingDataLayer::BasePrefetchingDataLayer( const LayerParameter& param) : BaseDataLayer(param), - prefetch_free_(), prefetch_full_() { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_free_.push(&prefetch_[i]); + prefetch_(param.has_data_param() ? 
+ param.data_param().prefetch() : PREFETCH_COUNT), + prefetch_free_(), prefetch_full_(), prefetch_current_() { + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i].reset(new Batch()); + prefetch_free_.push(prefetch_[i].get()); } } @@ -46,22 +49,23 @@ template void BasePrefetchingDataLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { BaseDataLayer::LayerSetUp(bottom, top); + // Before starting the prefetch thread, we make cpu_data and gpu_data // calls so that the prefetch thread does not accidentally make simultaneous // cudaMalloc calls when the main thread is running. In some GPUs this // seems to cause failures if we do not so. - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_cpu_data(); + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i]->data_.mutable_cpu_data(); if (this->output_labels_) { - prefetch_[i].label_.mutable_cpu_data(); + prefetch_[i]->label_.mutable_cpu_data(); } } #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { - for (int i = 0; i < PREFETCH_COUNT; ++i) { - prefetch_[i].data_.mutable_gpu_data(); + for (int i = 0; i < prefetch_.size(); ++i) { + prefetch_[i]->data_.mutable_gpu_data(); if (this->output_labels_) { - prefetch_[i].label_.mutable_gpu_data(); + prefetch_[i]->label_.mutable_gpu_data(); } } } @@ -88,6 +92,9 @@ void BasePrefetchingDataLayer::InternalThreadEntry() { #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { batch->data_.data().get()->async_gpu_push(stream); + if (this->output_labels_) { + batch->label_.data().get()->async_gpu_push(stream); + } CUDA_CHECK(cudaStreamSynchronize(stream)); } #endif @@ -106,22 +113,18 @@ void BasePrefetchingDataLayer::InternalThreadEntry() { template void BasePrefetchingDataLayer::Forward_cpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + if (prefetch_current_) { + prefetch_free_.push(prefetch_current_); + } + prefetch_current_ = prefetch_full_.pop("Waiting for data"); // Reshape to loaded data. - top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.cpu_data(), - top[0]->mutable_cpu_data()); - DLOG(INFO) << "Prefetch copied"; + top[0]->ReshapeLike(prefetch_current_->data_); + top[0]->set_cpu_data(prefetch_current_->data_.mutable_cpu_data()); if (this->output_labels_) { // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.cpu_data(), - top[1]->mutable_cpu_data()); + top[1]->ReshapeLike(prefetch_current_->label_); + top[1]->set_cpu_data(prefetch_current_->label_.mutable_cpu_data()); } - - prefetch_free_.push(batch); } #ifdef CPU_ONLY diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu index 4056d36a..64c621a7 100644 --- a/src/caffe/layers/base_data_layer.cu +++ b/src/caffe/layers/base_data_layer.cu @@ -7,23 +7,18 @@ namespace caffe { template void BasePrefetchingDataLayer::Forward_gpu( const vector*>& bottom, const vector*>& top) { - Batch* batch = prefetch_full_.pop("Data layer prefetch queue empty"); + if (prefetch_current_) { + prefetch_free_.push(prefetch_current_); + } + prefetch_current_ = prefetch_full_.pop("Waiting for data"); // Reshape to loaded data. 
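  // Editorial note (hedged): the rewrite keeps exactly one batch checked out
  // per layer — return the previous batch to the free queue, pop a fresh one,
  // then share its memory into top via set_gpu_data() rather than copying.
  // That is also why the old trailing cudaStreamSynchronize() disappears: no
  // copy is queued at all.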
- top[0]->ReshapeLike(batch->data_); - // Copy the data - caffe_copy(batch->data_.count(), batch->data_.gpu_data(), - top[0]->mutable_gpu_data()); + top[0]->ReshapeLike(prefetch_current_->data_); + top[0]->set_gpu_data(prefetch_current_->data_.mutable_gpu_data()); if (this->output_labels_) { // Reshape to loaded labels. - top[1]->ReshapeLike(batch->label_); - // Copy the labels. - caffe_copy(batch->label_.count(), batch->label_.gpu_data(), - top[1]->mutable_gpu_data()); + top[1]->ReshapeLike(prefetch_current_->label_); + top[1]->set_gpu_data(prefetch_current_->label_.mutable_gpu_data()); } - // Ensure the copy is synchronous wrt the host, so that the next batch isn't - // copied in meanwhile. - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - prefetch_free_.push(batch); } INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer); diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp index 66e6301f..0f1296bb 100644 --- a/src/caffe/layers/data_layer.cpp +++ b/src/caffe/layers/data_layer.cpp @@ -14,7 +14,10 @@ namespace caffe { template DataLayer::DataLayer(const LayerParameter& param) : BasePrefetchingDataLayer(param), - reader_(param) { + offset_() { + db_.reset(db::GetDB(param.data_param().backend())); + db_->Open(param.data_param().source(), db::READ); + cursor_.reset(db_->NewCursor()); } template @@ -27,7 +30,8 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.data_param().batch_size(); // Read a data point, and use it to initialize the top blob. - Datum& datum = *(reader_.full().peek()); + Datum datum; + datum.ParseFromString(cursor_->value()); // Use data_transformer to infer the expected blob shape from datum. vector top_shape = this->data_transformer_->InferBlobShape(datum); @@ -35,22 +39,44 @@ void DataLayer::DataLayerSetUp(const vector*>& bottom, // Reshape top[0] and prefetch_data according to the batch_size. 
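  // (Editorial note, hedged: the datum parsed above comes straight off the
  // fresh cursor and only sizes the blobs — the cursor is first advanced in
  // load_batch(), so setup consumes no record.)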
top_shape[0] = batch_size; top[0]->Reshape(top_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->data_.Reshape(top_shape); } - LOG(INFO) << "output data size: " << top[0]->num() << "," + LOG_IF(INFO, Caffe::root_solver()) + << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width(); // label if (this->output_labels_) { vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } } } +template +bool DataLayer::Skip() { + int size = Caffe::solver_count(); + int rank = Caffe::solver_rank(); + bool keep = (offset_ % size) == rank || + // In test mode, only rank 0 runs, so avoid skipping + this->layer_param_.phase() == TEST; + return !keep; +} + +template +void DataLayer::Next() { + cursor_->Next(); + if (!cursor_->valid()) { + LOG_IF(INFO, Caffe::root_solver()) + << "Restarting data prefetching from start."; + cursor_->SeekToFirst(); + } + offset_++; +} + // This function is called on prefetch thread template void DataLayer::load_batch(Batch* batch) { @@ -61,41 +87,41 @@ void DataLayer::load_batch(Batch* batch) { CPUTimer timer; CHECK(batch->data_.count()); CHECK(this->transformed_data_.count()); - - // Reshape according to the first datum of each batch - // on single input batches allows for inputs of varying dimension. const int batch_size = this->layer_param_.data_param().batch_size(); - Datum& datum = *(reader_.full().peek()); - // Use data_transformer to infer the expected blob shape from datum. - vector top_shape = this->data_transformer_->InferBlobShape(datum); - this->transformed_data_.Reshape(top_shape); - // Reshape batch according to the batch_size. - top_shape[0] = batch_size; - batch->data_.Reshape(top_shape); - - Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = NULL; // suppress warnings about uninitialized variables - if (this->output_labels_) { - top_label = batch->label_.mutable_cpu_data(); - } + Datum datum; for (int item_id = 0; item_id < batch_size; ++item_id) { timer.Start(); - // get a datum - Datum& datum = *(reader_.full().pop("Waiting for data")); + while (Skip()) { + Next(); + } + datum.ParseFromString(cursor_->value()); read_time += timer.MicroSeconds(); - timer.Start(); + + if (item_id == 0) { + // Reshape according to the first datum of each batch + // on single input batches allows for inputs of varying dimension. + // Use data_transformer to infer the expected blob shape from datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape batch according to the batch_size. + top_shape[0] = batch_size; + batch->data_.Reshape(top_shape); + } + // Apply data transformations (mirror, scale, crop...) + timer.Start(); int offset = batch->data_.offset(item_id); + Dtype* top_data = batch->data_.mutable_cpu_data(); this->transformed_data_.set_cpu_data(top_data + offset); this->data_transformer_->Transform(datum, &(this->transformed_data_)); // Copy label. 
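    // Editorial sketch (hedged): Skip()/Next() above shard the single DB
    // cursor round-robin by rank. With solver_count == 3 the pattern is
    //
    //   offset    :  0   1   2   3   4   5   6   7   8  ...
    //   offset % 3:  0   1   2   0   1   2   0   1   2  ...
    //   kept by   : r0  r1  r2  r0  r1  r2  r0  r1  r2  ...
    //
    // Every rank owns its own cursor and offset_ counter and walks the whole
    // DB, but only parses records where offset_ % solver_count == rank, so
    // ranks see disjoint data without the deleted DataReader's shared queues.
    // (In TEST phase Skip() keeps everything, since only rank 0 tests.)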
if (this->output_labels_) { + Dtype* top_label = batch->label_.mutable_cpu_data(); top_label[item_id] = datum.label(); } trans_time += timer.MicroSeconds(); - - reader_.free().push(const_cast(&datum)); + Next(); } timer.Stop(); batch_timer.Stop(); diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index c957451a..b9a071ce 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -124,28 +124,46 @@ void HDF5DataLayer::LayerSetUp(const vector*>& bottom, } } +template +bool HDF5DataLayer::Skip() { + int size = Caffe::solver_count(); + int rank = Caffe::solver_rank(); + bool keep = (offset_ % size) == rank || + // In test mode, only rank 0 runs, so avoid skipping + this->layer_param_.phase() == TEST; + return !keep; +} + +template +void HDF5DataLayer::Next() { + if (++current_row_ == hdf_blobs_[0]->shape(0)) { + if (num_files_ > 1) { + ++current_file_; + if (current_file_ == num_files_) { + current_file_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) { + std::random_shuffle(file_permutation_.begin(), + file_permutation_.end()); + } + DLOG(INFO) << "Looping around to first file."; + } + LoadHDF5FileData( + hdf_filenames_[file_permutation_[current_file_]].c_str()); + } + current_row_ = 0; + if (this->layer_param_.hdf5_data_param().shuffle()) + std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + } + offset_++; +} + template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - ++current_file_; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + for (int i = 0; i < batch_size; ++i) { + while (Skip()) { + Next(); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); @@ -153,6 +171,7 @@ void HDF5DataLayer::Forward_cpu(const vector*>& bottom, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]); } + Next(); } } diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 595d2230..33eebd41 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -17,24 +17,9 @@ template void HDF5DataLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { const int batch_size = this->layer_param_.hdf5_data_param().batch_size(); - for (int i = 0; i < batch_size; ++i, ++current_row_) { - if (current_row_ == hdf_blobs_[0]->shape(0)) { - if (num_files_ > 1) { - current_file_ += 1; - if (current_file_ == num_files_) { - current_file_ = 0; - if (this->layer_param_.hdf5_data_param().shuffle()) { - std::random_shuffle(file_permutation_.begin(), - file_permutation_.end()); - } - DLOG(INFO) << "Looping around to first file."; - } - LoadHDF5FileData( - hdf_filenames_[file_permutation_[current_file_]].c_str()); - } - current_row_ = 0; - if 
(this->layer_param_.hdf5_data_param().shuffle()) - std::random_shuffle(data_permutation_.begin(), data_permutation_.end()); + for (int i = 0; i < batch_size; ++i) { + while (Skip()) { + Next(); } for (int j = 0; j < this->layer_param_.top_size(); ++j) { int data_dim = top[j]->count() / top[j]->shape(0); @@ -42,6 +27,7 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_] * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]); } + Next(); } } diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp index 7ee7dc40..ec0fc5b0 100644 --- a/src/caffe/layers/image_data_layer.cpp +++ b/src/caffe/layers/image_data_layer.cpp @@ -54,6 +54,11 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, const unsigned int prefetch_rng_seed = caffe_rng_rand(); prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); ShuffleImages(); + } else { + if (this->phase_ == TRAIN && Caffe::solver_rank() > 0 && + this->layer_param_.image_data_param().rand_skip() == 0) { + LOG(WARNING) << "Shuffling or skipping recommended for multi-GPU"; + } } LOG(INFO) << "A total of " << lines_.size() << " images."; @@ -77,8 +82,8 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, const int batch_size = this->layer_param_.image_data_param().batch_size(); CHECK_GT(batch_size, 0) << "Positive batch size required"; top_shape[0] = batch_size; - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].data_.Reshape(top_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->data_.Reshape(top_shape); } top[0]->Reshape(top_shape); @@ -88,8 +93,8 @@ void ImageDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } } diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp index 103dd4b6..1bf3760e 100644 --- a/src/caffe/layers/window_data_layer.cpp +++ b/src/caffe/layers/window_data_layer.cpp @@ -173,8 +173,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, CHECK_GT(crop_size, 0); const int batch_size = this->layer_param_.window_data_param().batch_size(); top[0]->Reshape(batch_size, channels, crop_size, crop_size); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) - this->prefetch_[i].data_.Reshape( + for (int i = 0; i < this->prefetch_.size(); ++i) + this->prefetch_[i]->data_.Reshape( batch_size, channels, crop_size, crop_size); LOG(INFO) << "output data size: " << top[0]->num() << "," @@ -183,8 +183,8 @@ void WindowDataLayer::DataLayerSetUp(const vector*>& bottom, // label vector label_shape(1, batch_size); top[1]->Reshape(label_shape); - for (int i = 0; i < this->PREFETCH_COUNT; ++i) { - this->prefetch_[i].label_.Reshape(label_shape); + for (int i = 0; i < this->prefetch_.size(); ++i) { + this->prefetch_[i]->label_.Reshape(label_shape); } // data mean diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 644cb7e9..aa9e8f2f 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -22,16 +22,13 @@ namespace caffe { template -Net::Net(const NetParameter& param, const Net* root_net) - : root_net_(root_net) { +Net::Net(const NetParameter& param) { Init(param); } template Net::Net(const string& param_file, Phase phase, - const int level, const vector* stages, - const 
Net* root_net) - : root_net_(root_net) { + const int level, const vector* stages) { NetParameter param; ReadNetParamsFromTextFileOrDie(param_file, ¶m); // Set phase, stages and level @@ -47,8 +44,6 @@ Net::Net(const string& param_file, Phase phase, template void Net::Init(const NetParameter& in_param) { - CHECK(Caffe::root_solver() || root_net_) - << "root_net_ needs to be set for all non-root solvers"; // Set phase from the state. phase_ = in_param.state().phase(); // Filter layers based on their include/exclude rules and @@ -74,9 +69,6 @@ void Net::Init(const NetParameter& in_param) { top_id_vecs_.resize(param.layer_size()); bottom_need_backward_.resize(param.layer_size()); for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) { - // For non-root solvers, whether this layer is shared from root_net_. - bool share_from_root = !Caffe::root_solver() - && root_net_->layers_[layer_id]->ShareInParallel(); // Inherit phase from net if unset. if (!param.layer(layer_id).has_phase()) { param.mutable_layer(layer_id)->set_phase(phase_); @@ -89,13 +81,7 @@ void Net::Init(const NetParameter& in_param) { << "propagate_down param must be specified " << "either 0 or bottom_size times "; } - if (share_from_root) { - LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net"; - layers_.push_back(root_net_->layers_[layer_id]); - layers_[layer_id]->SetShared(true); - } else { - layers_.push_back(LayerRegistry::CreateLayer(layer_param)); - } + layers_.push_back(LayerRegistry::CreateLayer(layer_param)); layer_names_.push_back(layer_param.name()); LOG_IF(INFO, Caffe::root_solver()) << "Creating Layer " << layer_param.name(); @@ -134,19 +120,7 @@ void Net::Init(const NetParameter& in_param) { } } // After this layer is connected, set it up. - if (share_from_root) { - // Set up size of top blobs using root_net_ - const vector*>& base_top = root_net_->top_vecs_[layer_id]; - const vector*>& this_top = this->top_vecs_[layer_id]; - for (int top_id = 0; top_id < base_top.size(); ++top_id) { - this_top[top_id]->ReshapeLike(*base_top[top_id]); - LOG(INFO) << "Created top blob " << top_id << " (shape: " - << this_top[top_id]->shape_string() << ") for shared layer " - << layer_param.name(); - } - } else { - layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); - } + layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]); LOG_IF(INFO, Caffe::root_solver()) << "Setting up " << layer_names_[layer_id]; for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { @@ -546,10 +520,15 @@ Dtype Net::ForwardFromTo(int start, int end) { CHECK_LT(end, layers_.size()); Dtype loss = 0; for (int i = start; i <= end; ++i) { - // LOG(ERROR) << "Forwarding " << layer_names_[i]; + for (int c = 0; c < before_forward_.size(); ++c) { + before_forward_[c]->run(i); + } Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } + for (int c = 0; c < after_forward_.size(); ++c) { + after_forward_[c]->run(i); + } } return loss; } @@ -591,11 +570,17 @@ void Net::BackwardFromTo(int start, int end) { CHECK_GE(end, 0); CHECK_LT(start, layers_.size()); for (int i = start; i >= end; --i) { + for (int c = 0; c < before_backward_.size(); ++c) { + before_backward_[c]->run(i); + } if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]); if (debug_info_) { BackwardDebugInfo(i); } } + for (int c = 0; c < after_backward_.size(); ++c) { + after_backward_[c]->run(i); + } } 
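  // Editorial sketch (hedged): a minimal client of these new per-layer hooks —
  // PerLayerLogger is hypothetical, only Net<Dtype>::Callback and the
  // add_after_backward() registration come from this patch:
  //
  //   class PerLayerLogger : public Net<float>::Callback {
  //     virtual void run(int layer) {
  //       LOG(INFO) << "backward finished for layer " << layer;
  //     }
  //   };
  //   PerLayerLogger logger;
  //   net.add_after_backward(&logger);
  //
  // run() gets the layer index on every iteration, which is exactly how
  // NCCL<Dtype> (below) overlaps per-layer gradient reduction with the rest
  // of the backward pass.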
} diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp index 5bc41c6a..d9433917 100644 --- a/src/caffe/parallel.cpp +++ b/src/caffe/parallel.cpp @@ -1,16 +1,15 @@ -#ifndef CPU_ONLY +#ifdef USE_NCCL + #include -#endif #include #include - #include #include #include -#include "boost/thread.hpp" #include "caffe/caffe.hpp" #include "caffe/parallel.hpp" +#include "caffe/sgd_solvers.hpp" namespace caffe { @@ -68,15 +67,14 @@ static size_t total_size(const vector*>& params) { template Params::Params(shared_ptr > root_solver) - : size_(total_size(root_solver->net()->learnable_params())), - data_(), - diff_() { + : size_(total_size(root_solver->net()->learnable_params())), + data_(), + diff_() { } template GPUParams::GPUParams(shared_ptr > root_solver, int device) - : Params(root_solver) { -#ifndef CPU_ONLY + : Params(root_solver) { int initial_device; CUDA_CHECK(cudaGetDevice(&initial_device)); @@ -86,358 +84,288 @@ GPUParams::GPUParams(shared_ptr > root_solver, int device) // Copy blob values const vector*>& net = - root_solver->net()->learnable_params(); + root_solver->net()->learnable_params(); apply_buffers(net, data_, size_, copy); CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype))); caffe_gpu_set(size_, Dtype(0), diff_); CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif } template GPUParams::~GPUParams() { -#ifndef CPU_ONLY CUDA_CHECK(cudaFree(data_)); CUDA_CHECK(cudaFree(diff_)); -#endif } template -void GPUParams::configure(Solver* solver) const { +void GPUParams::Configure(Solver* solver) const { const vector*>& net = - solver->net()->learnable_params(); + solver->net()->learnable_params(); apply_buffers(net, data_, size_, replace_gpu); apply_buffers(net, diff_, size_, replace_gpu_diff); } -void DevicePair::compute(const vector devices, vector* pairs) { -#ifndef CPU_ONLY - vector remaining(devices); - - // Depth for reduction tree - int remaining_depth = static_cast(ceil(log2(remaining.size()))); - - // Group GPUs by board - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - cudaDeviceProp a, b; - CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i])); - CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j])); - if (a.isMultiGpuBoard && b.isMultiGpuBoard) { - if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - } - ostringstream s; - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? ", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str(); - - // Group by P2P accessibility - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - for (int j = i + 1; j < remaining.size(); ++j) { - int access; - CUDA_CHECK( - cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j])); - if (access) { - pairs->push_back(DevicePair(remaining[i], remaining[j])); - DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j]; - remaining.erase(remaining.begin() + j); - break; - } - } - } - } - s.str(""); - for (int i = 0; i < remaining.size(); ++i) { - s << (i ? 
", " : "") << remaining[i]; - } - DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str(); - - // Group remaining - remaining_depth = ceil(log2(remaining.size())); - for (int d = 0; d < remaining_depth; ++d) { - for (int i = 0; i < remaining.size(); ++i) { - pairs->push_back(DevicePair(remaining[i], remaining[i + 1])); - DLOG(INFO) << "Remaining pair: " << remaining[i] << ":" - << remaining[i + 1]; - remaining.erase(remaining.begin() + i + 1); - } - } +static int getDevice() { + int device = 0; + CUDA_CHECK(cudaGetDevice(&device)); + return device; +} - // Should only be the parent node remaining - CHECK_EQ(remaining.size(), 1); +template +NCCL::NCCL(shared_ptr > solver) + : GPUParams(solver, getDevice()), + comm_(), solver_(solver), barrier_() { + this->Configure(solver.get()); + Init(); +} - pairs->insert(pairs->begin(), DevicePair(-1, remaining[0])); +template +NCCL::NCCL(shared_ptr > solver, const string& uid) + : GPUParams(solver, getDevice()), + solver_(solver), barrier_() { + this->Configure(solver.get()); + Caffe::set_multiprocess(true); + ncclUniqueId nccl_uid; + memcpy(&nccl_uid, &uid[0], NCCL_UNIQUE_ID_BYTES); // NOLINT(caffe/alt_fn) + NCCL_CHECK(ncclCommInitRank(&comm_, + Caffe::solver_count(), + nccl_uid, + Caffe::solver_rank())); + Init(); +} - CHECK(pairs->size() == devices.size()); - for (int i = 0; i < pairs->size(); ++i) { - CHECK((*pairs)[i].parent() != (*pairs)[i].device()); - for (int j = i + 1; j < pairs->size(); ++j) { - CHECK((*pairs)[i].device() != (*pairs)[j].device()); - } +template +void NCCL::Init() { + if (solver_->param().layer_wise_reduce()) { + CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); } -#else - NO_GPU; -#endif } -// - template -P2PSync::P2PSync(shared_ptr > root_solver, - P2PSync* parent, const SolverParameter& param) - : GPUParams(root_solver, param.device_id()), - parent_(parent), - children_(), - queue_(), - initial_iter_(root_solver->iter()), - solver_() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = param.device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent == NULL) { - solver_ = root_solver; - } else { - Caffe::set_root_solver(false); - solver_.reset(new WorkerSolver(param, root_solver.get())); - Caffe::set_root_solver(true); +NCCL::~NCCL() { + if (solver_->param().layer_wise_reduce()) { + CUDA_CHECK(cudaStreamDestroy(stream_)); } - this->configure(solver_.get()); - solver_->add_callback(this); - - if (parent) { - // Enable p2p access between devices - const int peer = parent->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0)); - } else { - LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer; - } - // Allocate receiving buffer on parent - CUDA_CHECK(cudaSetDevice(peer)); - CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype))); - CUDA_CHECK(cudaSetDevice(self)); + if (comm_) { + ncclCommDestroy(comm_); } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#else - NO_GPU; -#endif } template -P2PSync::~P2PSync() { -#ifndef CPU_ONLY - int initial_device; - CUDA_CHECK(cudaGetDevice(&initial_device)); - const int self = solver_->param().device_id(); - CUDA_CHECK(cudaSetDevice(self)); - - if (parent_) { - CUDA_CHECK(cudaFree(parent_grads_)); - const int peer = parent_->solver_->param().device_id(); - int access; - CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer)); - if (access) { - 
CUDA_CHECK(cudaDeviceDisablePeerAccess(peer)); - } - } - - CUDA_CHECK(cudaSetDevice(initial_device)); -#endif +boost::barrier* NCCL::barrier() { + return barrier_; +} +template +void NCCL::set_barrier(boost::barrier* value) { + barrier_ = value; } template -void P2PSync::InternalThreadEntry() { - Caffe::SetDevice(solver_->param().device_id()); - CHECK(Caffe::root_solver()); - Caffe::set_root_solver(false); - // See if there is a defined seed and reset random state if so - if (solver_->param().random_seed() >= 0) { - // Fetch random seed and modulate by device ID to make sure - // everyone doesn't have the same seed. We seem to have some - // solver instability if we have everyone with the same seed - Caffe::set_random_seed( - solver_->param().random_seed() + solver_->param().device_id()); +void NCCL::InitSingleProcess(vector*>* nccls) { + ncclComm_t* comms = new ncclComm_t[nccls->size()]; + int* gpu_list = new int[nccls->size()]; + for (int i = 0; i < nccls->size(); ++i) { + gpu_list[i] = (*nccls)[i]->solver_->param().device_id(); + } + NCCL_CHECK(ncclCommInitAll(comms, static_cast(nccls->size()), gpu_list)); + for (int i = 0; i < nccls->size(); ++i) { + (*nccls)[i]->comm_ = comms[i]; } - solver_->Step(solver_->param().max_iter() - initial_iter_); } template -void P2PSync::on_start() { -#ifndef CPU_ONLY -#ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); -#else -// CHECK(false); -#endif +string NCCL::new_uid() { + string uid; + uid.resize(NCCL_UNIQUE_ID_BYTES); + ncclUniqueId nccl_uid; + NCCL_CHECK(ncclGetUniqueId(&nccl_uid)); + memcpy(&uid[0], &nccl_uid, NCCL_UNIQUE_ID_BYTES); // NOLINT(caffe/alt_fn) + return uid; +} - // Wait for update from parent - if (parent_) { - P2PSync *parent = queue_.pop(); - CHECK(parent == parent_); +template +void NCCL::Broadcast() { + if (barrier_) { // NULL in multi process case + barrier_->wait(); } - - // Update children - for (int i = children_.size() - 1; i >= 0; i--) { - Dtype* src = data_; - Dtype* dst = children_[i]->data_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == children_[i]->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - children_[i]->queue_.push(this); + NCCL_CHECK(ncclBcast(data_, static_cast(size_), + nccl::dataType::type, 0, + comm_, cudaStreamDefault)); + if (barrier_) { + barrier_->wait(); } -#endif } template -void P2PSync::on_gradients_ready() { -#ifndef CPU_ONLY +void NCCL::run(int layer) { + CHECK(solver_->param().layer_wise_reduce()); + vector > >& blobs = + solver_->net()->layers()[layer]->blobs(); #ifdef DEBUG - int device; - CUDA_CHECK(cudaGetDevice(&device)); - CHECK(device == solver_->param().device_id()); + // Assert blobs are contiguous to reduce in one step (e.g. bias often small) + for (int i = 1; i < blobs.size(); ++i) { + CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(), + blobs[i + 0]->gpu_diff()); + } #endif + if (blobs.size() > 0) { + // Make sure default stream is done computing gradients. Could be + // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid + // blocking the default stream, but it's actually slower. 
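+    // Editorial note (hedged): the event-based alternative mentioned above
+    // would look roughly like
+    //   cudaEvent_t e;
+    //   CUDA_CHECK(cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
+    //   CUDA_CHECK(cudaEventRecord(e, cudaStreamDefault));
+    //   CUDA_CHECK(cudaStreamWaitEvent(stream_, e, 0));
+    // ordering stream_ after the gradient kernels without stalling the host;
+    // the plain synchronize below was reportedly faster in practice.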
+ CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - // Sum children gradients as they appear in the queue - for (int i = 0; i < children_.size(); ++i) { - P2PSync *child = queue_.pop(); - Dtype* src = child->parent_grads_; - Dtype* dst = diff_; - -#ifdef DEBUG - bool ok = false; - for (int j = 0; j < children_.size(); ++j) { - if (child == children_[j]) { - ok = true; - } + // Reduce asynchronously + int size = 0; + for (int i = 0; i < blobs.size(); ++i) { + size += blobs[i]->count(); } - CHECK(ok); - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == device); -#endif - - caffe_gpu_add(size_, src, dst, dst); + if (barrier_) { // NULL in multi process case + barrier_->wait(); + } + NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(), + blobs[0]->mutable_gpu_diff(), + size, + nccl::dataType::type, + ncclSum, comm_, stream_)); + caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(), + blobs[0]->mutable_gpu_diff(), stream_); } +} - // Send gradients to parent - if (parent_) { - Dtype* src = diff_; - Dtype* dst = parent_grads_; - -#ifdef DEBUG - cudaPointerAttributes attributes; - CUDA_CHECK(cudaPointerGetAttributes(&attributes, src)); - CHECK(attributes.device == device); - CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst)); - CHECK(attributes.device == parent_->solver_->param().device_id()); -#endif - - CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), // - cudaMemcpyDeviceToDevice, cudaStreamDefault)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault)); - parent_->queue_.push(this); +template +void NCCL::on_gradients_ready() { + if (solver_->param().layer_wise_reduce()) { + CHECK_EQ(solver_->net()->params().size(), + solver_->net()->learnable_params().size()) + << "Layer-wise reduce is not supported for nets with shared weights."; + + // Make sure reduction is done before applying gradients + CUDA_CHECK(cudaStreamSynchronize(stream_)); } else { - // Loss functions divide gradients by the batch size, so to compensate - // for split batch, the root solver divides by number of solvers. - caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_); + if (barrier_) { // NULL in multi process case + barrier_->wait(); + } + NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast(size_), + nccl::dataType::type, ncclSum, comm_, + cudaStreamDefault)); + caffe_gpu_scal(static_cast(size_), + (Dtype) 1.0 / Caffe::solver_count(), diff_); } -#endif } template -void P2PSync::Prepare(const vector& gpus, - vector > >* syncs) { - // Pair devices for map-reduce synchronization - vector pairs; - DevicePair::compute(gpus, &pairs); - ostringstream s; - for (int i = 1; i < pairs.size(); ++i) { - s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device(); +class Worker : public InternalThread { + public: + explicit Worker(shared_ptr > rank0, int device, + boost::barrier* barrier, vector*>* nccls, + const char* restore) + : rank0_(rank0), device_(device), barrier_(barrier), + nccls_(nccls), restore_(restore) { } - LOG(INFO)<< "GPUs pairs " << s.str(); - - SolverParameter param(solver_->param()); - - // Build the GPU tree by finding the parent for each solver - for (int attempts = 0; attempts < pairs.size(); ++attempts) { - for (int i = 1; i < pairs.size(); ++i) { - if (!syncs->at(i).get()) { - P2PSync* parent = NULL; - for (int j = 0; j < syncs->size(); ++j) { - P2PSync* sync = j == 0 ? 
this : syncs->at(j).get(); - if (sync) { - const SolverParameter& p = sync->solver()->param(); - if (p.device_id() == pairs[i].parent()) { - parent = sync; - } - } - } - if (parent) { - param.set_device_id(pairs[i].device()); - syncs->at(i).reset(new P2PSync(solver_, parent, param)); - parent->children_.push_back((P2PSync*) syncs->at(i).get()); - } + virtual ~Worker() {} + + protected: + void InternalThreadEntry() { + // Create solver and install callbacks + SolverParameter param(rank0_->param()); + param.set_device_id(device_); +#ifdef DEBUG + int device; + CUDA_CHECK(cudaGetDevice(&device)); + CHECK_EQ(device, device_); +#endif + param.set_type(rank0_->type()); + shared_ptr > s(SolverRegistry::CreateSolver(param)); + CHECK_EQ(s->type(), rank0_->type()); + if (restore_) { + // Could not make NCCL broadcast solver state, it seems to crash + // if called in a tight loop, regardless of barriers etc. so + // restore all solvers from file. + s->Restore(restore_); + } + NCCL nccl(s); + nccl.set_barrier(barrier_); + s->add_callback(&nccl); + if (s->param().layer_wise_reduce()) { + s->net()->add_after_backward(&nccl); + } + (*nccls_)[Caffe::solver_rank()] = &nccl; + // Wait for other threads + barrier_->wait(); + // Wait for NCCL init + barrier_->wait(); + // Broadcast rank 0 state + nccl.Broadcast(); + // Solve + s->Step(param.max_iter() - s->iter()); + barrier_->wait(); +#ifdef DEBUG + // Check all solvers have same state + SGDSolver* sa = static_cast*>(rank0_.get()); + SGDSolver* sb = static_cast*>(s.get()); + for (int h = 0; h < sa->history().size(); ++h) { + CUDA_CHECK(cudaSetDevice(sa->param().device_id())); + const Dtype* a = sa->history()[h]->cpu_data(); + CUDA_CHECK(cudaSetDevice(sb->param().device_id())); + const Dtype* b = sb->history()[h]->cpu_data(); + for (int v = 0; v < sa->history()[h]->count(); ++v) { + CHECK_DOUBLE_EQ(a[v], b[v]); } } +#endif } -} - -template -void P2PSync::Run(const vector& gpus) { - vector > > syncs(gpus.size()); - Prepare(gpus, &syncs); - LOG(INFO)<< "Starting Optimization"; + shared_ptr > rank0_; + int device_; + boost::barrier* barrier_; + vector*>* nccls_; + const char* restore_; +}; - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StartInternalThread(); +template +void NCCL::Run(const vector& gpus, const char* restore) { + boost::barrier barrier(static_cast(gpus.size())); + vector*> nccls(gpus.size()); + // Create workers + vector > > workers(gpus.size()); + for (int i = 1; i < gpus.size(); ++i) { + CUDA_CHECK(cudaSetDevice(gpus[i])); + Caffe::set_solver_rank(i); + Worker* w = new Worker(solver_, gpus[i], &barrier, + &nccls, restore); + w->StartInternalThread(); + workers[i].reset(w); } - - // Run root solver on current thread + CUDA_CHECK(cudaSetDevice(gpus[0])); + Caffe::set_solver_rank(0); + barrier_ = &barrier; + solver_->add_callback(this); + if (solver_->param().layer_wise_reduce()) { + solver_->net()->add_after_backward(this); + } + nccls[0] = this; + // Wait for workers + barrier.wait(); + // Init NCCL + InitSingleProcess(&nccls); + barrier.wait(); + // Run first solver on current thread + Broadcast(); solver_->Solve(); - - for (int i = 1; i < syncs.size(); ++i) { - syncs[i]->StopInternalThread(); + barrier.wait(); // Hangs without it when running tests + // Wait for shutdown + for (int i = 1; i < gpus.size(); ++i) { + workers[i]->StopInternalThread(); } } INSTANTIATE_CLASS(Params); INSTANTIATE_CLASS(GPUParams); -INSTANTIATE_CLASS(P2PSync); +INSTANTIATE_CLASS(Worker); +INSTANTIATE_CLASS(NCCL); } // namespace caffe + +#endif // USE_NCCL diff 
--git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 430a0dea..1c85f696 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -98,7 +98,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 41 (last added: type) +// SolverParameter next available ID: 42 (last added: layer_wise_reduce) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -239,6 +239,9 @@ message SolverParameter { } // DEPRECATED: use type instead of solver_type optional SolverType solver_type = 30 [default = SGD]; + + // Overlap compute and communication for data parallel training + optional bool layer_wise_reduce = 41 [default = true]; } // A message that stores the solver snapshots @@ -655,8 +658,8 @@ message DataParameter { optional bool mirror = 6 [default = false]; // Force the encoded image to have 3 color channels optional bool force_encoded_color = 9 [default = false]; - // Prefetch queue (Number of batches to prefetch to host memory, increase if - // data access bandwidth varies). + // Prefetch queue (Increase if data feeding bandwidth varies, within the + // limit of device memory for GPU training) optional uint32 prefetch = 10 [default = 4]; } diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index ece3913e..1c1a9e59 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -26,16 +26,14 @@ SolverAction::Enum Solver::GetRequestedAction() { } template -Solver::Solver(const SolverParameter& param, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { +Solver::Solver(const SolverParameter& param) + : net_(), callbacks_(), requested_early_exit_(false) { Init(param); } template -Solver::Solver(const string& param_file, const Solver* root_solver) - : net_(), callbacks_(), root_solver_(root_solver), - requested_early_exit_(false) { +Solver::Solver(const string& param_file) + : net_(), callbacks_(), requested_early_exit_(false) { SolverParameter param; ReadSolverParamsFromTextFileOrDie(param_file, ¶m); Init(param); @@ -43,15 +41,13 @@ Solver::Solver(const string& param_file, const Solver* root_solver) template void Solver::Init(const SolverParameter& param) { - CHECK(Caffe::root_solver() || root_solver_) - << "root_solver_ needs to be set for all non-root solvers"; LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: " << std::endl << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; CheckSnapshotWritePermissions(); - if (Caffe::root_solver() && param_.random_seed() >= 0) { - Caffe::set_random_seed(param_.random_seed()); + if (param_.random_seed() >= 0) { + Caffe::set_random_seed(param_.random_seed() + Caffe::solver_rank()); } // Scaffolding code InitTrainNet(); @@ -101,11 +97,7 @@ void Solver::InitTrainNet() { net_state.MergeFrom(net_param.state()); net_state.MergeFrom(param_.train_state()); net_param.mutable_state()->CopyFrom(net_state); - if (Caffe::root_solver()) { - net_.reset(new Net(net_param)); - } else { - net_.reset(new Net(net_param, root_solver_->net_.get())); - } + net_.reset(new Net(net_param)); } template @@ -180,12 +172,7 @@ void Solver::InitTestNets() { net_params[i].mutable_state()->CopyFrom(net_state); LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; - if (Caffe::root_solver()) { - 
test_nets_[i].reset(new Net(net_params[i])); - } else { - test_nets_[i].reset(new Net(net_params[i], - root_solver_->test_nets_[i].get())); - } + test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); } } @@ -197,14 +184,16 @@ void Solver::Step(int iters) { int average_loss = this->param_.average_loss(); losses_.clear(); smoothed_loss_ = 0; + iteration_timer_.Start(); while (iter_ < stop_iter) { // zero-init the params net_->ClearParamDiffs(); if (param_.test_interval() && iter_ % param_.test_interval() == 0 - && (iter_ > 0 || param_.test_initialization()) - && Caffe::root_solver()) { - TestAll(); + && (iter_ > 0 || param_.test_initialization())) { + if (Caffe::root_solver()) { + TestAll(); + } if (requested_early_exit_) { // Break out of the while loop because stop was requested while testing. break; @@ -225,8 +214,13 @@ void Solver::Step(int iters) { // average the loss across iterations for smoothed reporting UpdateSmoothedLoss(loss, start_iter, average_loss); if (display) { + float lapse = iteration_timer_.Seconds(); + float per_s = (iter_ - iterations_last_) / (lapse ? lapse : 1); LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_ - << ", loss = " << smoothed_loss_; + << " (" << per_s << " iter/s, " << lapse << "s/" + << param_.display() << " iters), loss = " << smoothed_loss_; + iteration_timer_.Start(); + iterations_last_ = iter_; const vector*>& result = net_->output_blobs(); int score_index = 0; for (int j = 0; j < result.size(); ++j) { diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp index e78eadca..d8107e1e 100644 --- a/src/caffe/solvers/adagrad_solver.cpp +++ b/src/caffe/solvers/adagrad_solver.cpp @@ -12,7 +12,6 @@ void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta, template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp index 23ab2d43..7c1fac1f 100644 --- a/src/caffe/solvers/nesterov_solver.cpp +++ b/src/caffe/solvers/nesterov_solver.cpp @@ -12,7 +12,6 @@ void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum, template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { - CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index f30f316d..ad6abe54 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -100,10 +100,10 @@ void SGDSolver::ClipGradients() { template void SGDSolver::ApplyUpdate() { - CHECK(Caffe::root_solver()); Dtype rate = GetLearningRate(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { - LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; + LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << this->iter_ + << ", lr = " << rate; } ClipGradients(); for (int param_id = 0; param_id < this->net_->learnable_params().size(); diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp index 4d356417..88d9b785 100644 --- a/src/caffe/syncedmem.cpp +++ b/src/caffe/syncedmem.cpp @@ -3,26 +3,41 @@ #include "caffe/util/math_functions.hpp" namespace caffe { 
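+// Editorial note (hedged): the constructors below record the creating device
+// only under #ifdef DEBUG, and check_device() is likewise a no-op in release
+// builds — the device bookkeeping is purely an assertion mechanism, not a
+// functional change.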
+SyncedMemory::SyncedMemory() + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED), + own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) { +#ifndef CPU_ONLY +#ifdef DEBUG + CUDA_CHECK(cudaGetDevice(&device_)); +#endif +#endif +} + +SyncedMemory::SyncedMemory(size_t size) + : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED), + own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) { +#ifndef CPU_ONLY +#ifdef DEBUG + CUDA_CHECK(cudaGetDevice(&device_)); +#endif +#endif +} SyncedMemory::~SyncedMemory() { + check_device(); if (cpu_ptr_ && own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); } #ifndef CPU_ONLY if (gpu_ptr_ && own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); } #endif // CPU_ONLY } inline void SyncedMemory::to_cpu() { + check_device(); switch (head_) { case UNINITIALIZED: CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_); @@ -49,10 +64,10 @@ inline void SyncedMemory::to_cpu() { } inline void SyncedMemory::to_gpu() { + check_device(); #ifndef CPU_ONLY switch (head_) { case UNINITIALIZED: - CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); caffe_gpu_memset(size_, 0, gpu_ptr_); head_ = HEAD_AT_GPU; @@ -60,7 +75,6 @@ inline void SyncedMemory::to_gpu() { break; case HEAD_AT_CPU: if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } @@ -77,11 +91,13 @@ inline void SyncedMemory::to_gpu() { } const void* SyncedMemory::cpu_data() { + check_device(); to_cpu(); return (const void*)cpu_ptr_; } void SyncedMemory::set_cpu_data(void* data) { + check_device(); CHECK(data); if (own_cpu_data_) { CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_); @@ -92,6 +108,7 @@ void SyncedMemory::set_cpu_data(void* data) { } const void* SyncedMemory::gpu_data() { + check_device(); #ifndef CPU_ONLY to_gpu(); return (const void*)gpu_ptr_; @@ -102,16 +119,11 @@ const void* SyncedMemory::gpu_data() { } void SyncedMemory::set_gpu_data(void* data) { + check_device(); #ifndef CPU_ONLY CHECK(data); if (own_gpu_data_) { - int initial_device; - cudaGetDevice(&initial_device); - if (gpu_device_ != -1) { - CUDA_CHECK(cudaSetDevice(gpu_device_)); - } CUDA_CHECK(cudaFree(gpu_ptr_)); - cudaSetDevice(initial_device); } gpu_ptr_ = data; head_ = HEAD_AT_GPU; @@ -122,12 +134,14 @@ void SyncedMemory::set_gpu_data(void* data) { } void* SyncedMemory::mutable_cpu_data() { + check_device(); to_cpu(); head_ = HEAD_AT_CPU; return cpu_ptr_; } void* SyncedMemory::mutable_gpu_data() { + check_device(); #ifndef CPU_ONLY to_gpu(); head_ = HEAD_AT_GPU; @@ -140,9 +154,9 @@ void* SyncedMemory::mutable_gpu_data() { #ifndef CPU_ONLY void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { + check_device(); CHECK(head_ == HEAD_AT_CPU); if (gpu_ptr_ == NULL) { - CUDA_CHECK(cudaGetDevice(&gpu_device_)); CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_)); own_gpu_data_ = true; } @@ -153,5 +167,20 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) { } #endif +void SyncedMemory::check_device() { +#ifndef CPU_ONLY +#ifdef DEBUG + int device; + cudaGetDevice(&device); + CHECK(device == device_); + if (gpu_ptr_ && own_gpu_data_) { + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, gpu_ptr_)); + CHECK(attributes.device == device_); + } +#endif +#endif +} + } 
// namespace caffe diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp index 3e8d113d..3835af1f 100644 --- a/src/caffe/test/test_data_layer.cpp +++ b/src/caffe/test/test_data_layer.cpp @@ -105,6 +105,32 @@ class DataLayerTest : public MultiDeviceTest { } } + void TestSkip() { + LayerParameter param; + param.set_phase(TRAIN); + DataParameter* data_param = param.mutable_data_param(); + int batch_size = 5; + data_param->set_batch_size(batch_size); + data_param->set_source(filename_->c_str()); + data_param->set_backend(backend_); + Caffe::set_solver_count(8); + for (int dev = 0; dev < Caffe::solver_count(); ++dev) { + Caffe::set_solver_rank(dev); + DataLayer layer(param); + layer.SetUp(blob_bottom_vec_, blob_top_vec_); + int label = dev; + for (int iter = 0; iter < 10; ++iter) { + layer.Forward(blob_bottom_vec_, blob_top_vec_); + for (int i = 0; i < batch_size; ++i) { + EXPECT_EQ(label % batch_size, blob_top_label_->cpu_data()[i]); + label += Caffe::solver_count(); + } + } + } + Caffe::set_solver_count(1); + Caffe::set_solver_rank(0); + } + void TestReshape(DataParameter_DB backend) { const int num_inputs = 5; // Save data of varying shapes. @@ -356,6 +382,11 @@ TYPED_TEST(DataLayerTest, TestReadLevelDB) { this->TestRead(); } +TYPED_TEST(DataLayerTest, TestSkipLevelDB) { + this->Fill(false, DataParameter_DB_LEVELDB); + this->TestSkip(); +} + TYPED_TEST(DataLayerTest, TestReshapeLevelDB) { this->TestReshape(DataParameter_DB_LEVELDB); } @@ -396,6 +427,11 @@ TYPED_TEST(DataLayerTest, TestReadLMDB) { this->TestRead(); } +TYPED_TEST(DataLayerTest, TestSkipLMDB) { + this->Fill(false, DataParameter_DB_LMDB); + this->TestSkip(); +} + TYPED_TEST(DataLayerTest, TestReshapeLMDB) { this->TestReshape(DataParameter_DB_LMDB); } diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 975a8f0f..6ad0d8f6 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -36,7 +36,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { string snapshot_prefix_; shared_ptr > solver_; - shared_ptr > sync_; +#ifdef USE_NCCL + shared_ptr > nccl_; +#endif int seed_; // Dimensions are determined by generate_sample_data.py // TODO this is brittle and the hdf5 file should be checked instead. @@ -85,6 +87,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { "lr_policy: 'fixed' " "iter_size: " << iter_size << " " "device_id: " << device_id << " " + "layer_wise_reduce: " << (!share_) << " " "net_param { " " name: 'TestNetwork' " " layer { " @@ -183,7 +186,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { } Caffe::set_random_seed(this->seed_); this->InitSolverFromProtoString(proto.str()); - if (from_snapshot != NULL) { + if (from_snapshot) { this->solver_->Restore(from_snapshot); for (int i = 0; i < this->solver_->iter(); ++i) { this->solver_->net()->Forward(); @@ -202,9 +205,10 @@ class GradientBasedSolverTest : public MultiDeviceTest { gpus.push_back(i); } Caffe::set_solver_count(gpus.size()); - this->sync_.reset(new P2PSync( - this->solver_, NULL, this->solver_->param())); - this->sync_->Run(gpus); +#ifdef USE_NCCL + this->nccl_.reset(new NCCL(this->solver_)); + this->nccl_->Run(gpus, from_snapshot); +#endif Caffe::set_solver_count(1); } if (snapshot) { @@ -457,12 +461,28 @@ class GradientBasedSolverTest : public MultiDeviceTest { const int kIterSize = 1; // Test over all numbers of devices. 
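  // With USE_NCCL the available GPU count is queried below and a sparse
  // subset of device counts {1, 2, 3, 8, 16} is exercised; without NCCL
  // (or in CPU mode) only the single-device configuration runs.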
int available_devices = 1; -#ifndef CPU_ONLY +#ifdef USE_NCCL if (Caffe::mode() == Caffe::GPU) { CUDA_CHECK(cudaGetDeviceCount(&available_devices)); } #endif - for (int devices = 1; devices <= available_devices; ++devices) { + // Takes a while to test all sizes for each test so sparse + vector sizes; + sizes.push_back(1); + if (available_devices >= 2) { + sizes.push_back(2); + } + if (available_devices >= 3) { + sizes.push_back(3); + } + if (available_devices >= 8) { + sizes.push_back(8); + } + if (available_devices >= 16) { + sizes.push_back(16); + } + for (int i = 0; i < sizes.size(); ++i) { + int devices = sizes[i]; // Configure batch size for single / multi device equivalence. // Constant data is needed for multi device as for accumulation. num_ = kNum * devices; diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 8884ce95..68e10286 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -133,4 +133,34 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) { } } +TYPED_TEST(HDF5DataLayerTest, TestSkip) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter param; + param.add_top("data"); + param.add_top("label"); + + HDF5DataParameter* hdf5_data_param = param.mutable_hdf5_data_param(); + int batch_size = 5; + hdf5_data_param->set_batch_size(batch_size); + hdf5_data_param->set_source(*(this->filename)); + + Caffe::set_solver_count(8); + for (int dev = 0; dev < Caffe::solver_count(); ++dev) { + Caffe::set_solver_rank(dev); + + HDF5DataLayer layer(param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + int label = dev; + for (int iter = 0; iter < 1; ++iter) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < batch_size; ++i) { + EXPECT_EQ(1 + label, this->blob_top_label_->cpu_data()[i]); + label = (label + Caffe::solver_count()) % (batch_size * 2); + } + } + } + Caffe::set_solver_count(1); + Caffe::set_solver_rank(0); +} + } // namespace caffe diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp index 058668fe..f69d2104 100644 --- a/src/caffe/util/blocking_queue.cpp +++ b/src/caffe/util/blocking_queue.cpp @@ -1,7 +1,6 @@ #include #include -#include "caffe/data_reader.hpp" #include "caffe/layers/base_data_layer.hpp" #include "caffe/parallel.hpp" #include "caffe/util/blocking_queue.hpp" @@ -88,9 +87,5 @@ size_t BlockingQueue::size() const { template class BlockingQueue*>; template class BlockingQueue*>; -template class BlockingQueue; -template class BlockingQueue >; -template class BlockingQueue*>; -template class BlockingQueue*>; } // namespace caffe diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp index fb1d4956..491a9bd0 100644 --- a/src/caffe/util/db_lmdb.cpp +++ b/src/caffe/util/db_lmdb.cpp @@ -32,7 +32,7 @@ void LMDB::Open(const string& source, Mode mode) { MDB_CHECK(rc); } #endif - LOG(INFO) << "Opened lmdb " << source; + LOG_IF(INFO, Caffe::root_solver()) << "Opened lmdb " << source; } LMDBCursor* LMDB::NewCursor() { diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 4c587537..6d001026 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -90,6 +90,26 @@ void caffe_gpu_scal(const int N, const double alpha, double *X) { CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); } +template <> +void caffe_gpu_scal(const int N, const float alpha, float* X, + cudaStream_t str) { + cudaStream_t initial_stream; + 
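+  // Swap the shared cuBLAS handle onto the caller's stream for this call
+  // only: save the handle's current stream, run the scal on `str`, then
+  // restore the original stream so other users of the handle are unaffected.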
CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); +} + +template <> +void caffe_gpu_scal(const int N, const double alpha, double* X, + cudaStream_t str) { + cudaStream_t initial_stream; + CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); + CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); +} + template <> void caffe_gpu_axpby(const int N, const float alpha, const float* X, const float beta, float* Y) { diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 9bf4214a..3587d8aa 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -195,6 +195,7 @@ int train() { // If the gpus flag is not provided, allow the mode and device to be set // in the solver prototxt. if (FLAGS_gpu.size() == 0 + && solver_param.has_solver_mode() && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) { if (solver_param.has_device_id()) { FLAGS_gpu = "" + @@ -244,11 +245,15 @@ int train() { CopyLayers(solver.get(), FLAGS_weights); } + LOG(INFO) << "Starting Optimization"; if (gpus.size() > 1) { - caffe::P2PSync sync(solver, NULL, solver->param()); - sync.Run(gpus); +#ifdef USE_NCCL + caffe::NCCL nccl(solver); + nccl.Run(gpus, FLAGS_snapshot.size() > 0 ? FLAGS_snapshot.c_str() : NULL); +#else + LOG(FATAL) << "Multi-GPU execution not available - rebuild with USE_NCCL"; +#endif } else { - LOG(INFO) << "Starting Optimization"; solver->Solve(); } LOG(INFO) << "Optimization Done."; From e21b42004001879b232daed8f142fbc5a7e0b75d Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 22 Nov 2016 16:46:55 -0800 Subject: [PATCH 103/183] Python Multi-GPU --- python/caffe/__init__.py | 4 +- python/caffe/_caffe.cpp | 96 ++++++++++++++++++++++++++++++++++++-- python/caffe/pycaffe.py | 2 +- python/train.py | 99 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 8 deletions(-) create mode 100644 python/train.py diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 5fc6ec9b..dde2e986 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ -from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed +from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, Layer, get_solver from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 0a86045b..04dac234 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -267,12 +267,12 @@ bp::object BlobVec_add_blob(bp::tuple args, bp::dict kwargs) { } template -class PythonCallback: public Solver::Callback { +class SolverCallback: public Solver::Callback { protected: bp::object on_start_, on_gradients_ready_; public: - PythonCallback(bp::object on_start, bp::object 
on_gradients_ready) + SolverCallback(bp::object on_start, bp::object on_gradients_ready) : on_start_(on_start), on_gradients_ready_(on_gradients_ready) { } virtual void on_gradients_ready() { on_gradients_ready_(); @@ -284,9 +284,61 @@ class PythonCallback: public Solver::Callback { template void Solver_add_callback(Solver * solver, bp::object on_start, bp::object on_gradients_ready) { - solver->add_callback(new PythonCallback(on_start, on_gradients_ready)); + solver->add_callback(new SolverCallback(on_start, on_gradients_ready)); } +// Seems boost cannot call the base method directly +void Solver_add_nccl(SGDSolver* solver +#ifdef USE_NCCL + , NCCL* nccl +#endif +) { +#ifdef USE_NCCL + solver->add_callback(nccl); +#endif +} + +template +class NetCallback: public Net::Callback { + public: + explicit NetCallback(bp::object run) : run_(run) {} + + protected: + virtual void run(int layer) { + run_(layer); + } + bp::object run_; +}; +void Net_before_forward(Net* net, bp::object run) { + net->add_before_forward(new NetCallback(run)); +} +void Net_after_forward(Net* net, bp::object run) { + net->add_after_forward(new NetCallback(run)); +} +void Net_before_backward(Net* net, bp::object run) { + net->add_before_backward(new NetCallback(run)); +} +void Net_after_backward(Net* net, bp::object run) { + net->add_after_backward(new NetCallback(run)); +} + +void Net_add_nccl(Net* net +#ifdef USE_NCCL + , NCCL* nccl +#endif +) { +#ifdef USE_NCCL + net->add_after_backward(nccl); +#endif +} +#ifndef USE_NCCL +template +class NCCL { + public: + NCCL(shared_ptr > solver, const string& uid) {} +}; +#endif + BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); BOOST_PYTHON_MODULE(_caffe) { @@ -303,6 +355,10 @@ BOOST_PYTHON_MODULE(_caffe) { bp::def("set_mode_gpu", &set_mode_gpu); bp::def("set_random_seed", &set_random_seed); bp::def("set_device", &Caffe::SetDevice); + bp::def("solver_count", &Caffe::solver_count); + bp::def("set_solver_count", &Caffe::set_solver_count); + bp::def("solver_rank", &Caffe::solver_rank); + bp::def("set_solver_rank", &Caffe::set_solver_rank); bp::def("layer_type_list", &LayerRegistry::LayerTypeList); @@ -346,7 +402,12 @@ BOOST_PYTHON_MODULE(_caffe) { bp::with_custodian_and_ward<1, 2, bp::with_custodian_and_ward<1, 3> >()) .def("save", &Net_Save) .def("save_hdf5", &Net_SaveHDF5) - .def("load_hdf5", &Net_LoadHDF5); + .def("load_hdf5", &Net_LoadHDF5) + .def("before_forward", &Net_before_forward) + .def("after_forward", &Net_after_forward) + .def("before_backward", &Net_before_backward) + .def("after_backward", &Net_after_backward) + .def("after_backward", &Net_add_nccl); BP_REGISTER_SHARED_PTR_TO_PYTHON(Net); bp::class_, shared_ptr >, boost::noncopyable>( @@ -378,6 +439,10 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("type", bp::make_function(&Layer::type)); BP_REGISTER_SHARED_PTR_TO_PYTHON(Layer); + bp::class_("SolverParameter", bp::no_init) + .add_property("max_iter", &SolverParameter::max_iter) + .add_property("display", &SolverParameter::display) + .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce); bp::class_("LayerParameter", bp::no_init); bp::class_, shared_ptr >, boost::noncopyable>( @@ -387,11 +452,14 @@ BOOST_PYTHON_MODULE(_caffe) { bp::return_internal_reference<>())) .add_property("iter", &Solver::iter) .def("add_callback", &Solver_add_callback) + .def("add_callback", &Solver_add_nccl) .def("solve", static_cast::*)(const char*)>( &Solver::Solve), SolveOverloads()) .def("step", &Solver::Step) .def("restore", &Solver::Restore) - 
.def("snapshot", &Solver::Snapshot); + .def("snapshot", &Solver::Snapshot) + .add_property("param", bp::make_function(&Solver::param, + bp::return_value_policy())); BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); bp::class_, bp::bases >, @@ -435,6 +503,24 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_ >("BoolVec") .def(bp::vector_indexing_suite >()); + bp::class_, shared_ptr >, + boost::noncopyable>("NCCL", + bp::init >, const string&>()) +#ifdef USE_NCCL + .def("new_uid", &NCCL::new_uid).staticmethod("new_uid") + .def("bcast", &NCCL::Broadcast) +#endif + /* NOLINT_NEXT_LINE(whitespace/semicolon) */ + ; + BP_REGISTER_SHARED_PTR_TO_PYTHON(NCCL); + + bp::class_, boost::noncopyable>( + "Timer", bp::init<>()) + .def("start", &Timer::Start) + .def("stop", &Timer::Stop) + .add_property("ms", &Timer::MilliSeconds); + BP_REGISTER_SHARED_PTR_TO_PYTHON(Timer); + // boost python expects a void (missing) return value, while import_array // returns NULL for python3. import_array1() forces a void return value. import_array1(); diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 5bae18d9..18803818 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -11,7 +11,7 @@ import numpy as np from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \ - RMSPropSolver, AdaDeltaSolver, AdamSolver + RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer import caffe.io import six diff --git a/python/train.py b/python/train.py new file mode 100644 index 00000000..730dbe70 --- /dev/null +++ b/python/train.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +""" +Trains a model using one or more GPUs. +""" +from multiprocessing import Process + +import caffe + + +def train( + solver, # solver proto definition + snapshot, # solver snapshot to restore + gpus, # list of device ids + timing=False, # show timing info for compute and communications +): + # NCCL uses a uid to identify a session + uid = caffe.NCCL.new_uid() + + caffe.init_log() + caffe.log('Using devices %s' % str(gpus)) + + procs = [] + for rank in range(len(gpus)): + p = Process(target=solve, + args=(solver, snapshot, gpus, timing, uid, rank)) + p.daemon = True + p.start() + procs.append(p) + for p in procs: + p.join() + + +def time(solver, nccl): + fprop = [] + bprop = [] + total = caffe.Timer() + allrd = caffe.Timer() + for _ in range(len(solver.net.layers)): + fprop.append(caffe.Timer()) + bprop.append(caffe.Timer()) + display = solver.param.display + + def show_time(): + if solver.iter % display == 0: + s = '\n' + for i in range(len(solver.net.layers)): + s += 'forw %3d %8s ' % (i, solver.net.layers[i].layer_param.name) + s += ': %.2f\n' % fprop[i].ms + for i in range(len(solver.net.layers) - 1, -1, -1): + s += 'back %3d %8s ' % (i, solver.net.layers[i].layer_param.name) + s += ': %.2f\n' % bprop[i].ms + s += 'solver total: %.2f\n' % total.ms + s += 'allreduce: %.2f\n' % allrd.ms + caffe.log(s) + + solver.net.before_forward(lambda layer: fprop[layer].start()) + solver.net.after_forward(lambda layer: fprop[layer].stop()) + solver.net.before_backward(lambda layer: bprop[layer].start()) + solver.net.after_backward(lambda layer: bprop[layer].stop()) + solver.add_callback(lambda: total.start(), lambda: (total.stop(), allrd.start())) + solver.add_callback(nccl) + solver.add_callback(lambda: '', lambda: (allrd.stop(), show_time())) + + +def solve(proto, snapshot, gpus, timing, uid, rank): + caffe.set_mode_gpu() + caffe.set_device(gpus[rank]) + caffe.set_solver_count(len(gpus)) + caffe.set_solver_rank(rank) + + solver = 
caffe.SGDSolver(proto) + if snapshot and len(snapshot) != 0: + solver.restore(snapshot) + + nccl = caffe.NCCL(solver, uid) + nccl.bcast() + + if timing and rank == 0: + time(solver, nccl) + else: + solver.add_callback(nccl) + + if solver.param.layer_wise_reduce: + solver.net.after_backward(nccl) + solver.step(solver.param.max_iter) + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + + parser.add_argument("--solver", required=True, help="Solver proto definition.") + parser.add_argument("--snapshot", help="Solver snapshot to restore.") + parser.add_argument("--gpus", type=int, nargs='+', default=[0], + help="List of device ids.") + parser.add_argument("--timing", action='store_true', help="Show timing info.") + args = parser.parse_args() + + train(args.solver, args.snapshot, args.gpus, args.timing) From 0d27efc7e3d3d2edbf45cccb73bad03ad655c164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marian=20Gla=CC=88ser?= Date: Thu, 22 Dec 2016 12:25:46 -0800 Subject: [PATCH 104/183] Python layers should build on multiprocess & solver_cnt; enable with bindings --- include/caffe/layers/python_layer.hpp | 2 +- python/caffe/__init__.py | 2 +- python/caffe/_caffe.cpp | 1 + python/train.py | 5 +++-- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp index 529b09cb..10c4bfd0 100644 --- a/include/caffe/layers/python_layer.hpp +++ b/include/caffe/layers/python_layer.hpp @@ -21,7 +21,7 @@ class PythonLayer : public Layer { // Disallow PythonLayer in MultiGPU training stage, due to GIL issues // Details: https://github.com/BVLC/caffe/issues/2936 if (this->phase_ == TRAIN && Caffe::solver_count() > 1 - && !Caffe::root_solver() && !Caffe::multiprocess()) { + && !Caffe::multiprocess()) { LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py"; } self_.attr("param_str") = bp::str( diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index dde2e986..43a0c49b 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, Layer, get_solver +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, Layer, get_solver from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 04dac234..3589e476 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -359,6 +359,7 @@ BOOST_PYTHON_MODULE(_caffe) { bp::def("set_solver_count", &Caffe::set_solver_count); bp::def("solver_rank", &Caffe::solver_rank); bp::def("set_solver_rank", &Caffe::set_solver_rank); + bp::def("set_multiprocess", &Caffe::set_multiprocess); bp::def("layer_type_list", &LayerRegistry::LayerTypeList); diff --git a/python/train.py b/python/train.py index 730dbe70..5897f5dc 100644 --- a/python/train.py +++ b/python/train.py @@ -44,10 +44,10 @@ def show_time(): if solver.iter % display == 0: s = '\n' for i in range(len(solver.net.layers)): - s += 'forw %3d %8s ' % (i, solver.net.layers[i].layer_param.name) + s += 
'forw %3d %8s ' % (i, solver.net._layer_names[i]) s += ': %.2f\n' % fprop[i].ms for i in range(len(solver.net.layers) - 1, -1, -1): - s += 'back %3d %8s ' % (i, solver.net.layers[i].layer_param.name) + s += 'back %3d %8s ' % (i, solver.net._layer_names[i]) s += ': %.2f\n' % bprop[i].ms s += 'solver total: %.2f\n' % total.ms s += 'allreduce: %.2f\n' % allrd.ms @@ -67,6 +67,7 @@ def solve(proto, snapshot, gpus, timing, uid, rank): caffe.set_device(gpus[rank]) caffe.set_solver_count(len(gpus)) caffe.set_solver_rank(rank) + caffe.set_multiprocess(True) solver = caffe.SGDSolver(proto) if snapshot and len(snapshot) != 0: From 5f28eb1147c1abb6e5e5c7cd282218679b0d531d Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 4 Jan 2017 00:25:00 -0800 Subject: [PATCH 105/183] Using default from proto for prefetch --- include/caffe/layers/base_data_layer.hpp | 3 --- src/caffe/layers/base_data_layer.cpp | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 925b019d..21d3ada5 100644 --- a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -67,9 +67,6 @@ class BasePrefetchingDataLayer : virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); - // Prefetches batches (asynchronously if to GPU memory) - static const int PREFETCH_COUNT = 4; // same as proto - protected: virtual void InternalThreadEntry(); virtual void load_batch(Batch* batch) = 0; diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 9414f6f9..93a798f3 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -36,8 +36,7 @@ template BasePrefetchingDataLayer::BasePrefetchingDataLayer( const LayerParameter& param) : BaseDataLayer(param), - prefetch_(param.has_data_param() ? 
- param.data_param().prefetch() : PREFETCH_COUNT), + prefetch_(param.data_param().prefetch()), prefetch_free_(), prefetch_full_(), prefetch_current_() { for (int i = 0; i < prefetch_.size(); ++i) { prefetch_[i].reset(new Batch()); From 8e63bb6ef1537db2d94ddf2dc084020af5c8727d Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Thu, 12 Jan 2017 15:26:07 +0800 Subject: [PATCH 106/183] minor typo --- models/bvlc_googlenet/train_val.prototxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 models/bvlc_googlenet/train_val.prototxt diff --git a/models/bvlc_googlenet/train_val.prototxt b/models/bvlc_googlenet/train_val.prototxt old mode 100644 new mode 100755 index 5dee3abe..5fe367f2 --- a/models/bvlc_googlenet/train_val.prototxt +++ b/models/bvlc_googlenet/train_val.prototxt @@ -1692,7 +1692,7 @@ layer { type: "SoftmaxWithLoss" bottom: "loss2/classifier" bottom: "label" - top: "loss2/loss1" + top: "loss2/loss2" loss_weight: 0.3 } layer { From 91c15e85124ce2b143d2c18ccab5c5740ef4ce31 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Fri, 13 Jan 2017 14:33:35 -0500 Subject: [PATCH 107/183] Python 2/3 compatible download_model_binary.py --- scripts/download_model_binary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/download_model_binary.py b/scripts/download_model_binary.py index fcdbb5a9..a72fd5d7 100755 --- a/scripts/download_model_binary.py +++ b/scripts/download_model_binary.py @@ -3,10 +3,11 @@ import sys import time import yaml -import urllib import hashlib import argparse +from six.moves import urllib + required_keys = ['caffemodel', 'caffemodel_url', 'sha1'] @@ -69,7 +70,7 @@ def model_checks_out(filename=model_filename, sha1=frontmatter['sha1']): sys.exit(0) # Download and verify model. - urllib.urlretrieve( + urllib.request.urlretrieve( frontmatter['caffemodel_url'], model_filename, reporthook) if not model_checks_out(): print('ERROR: model did not download correctly! Run this again.') From a19357a190664b1ea99d18e14eedc27e43ebed42 Mon Sep 17 00:00:00 2001 From: shai Date: Sun, 15 Jan 2017 08:54:45 +0000 Subject: [PATCH 108/183] fixing upgrade_proto for BatchNorm layer: be more conservative leave "name" in param, only set lr_mult and decay_mult to zero --- src/caffe/util/upgrade_proto.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index a0aacbe9..94771c8c 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -1018,7 +1018,13 @@ void UpgradeNetBatchNorm(NetParameter* net_param) { // the previous BatchNorm layer definition. if (net_param->layer(i).type() == "BatchNorm" && net_param->layer(i).param_size() == 3) { - net_param->mutable_layer(i)->clear_param(); + // set lr_mult and decay_mult to zero. leave all other param intact. + for (int ip = 0; ip < net_param->layer(i).param_size(); ip++) { + ParamSpec* fixed_param_spec = + net_param->mutable_layer(i)->mutable_param(ip); + fixed_param_spec->set_lr_mult(0.f); + fixed_param_spec->set_decay_mult(0.f); + } } } } From ceb25c8abe1e70558d8cc72545e4381cd1b4f273 Mon Sep 17 00:00:00 2001 From: Adam Browne Date: Wed, 18 Jan 2017 15:25:02 -0500 Subject: [PATCH 109/183] Fix various documentation typos (#4172) * fix typo (standaraized->standardized) * fix typo (convet->convert, etc..) 
* fix typo (incompartible->incompatible) * fix typo (does't->doesn't) * fix typo (decoded->decode) --- cmake/ConfigGen.cmake | 2 +- cmake/Cuda.cmake | 2 +- cmake/Targets.cmake | 6 +++--- examples/CMakeLists.txt | 2 +- src/caffe/data_transformer.cpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 05637111..fd9dd2d2 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -109,7 +109,7 @@ function(caffe_generate_export_configs) # ---[ Configure and install version file ]--- - # TODO: Lines below are commented because Caffe does't declare its version in headers. + # TODO: Lines below are commented because Caffe doesn't declare its version in headers. # When the declarations are added, modify `caffe_extract_caffe_version()` macro and uncomment # configure_file(cmake/Templates/CaffeConfigVersion.cmake.in "${PROJECT_BINARY_DIR}/CaffeConfigVersion.cmake" @ONLY) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 7146a244..0fbf3018 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -284,7 +284,7 @@ mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) if(APPLE) caffe_detect_darwin_version(OSX_VERSION) - # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits + # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits if(OSX_VERSION VERSION_GREATER 10.8) # enabled by default if and only if CUDA version is less than 7.0 caffe_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0)) diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake index 2cb11584..090f86c5 100644 --- a/cmake/Targets.cmake +++ b/cmake/Targets.cmake @@ -88,7 +88,7 @@ function(caffe_pickup_caffe_sources root) file(GLOB_RECURSE proto_files ${root}/src/caffe/*.proto) list(APPEND srcs ${proto_files}) - # convet to absolute paths + # convert to absolute paths caffe_convert_absolute_paths(srcs) caffe_convert_absolute_paths(cuda) caffe_convert_absolute_paths(test_srcs) @@ -102,7 +102,7 @@ function(caffe_pickup_caffe_sources root) endfunction() ################################################################################################ -# Short command for setting defeault target properties +# Short command for setting default target properties # Usage: # caffe_default_properties() function(caffe_default_properties target) @@ -111,7 +111,7 @@ function(caffe_default_properties target) ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib" RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin") - # make sure we build all external depepdencies first + # make sure we build all external dependencies first if (DEFINED external_project_dependencies) add_dependencies(${target} ${external_project_dependencies}) endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 663d7360..a59e0df3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -23,7 +23,7 @@ foreach(source_file ${examples_srcs}) if(UNIX OR APPLE) # Funny command to make tutorials work - # TODO: remove in future as soon as naming is standartaized everywhere + # TODO: remove in future as soon as naming is standardized everywhere set(__outname ${PROJECT_BINARY_DIR}/examples/${folder}/${name}${Caffe_POSTFIX}) add_custom_command(TARGET ${name} POST_BUILD COMMAND ln -sf "${__outname}" "${__outname}.bin") diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp index 
7189d67e..3012251e 100644 --- a/src/caffe/data_transformer.cpp +++ b/src/caffe/data_transformer.cpp @@ -130,7 +130,7 @@ void DataTransformer::Transform(const Datum& datum, template void DataTransformer::Transform(const Datum& datum, Blob* transformed_blob) { - // If datum is encoded, decoded and transform the cv::image. + // If datum is encoded, decode and transform the cv::image. if (datum.encoded()) { #ifdef USE_OPENCV CHECK(!(param_.force_color() && param_.force_gray())) From e744056d8f7ebcf7f0410a52d801d9ca552f69ad Mon Sep 17 00:00:00 2001 From: xmyqsh Date: Thu, 19 Jan 2017 05:19:48 +0800 Subject: [PATCH 110/183] remove redundant operations in Crop layer (#5138) --- src/caffe/layers/crop_layer.cpp | 40 ++++++++++++++++----------------- src/caffe/layers/crop_layer.cu | 22 +++++++----------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index d36b61ca..ef8c177c 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -86,27 +86,25 @@ void CropLayer::crop_copy(const vector*>& bottom, } } else { // We are at the last dimensions, which is stored continuously in memory - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - // prepare index vector reduced(red) and with offsets(off) - std::vector ind_red(cur_dim, 0); - std::vector ind_off(cur_dim+1, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_red[j] = indices[j]; - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - // do the copy - if (is_forward) { - caffe_copy(top[0]->shape(cur_dim), - src_data + bottom[0]->offset(ind_off), - dest_data + top[0]->offset(ind_red)); - } else { - // in the backwards pass the src_data is top_diff - // and the dest_data is bottom_diff - caffe_copy(top[0]->shape(cur_dim), - src_data + top[0]->offset(ind_red), - dest_data + bottom[0]->offset(ind_off)); - } + // prepare index vector reduced(red) and with offsets(off) + std::vector ind_red(cur_dim, 0); + std::vector ind_off(cur_dim+1, 0); + for (int j = 0; j < cur_dim; ++j) { + ind_red[j] = indices[j]; + ind_off[j] = indices[j] + offsets[j]; + } + ind_off[cur_dim] = offsets[cur_dim]; + // do the copy + if (is_forward) { + caffe_copy(top[0]->shape(cur_dim), + src_data + bottom[0]->offset(ind_off), + dest_data + top[0]->offset(ind_red)); + } else { + // in the backwards pass the src_data is top_diff + // and the dest_data is bottom_diff + caffe_copy(top[0]->shape(cur_dim), + src_data + top[0]->offset(ind_red), + dest_data + bottom[0]->offset(ind_off)); } } } diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 1ea13253..677077cd 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -8,14 +8,12 @@ namespace caffe { // strides in the last two dimensions. 
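// Each thread copies one contiguous line of `width` elements, so only the
// inner strides are needed: the source and destination offsets are simply
// index * inner_stride.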
template __global__ void copy_kernel(const int n, const int height, const int width, - const int src_outer_stride, const int src_inner_stride, - const int dest_outer_stride, const int dest_inner_stride, + const int src_inner_stride, + const int dest_inner_stride, const Dtype* src, Dtype* dest) { CUDA_KERNEL_LOOP(index, n) { - int src_start = index / height * src_outer_stride - + index % height * src_inner_stride; - int dest_start = index / height * dest_outer_stride - + index % height * dest_inner_stride; + int src_start = index * src_inner_stride; + int dest_start = index * dest_inner_stride; for (int i = 0; i < width; ++i) { dest[dest_start + i] = src[src_start + i]; } @@ -53,11 +51,7 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, ind_off[cur_dim] = offsets[cur_dim]; ind_off[cur_dim+1] = offsets[cur_dim+1]; // Compute copy strides - const int src_outer_stride = - bottom[0]->shape(cur_dim)*bottom[0]->shape(cur_dim+1); const int src_inner_stride = bottom[0]->shape(cur_dim+1); - const int dest_outer_stride = - top[0]->shape(cur_dim)*top[0]->shape(cur_dim+1); const int dest_inner_stride = top[0]->shape(cur_dim+1); if (is_forward) { @@ -68,8 +62,8 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, // NOLINT_NEXT_LINE(whitespace/operators) copy_kernel<<>>( lines, height, width, - src_outer_stride, src_inner_stride, - dest_outer_stride, dest_inner_stride, + src_inner_stride, + dest_inner_stride, bottom_data, top_data); } else { @@ -80,8 +74,8 @@ void CropLayer::crop_copy_gpu(const vector*>& bottom, // NOLINT_NEXT_LINE(whitespace/operators) copy_kernel<<>>( lines, height, width, - dest_outer_stride, dest_inner_stride, - src_outer_stride, src_inner_stride, + dest_inner_stride, + src_inner_stride, top_diff, bottom_diff); } } From 9b9f6d02ccb664b7f17ce2d3d17072ba578cac09 Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Wed, 18 Jan 2017 16:03:55 -0800 Subject: [PATCH 111/183] [build] remove trailing backslash on comment --- Makefile.config.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.config.example b/Makefile.config.example index 541cf807..b590bd16 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -68,7 +68,7 @@ PYTHON_INCLUDE := /usr/include/python2.7 \ # ANACONDA_HOME := $(HOME)/anaconda # PYTHON_INCLUDE := $(ANACONDA_HOME)/include \ # $(ANACONDA_HOME)/include/python2.7 \ - # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \ + # $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include # Uncomment to use Python 3 (default is Python 2) # PYTHON_LIBRARIES := boost_python3 python3.5m From ff3158a3d0f974a15981dfdbaa95c11ec2cee097 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 18 Jan 2017 17:39:35 -0800 Subject: [PATCH 112/183] ignore generated includes for docs --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 281ef326..eff292b7 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ cmake_build # Generated documentation docs/_site +docs/_includes docs/gathered _site doxygen From 9ab67099e08c03bf57e6a67538ca4746365beda8 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 18 Jan 2017 17:40:36 -0800 Subject: [PATCH 113/183] copyright spans 2014-2017 --- LICENSE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index d69d16f5..0c99adc1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,11 +1,11 @@ COPYRIGHT All contributions by the University of California: -Copyright (c) 2014, 2015, The Regents of the University of 
California (Regents) +Copyright (c) 2014-2017 The Regents of the University of California (Regents) All rights reserved. All other contributions: -Copyright (c) 2014, 2015, the respective contributors +Copyright (c) 2014-2017, the respective contributors All rights reserved. Caffe uses a shared copyright model: each contributor holds copyright over From 4056f79f9d8ebf261db45883470a0e2939f725e9 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 17 Jan 2017 20:10:15 -0800 Subject: [PATCH 114/183] Docker refresh: simplified & update to 16.04, cuda8, cudnn5, nccl --- docker/Makefile | 50 ------------------ docker/README.md | 70 ++++++++++++-------------- docker/{standalone => }/cpu/Dockerfile | 12 +++-- docker/{standalone => }/gpu/Dockerfile | 15 +++--- docker/templates/Dockerfile.template | 42 ---------------- 5 files changed, 49 insertions(+), 140 deletions(-) delete mode 100644 docker/Makefile rename docker/{standalone => }/cpu/Dockerfile (76%) rename docker/{standalone => }/gpu/Dockerfile (66%) delete mode 100644 docker/templates/Dockerfile.template diff --git a/docker/Makefile b/docker/Makefile deleted file mode 100644 index 3a6575b0..00000000 --- a/docker/Makefile +++ /dev/null @@ -1,50 +0,0 @@ -# A makefile to build the docker images for caffe. -# Two caffe images will be built: -# caffe:cpu --> A CPU-only build of caffe. -# caffe:gpu --> A GPU-enabled build using the latest CUDA and CUDNN versions. - -DOCKER ?= docker - -all: docker_files standalone - -.PHONY: standalone devel - -standalone: cpu_standalone gpu_standalone - - -cpu_standalone: standalone/cpu/Dockerfile - $(DOCKER) build -t caffe:cpu standalone/cpu - -gpu_standalone: standalone/gpu/Dockerfile - $(DOCKER) build -t caffe:gpu standalone/gpu - -docker_files: standalone_files - -standalone_files: standalone/cpu/Dockerfile standalone/gpu/Dockerfile - -FROM_GPU = "nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04" -FROM_CPU = "ubuntu:14.04" -GPU_CMAKE_ARGS = -DUSE_CUDNN=1 -CPU_CMAKE_ARGS = -DCPU_ONLY=1 - -# A make macro to select the CPU or GPU base image. -define from_image -$(if $(strip $(findstring gpu,$@)),$(FROM_GPU),$(FROM_CPU)) -endef - -# A make macro to select the CPU or GPU build args. -define build_args -$(if $(strip $(findstring gpu,$@)),$(GPU_CMAKE_ARGS),$(CPU_CMAKE_ARGS)) -endef - -# A make macro to construct the CPU or GPU Dockerfile from the template -define create_docker_file - @echo creating $@ - @echo "FROM "$(from_image) > $@ - @cat $^ | sed 's/$${CMAKE_ARGS}/$(build_args)/' >> $@ -endef - - -standalone/%/Dockerfile: templates/Dockerfile.template - $(create_docker_file) - diff --git a/docker/README.md b/docker/README.md index fdab641b..11c18157 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,52 +1,48 @@ -# Caffe standalone Dockerfiles. +### Running an official image -The `standalone` subfolder contains docker files for generating both CPU and GPU executable images for Caffe. The images can be built using make, or by running: +You can run one of the automatic [builds](https://hub.docker.com/r/bvlc/caffe) +like this: -``` -docker build -t caffe:cpu standalone/cpu -``` -for example. (Here `gpu` can be substituted for `cpu`, but to keep the readme simple, only the `cpu` case will be discussed in detail). +`docker run -ti bvlc/caffe caffe --version` -Note that the GPU standalone requires a CUDA 7.5 capable driver to be installed on the system and [nvidia-docker] for running the Docker containers. Here it is generally sufficient to use `nvidia-docker` instead of `docker` in any of the commands mentioned. 
+or for GPU support (You need a CUDA 8.0 capable driver and +[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)): -# Running Caffe using the docker image +`nvidia-docker run -ti bvlc/caffe:gpu caffe --version` -In order to test the Caffe image, run: -``` -docker run -ti caffe:cpu caffe --version -``` -which should show a message like: -``` -libdc1394 error: Failed to initialize libdc1394 -caffe version 1.0.0-rc3 -``` +You might see an error about libdc1394, ignore it. -One can also build and run the Caffe tests in the image using: -``` -docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest" -``` +### Docker run options -In order to get the most out of the caffe image, some more advanced `docker run` options could be used. For example, running: -``` -docker run -ti --volume=$(pwd):/workspace caffe:cpu caffe train --solver=example_solver.prototxt -``` -will train a network defined in the `example_solver.prototxt` file in the current directory (`$(pwd)` is maped to the container volume `/workspace` using the `--volume=` Docker flag). +By default caffe runs as root, thus any output files, e.g. snapshots, will be owned +by root. It also runs by default in a container-private folder. -Note that docker runs all commands as root by default, and thus any output files (e.g. snapshots) generated will be owned by the root user. In order to ensure that the current user is used instead, the following command can be used: -``` -docker run -ti --volume=$(pwd):/workspace -u $(id -u):$(id -g) caffe:cpu caffe train --solver=example_solver.prototxt -``` -where the `-u` Docker command line option runs the commands in the container as the specified user, and the shell command `id` is used to determine the user and group ID of the current user. Note that the Caffe docker images have `/workspace` defined as the default working directory. This can be overridden using the `--workdir=` Docker command line option. +You can change this using flags, like user (-u), current directory, and volumes (-w and -v). +E.g. this behaves like the usual caffe executable: -# Other use-cases +`docker run --rm -u $(id -u):$(id -g) -v $(pwd):$(pwd) -w $(pwd) bvlc/caffe caffe train --solver=example_solver.prototxt` -Although running the `caffe` command in the docker containers as described above serves many purposes, the container can also be used for more interactive use cases. For example, specifying `bash` as the command instead of `caffe` yields a shell that can be used for interactive tasks. (Since the caffe build requirements are included in the container, this can also be used to build and run local versions of caffe). +Containers can also be used interactively, specifying e.g. `bash` or `ipython` +instead of `caffe`. -Another use case is to run python scripts that depend on `caffe`'s Python modules. Using the `python` command instead of `bash` or `caffe` will allow this, and an interactive interpreter can be started by running: ``` -docker run -ti caffe:cpu python +docker run -ti bvlc/caffe ipython +import caffe +... ``` -(`ipython` is also available in the container). -Since the `caffe/python` folder is also added to the path, the utility executable scripts defined there can also be used as executables. This includes `draw_net.py`, `classify.py`, and `detect.py` +The caffe build requirements are included in the container, so this can be used to +build and run custom versions of caffe. Also, `caffe/python` is in PATH, so python +utilities can be used directly, e.g. 
`draw_net.py`, `classify.py`, or `detect.py`. + +### Building images yourself + +Examples: + +`docker build -t caffe cpu` + +`docker build -t caffe:gpu gpu` + +You can also build Caffe and run the tests in the image: +`docker run -ti caffe bash -c "cd /opt/caffe/build; make runtest"` diff --git a/docker/standalone/cpu/Dockerfile b/docker/cpu/Dockerfile similarity index 76% rename from docker/standalone/cpu/Dockerfile rename to docker/cpu/Dockerfile index 4fef25aa..af6c03c6 100644 --- a/docker/standalone/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -1,5 +1,5 @@ -FROM ubuntu:14.04 -MAINTAINER caffe-maint@googlegroups.com +FROM ubuntu:16.04 +LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -20,17 +20,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python-dev \ python-numpy \ python-pip \ + python-setuptools \ python-scipy && \ rm -rf /var/lib/apt/lists/* ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master +# FIXME: use ARG instead of ENV once DockerHub supports this +ENV CLONE_TAG=rc4 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + pip install --upgrade pip && \ + cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \ mkdir build && cd build && \ cmake -DCPU_ONLY=1 .. && \ make -j"$(nproc)" diff --git a/docker/standalone/gpu/Dockerfile b/docker/gpu/Dockerfile similarity index 66% rename from docker/standalone/gpu/Dockerfile rename to docker/gpu/Dockerfile index daf6a722..0785b10f 100644 --- a/docker/standalone/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -1,5 +1,5 @@ -FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 -MAINTAINER caffe-maint@googlegroups.com +FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -20,19 +20,22 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python-dev \ python-numpy \ python-pip \ + python-setuptools \ python-scipy && \ rm -rf /var/lib/apt/lists/* ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master +# FIXME: use ARG instead of ENV once DockerHub supports this +ENV CLONE_TAG=rc4 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + pip install --upgrade pip && \ + cd python && for req in $(cat requirements.txt) pydot; do pip install $req; done && cd .. && \ + git clone https://github.com/NVIDIA/nccl.git && cd nccl && make -j install && cd .. && rm -rf nccl && \ mkdir build && cd build && \ - cmake -DUSE_CUDNN=1 .. && \ + cmake -DUSE_CUDNN=1 -DUSE_NCCL=1 .. 
&& \ make -j"$(nproc)" ENV PYCAFFE_ROOT $CAFFE_ROOT/python diff --git a/docker/templates/Dockerfile.template b/docker/templates/Dockerfile.template deleted file mode 100644 index 8834f057..00000000 --- a/docker/templates/Dockerfile.template +++ /dev/null @@ -1,42 +0,0 @@ -MAINTAINER caffe-maint@googlegroups.com - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - libatlas-base-dev \ - libboost-all-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libhdf5-serial-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libprotobuf-dev \ - libsnappy-dev \ - protobuf-compiler \ - python-dev \ - python-numpy \ - python-pip \ - python-scipy && \ - rm -rf /var/lib/apt/lists/* - -ENV CAFFE_ROOT=/opt/caffe -WORKDIR $CAFFE_ROOT - -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master - -RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ - mkdir build && cd build && \ - cmake ${CMAKE_ARGS} .. && \ - make -j"$(nproc)" - -ENV PYCAFFE_ROOT $CAFFE_ROOT/python -ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH -ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH -RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig - -WORKDIR /workspace From 135440371c7cb2932d5c1e8e671e0d2e231fd2cc Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Sat, 21 Jan 2017 03:06:38 +0000 Subject: [PATCH 115/183] cmake: bump soversion to rc4 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3af394f7..15a7fe46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "1.0.0-rc3" CACHE STRING "Caffe logical version") -set(CAFFE_TARGET_SOVERSION "1.0.0-rc3" CACHE STRING "Caffe soname version") +set(CAFFE_TARGET_VERSION "1.0.0-rc4" CACHE STRING "Caffe logical version") +set(CAFFE_TARGET_SOVERSION "1.0.0-rc4" CACHE STRING "Caffe soname version") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # ---[ Using cmake scripts and modules From 3a0b6c6e75ca17bae4c728c6987dc5db1e380ce6 Mon Sep 17 00:00:00 2001 From: Fyodor Tokarev Date: Sat, 21 Jan 2017 15:12:38 +0300 Subject: [PATCH 116/183] Update a comment in caffe.proto --- src/caffe/proto/caffe.proto | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 430a0dea..815ead35 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -128,8 +128,7 @@ message SolverParameter { // The states for the train/test nets. Must be unspecified or // specified once per net. // - // By default, all states will have solver = true; - // train_state will have phase = TRAIN, + // By default, train_state will have phase = TRAIN, // and all test_state's will have phase = TEST. // Other defaults are set according to the NetState defaults. 
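// For example, a solver prototxt may select a custom stage for each phase:
//   train_state { stage: "train-stage" }
//   test_state { stage: "test-stage" }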
optional NetState train_state = 26; From e0cd85237c9ea756cf6bd35b8b0e3432ea3e5273 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Mon, 23 Jan 2017 10:31:26 -0800 Subject: [PATCH 117/183] Restore can be invoked on rank > 0 --- src/caffe/solver.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 1c1a9e59..fd4c0372 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -462,7 +462,6 @@ string Solver::SnapshotToHDF5() { template void Solver::Restore(const char* state_file) { - CHECK(Caffe::root_solver()); string state_filename(state_file); if (state_filename.size() >= 3 && state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) { From 29f0cdb9d785459126516dc58f755af5b486cf71 Mon Sep 17 00:00:00 2001 From: Ken Schutte Date: Tue, 24 Jan 2017 10:45:52 -0600 Subject: [PATCH 118/183] parse_log.py was not using --verbose argument --- tools/extra/parse_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py index b47ffd0d..4248e2b8 100755 --- a/tools/extra/parse_log.py +++ b/tools/extra/parse_log.py @@ -203,7 +203,7 @@ def main(): args = parse_args() train_dict_list, test_dict_list = parse_log(args.logfile_path) save_csv_files(args.logfile_path, args.output_dir, train_dict_list, - test_dict_list, delimiter=args.delimiter) + test_dict_list, delimiter=args.delimiter, verbose=args.verbose) if __name__ == '__main__': From 6bf10afd20f91366909318fe4e85a098bb742f58 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Fri, 20 Jan 2017 11:53:12 +0000 Subject: [PATCH 119/183] Fix broken links in layer documentation, minor fixes. --- docs/tutorial/layers/accuracy.md | 3 +-- docs/tutorial/layers/argmax.md | 3 +-- docs/tutorial/layers/infogainloss.md | 5 ++--- docs/tutorial/layers/lrn.md | 4 ++-- docs/tutorial/layers/memorydata.md | 2 +- docs/tutorial/layers/multinomiallogisticloss.md | 2 +- docs/tutorial/layers/silence.md | 8 +------- 7 files changed, 9 insertions(+), 18 deletions(-) diff --git a/docs/tutorial/layers/accuracy.md b/docs/tutorial/layers/accuracy.md index ecf84090..80293b1c 100644 --- a/docs/tutorial/layers/accuracy.md +++ b/docs/tutorial/layers/accuracy.md @@ -10,7 +10,6 @@ title: Accuracy and Top-k * [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AccuracyLayer.html) * Header: [`./include/caffe/layers/accuracy_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/accuracy_layer.hpp) * CPU implementation: [`./src/caffe/layers/accuracy_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cpp) -* CUDA GPU implementation: [`./src/caffe/layers/accuracy_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cu) ## Parameters * Parameters (`AccuracyParameter accuracy_param`) @@ -18,4 +17,4 @@ title: Accuracy and Top-k {% highlight Protobuf %} {% include proto/AccuracyParameter.txt %} -{% endhighlight %} \ No newline at end of file +{% endhighlight %} diff --git a/docs/tutorial/layers/argmax.md b/docs/tutorial/layers/argmax.md index f5f173ac..9eb8b773 100644 --- a/docs/tutorial/layers/argmax.md +++ b/docs/tutorial/layers/argmax.md @@ -8,7 +8,6 @@ title: ArgMax Layer * [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ArgMaxLayer.html) * Header: [`./include/caffe/layers/argmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/argmax_layer.hpp) * CPU implementation: 
[`./src/caffe/layers/argmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cpp) -* CUDA GPU implementation: [`./src/caffe/layers/argmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cu) ## Parameters * Parameters (`ArgMaxParameter argmax_param`) @@ -16,4 +15,4 @@ title: ArgMax Layer {% highlight Protobuf %} {% include proto/ArgMaxParameter.txt %} -{% endhighlight %} \ No newline at end of file +{% endhighlight %} diff --git a/docs/tutorial/layers/infogainloss.md b/docs/tutorial/layers/infogainloss.md index 86140b6c..b3b690d2 100644 --- a/docs/tutorial/layers/infogainloss.md +++ b/docs/tutorial/layers/infogainloss.md @@ -8,11 +8,10 @@ title: Infogain Loss Layer * [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InfogainLossLayer.html) * Header: [`./include/caffe/layers/infogain_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/infogain_loss_layer.hpp) * CPU implementation: [`./src/caffe/layers/infogain_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cpp) -* CUDA GPU implementation: [`./src/caffe/layers/infogain_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cu) -A generalization of [MultinomialLogisticLossLayer](layers/multinomiallogisticloss.md) that takes an "information gain" (infogain) matrix specifying the "value" of all label pairs. +A generalization of [MultinomialLogisticLossLayer](multinomiallogisticloss.html) that takes an "information gain" (infogain) matrix specifying the "value" of all label pairs. -Equivalent to the [MultinomialLogisticLossLayer](layers/multinomiallogisticloss.md) if the infogain matrix is the identity. +Equivalent to the [MultinomialLogisticLossLayer](multinomiallogisticloss.html) if the infogain matrix is the identity. ## Parameters diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md index 387311c2..2fbef734 100644 --- a/docs/tutorial/layers/lrn.md +++ b/docs/tutorial/layers/lrn.md @@ -20,9 +20,9 @@ The local response normalization layer performs a kind of "lateral inhibition" b ## Parameters -* Parameters (`Parameter lrn_param`) +* Parameters (`LRNParameter lrn_param`) * From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): {% highlight Protobuf %} -{% include proto/BatchNormParameter.txt %} +{% include proto/LRNParameter.txt %} {% endhighlight %} diff --git a/docs/tutorial/layers/memorydata.md b/docs/tutorial/layers/memorydata.md index 754e62ae..afce4a24 100644 --- a/docs/tutorial/layers/memorydata.md +++ b/docs/tutorial/layers/memorydata.md @@ -7,7 +7,7 @@ title: Memory Data Layer * Layer type: `MemoryData` * [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MemoryDataLayer.html) * Header: [`./include/caffe/layers/memory_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/memory_data_layer.hpp) -* CPU implementation: [`./src/caffe/layers/memory_data_layer.cpu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/memory_data_layer.cpu) +* CPU implementation: [`./src/caffe/layers/memory_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/memory_data_layer.cpp) The memory data layer reads data directly from memory, without copying it. 
In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) in order to specify a source of contiguous data (as 4D row major array), which is read one batch-sized chunk at a time. diff --git a/docs/tutorial/layers/multinomiallogisticloss.md b/docs/tutorial/layers/multinomiallogisticloss.md index a28ab914..5eab74a8 100644 --- a/docs/tutorial/layers/multinomiallogisticloss.md +++ b/docs/tutorial/layers/multinomiallogisticloss.md @@ -7,7 +7,7 @@ title: Multinomial Logistic Loss Layer * Layer type: `MultinomialLogisticLoss` * [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MultinomialLogisticLossLayer.html) * Header: [`./include/caffe/layers/multinomial_logistic_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/multinomial_logistic_loss_layer.hpp) -* CPU implementation: [`./src/caffe/layers/multinomial_logistic_loss_layer.cpu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/multinomial_logistic_loss_layer.cpu) +* CPU implementation: [`./src/caffe/layers/multinomial_logistic_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/multinomial_logistic_loss_layer.cpp) ## Parameters diff --git a/docs/tutorial/layers/silence.md b/docs/tutorial/layers/silence.md index 2c37a9cd..8b4579a9 100644 --- a/docs/tutorial/layers/silence.md +++ b/docs/tutorial/layers/silence.md @@ -14,10 +14,4 @@ Silences a blob, so that it is not printed. ## Parameters -* Parameters (`SilenceParameter silence_param`) -* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): - -{% highlight Protobuf %} -{% include proto/BatchNormParameter.txt %} -{% endhighlight %} - +No parameters. From 7b5731c6a68b6a9372c00eb8e13c697f832d8d1b Mon Sep 17 00:00:00 2001 From: Wenbo Yang Date: Mon, 30 Jan 2017 16:33:20 +0800 Subject: [PATCH 120/183] Remove sdk version from veclib searching path. 
--- cmake/Modules/FindvecLib.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index 46043367..8eaab594 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -16,7 +16,7 @@ find_path(vecLib_INCLUDE_DIR vecLib.h DOC "vecLib include directory" PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} /System/Library/${__veclib_include_suffix} - /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ + /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ NO_DEFAULT_PATH) include(FindPackageHandleStandardArgs) From cd89d4b567529de086e409b66390c961624a84b3 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Wed, 1 Feb 2017 11:21:00 +0000 Subject: [PATCH 121/183] docs: update install_apt_debian guide --- docs/install_apt_debian.md | 76 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md index 0d39e3ae..65fe7092 100644 --- a/docs/install_apt_debian.md +++ b/docs/install_apt_debian.md @@ -5,13 +5,13 @@ title: "Installation: Debian" # Debian Installation Caffe packages are available for several Debian versions, as shown in the -following chart +following chart: ``` Your Distro | CPU_ONLY | CUDA | Alias ----------------+------------+--------+------------------- Debian/stable | ✘ | ✘ | Debian Jessie -Debian/testing | ✔ | ☐ | Debian Stretch/Sid +Debian/testing | ✔ | ✔ | Debian Stretch/Sid Debian/unstable | ✔ | ✔ | Debian Sid ``` @@ -19,30 +19,32 @@ Debian/unstable | ✔ | ✔ | Debian Sid * `✔ ` You can install caffe with a single command line following this guide. -* `☐ ` The same with `✔ `. However it will not work any more when Debian/Stretch becomes the stable branch. - -Last update: 2017-01-05 +Last update: 2017-02-01 ## Binary installation with APT Apart from the installation methods based on source, Debian/unstable -and Debian/testing users can install pre-compiled Caffe packages via the official archive. +and Debian/testing users can install pre-compiled Caffe packages from +the official archive. + +Make sure that your `/etc/apt/sources.list` contains `contrib` and `non-free` +sections if you want to install the CUDA version, for instance: -Make sure that there is something like the follows in your `/etc/apt/sources.list`: ``` -deb http://MIRROR/debian CODENAME main contrib non-free +deb http://ftp2.cn.debian.org/debian sid main contrib non-free ``` -where `MIRROR` is your favorate Debian mirror, and `CODENAME ∈ {testing,stretch,sid}`. Then we update APT cache and directly install Caffe. Note, the cpu version and -the cuda version cannot be installed at the same time. +the cuda version cannot coexist. + ``` -# apt update -# apt install [ caffe-cpu | caffe-cuda ] -# caffe # command line interface working -# python3 -c 'import caffe; print(caffe.__path__)' # python3 interface working +$ sudo apt update +$ sudo apt install [ caffe-cpu | caffe-cuda ] +$ caffe # command line interface working +$ python3 -c 'import caffe; print(caffe.__path__)' # python3 interface working ``` -It should work out of box. + +These Caffe packages should work for you out of box. 
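
The `python3 -c ...` one-liner in the guide above can be extended slightly to verify the packaged pycaffe; this sketch uses only symbols that `caffe/__init__.py` exports elsewhere in this series:

```
import caffe

print(caffe.__version__)  # the packaged library version
caffe.set_mode_cpu()      # caffe-cpu builds support CPU mode only
```
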
#### Customizing caffe packages @@ -50,46 +52,49 @@ Some users may need to customize the Caffe package. The way to customize the package is beyond this guide. Here is only a brief guide of producing the customized `.deb` packages. -Make sure that there is something like this in your `/etc/apt/sources.list`: +Make sure that there is a `dec-src` source in your `/etc/apt/sources.list`, +for instance: + ``` deb http://ftp2.cn.debian.org/debian sid main contrib non-free deb-src http://ftp2.cn.debian.org/debian sid main contrib non-free ``` Then we build caffe deb files with the following commands: + ``` $ sudo apt update -$ sudo apt install build-essential debhelper devscripts # standard package building tools -$ sudo apt build-dep [ caffe-cpu | caffe-cuda ] # the most elegant way to pull caffe build dependencies -$ apt source [ caffe-cpu | caffe-cuda ] # download the source tarball and extract +$ sudo apt install build-essential debhelper devscripts # standard package building tools +$ sudo apt build-dep [ caffe-cpu | caffe-cuda ] # the most elegant way to pull caffe build dependencies +$ apt source [ caffe-cpu | caffe-cuda ] # download the source tarball and extract $ cd caffe-XXXX -[ ... optional, customize caffe code/build ... ] -$ dch -llocal "Modified XXX in order to XXX" # write your one-line changelog -$ debuild -B -j4 # build caffe with 4 parallel jobs (similar to make -j4) +[ ... optional, customizing caffe code/build ... ] +$ dch --local "Modified XXX" # bump package version and write changelog +$ debuild -B -j4 # build caffe with 4 parallel jobs (similar to make -j4) [ ... building ...] -$ debc # optional, if you want to check the package contents -$ sudo debi # optional, install the generated packages +$ debc # optional, if you want to check the package contents +$ sudo debi # optional, install the generated packages +$ ls ../ # optional, you will see the resulting packages ``` -The resulting deb packages can be found under the parent directory of the source tree. -Note, the `dch ...` command line above is for bumping the package version number -and adding an entry to the package changelog. If you would like to write -more than one changelog entry, use subsequent `dch` command (see `man 1 dch`) -instead of manually modifing `debian/changelog` unless you know how to keep its format correct. +It is a BUG if the package failed to build without any change. The changelog will be installed at e.g. `/usr/share/doc/caffe-cpu/changelog.Debian.gz`. ## Source installation -Source installation under Debian/unstable is similar to that of Ubuntu, but +Source installation under Debian/unstable and Debian/testing is similar to that of Ubuntu, but here is a more elegant way to pull caffe build dependencies: + ``` $ sudo apt build-dep [ caffe-cpu | caffe-cuda ] ``` + Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`. #### Compiler Combinations -Some users may find their favorate compiler doesn't work well with CUDA. +Some users may find their favorate compiler doesn't work with CUDA. + ``` CXX compiler | CUDA 7.5 | CUDA 8.0 | -------------+------------+------------+- @@ -144,12 +149,13 @@ and hack the packaging scripts, then build your customized package. * Where are the examples, the models and other documentation stuff? ``` -sudo apt install caffe-doc -dpkg -L caffe-doc +$ sudo apt install caffe-doc +$ dpkg -L caffe-doc ``` * Where can I find the Debian package status? 
-https://tracker.debian.org/pkg/caffe (for the CPU_ONLY version) - +``` +https://tracker.debian.org/pkg/caffe (for the CPU_ONLY version) https://tracker.debian.org/pkg/caffe-contrib (for the CUDA version) +``` From 734702b3703de0368e901644125ddca91bab4cb7 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 8 Feb 2017 11:42:05 -0800 Subject: [PATCH 122/183] Document switch to explicit flags for docker: cpu / gpu. --- docker/README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docker/README.md b/docker/README.md index 11c18157..f9c7c756 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,9 +1,8 @@ ### Running an official image -You can run one of the automatic [builds](https://hub.docker.com/r/bvlc/caffe) -like this: +You can run one of the automatic [builds](https://hub.docker.com/r/bvlc/caffe). E.g. for the CPU version: -`docker run -ti bvlc/caffe caffe --version` +`docker run -ti bvlc/caffe:cpu caffe --version` or for GPU support (You need a CUDA 8.0 capable driver and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)): @@ -20,13 +19,13 @@ by root. It also runs by default in a container-private folder. You can change this using flags, like user (-u), current directory, and volumes (-w and -v). E.g. this behaves like the usual caffe executable: -`docker run --rm -u $(id -u):$(id -g) -v $(pwd):$(pwd) -w $(pwd) bvlc/caffe caffe train --solver=example_solver.prototxt` +`docker run --rm -u $(id -u):$(id -g) -v $(pwd):$(pwd) -w $(pwd) bvlc/caffe:cpu caffe train --solver=example_solver.prototxt` Containers can also be used interactively, specifying e.g. `bash` or `ipython` instead of `caffe`. ``` -docker run -ti bvlc/caffe ipython +docker run -ti bvlc/caffe:cpu ipython import caffe ... ``` @@ -39,10 +38,10 @@ utilities can be used directly, e.g. 
`draw_net.py`, `classify.py`, or `detect.py Examples: -`docker build -t caffe cpu` +`docker build -t caffe:cpu cpu` `docker build -t caffe:gpu gpu` You can also build Caffe and run the tests in the image: -`docker run -ti caffe bash -c "cd /opt/caffe/build; make runtest"` +`docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest"` From 9c201e177994e31df430cf01baa3105aa5c00699 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Wed, 8 Feb 2017 17:13:53 -0800 Subject: [PATCH 123/183] make: bump version to rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 65d08f7d..1b73ae0f 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 1 DYNAMIC_VERSION_MINOR := 0 -DYNAMIC_VERSION_REVISION := 0-rc3 +DYNAMIC_VERSION_REVISION := 0-rc4 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) From 15dfcc1433441f01b0602474eb068e20e7451dd4 Mon Sep 17 00:00:00 2001 From: Katherine Crowson Date: Thu, 9 Feb 2017 11:40:52 -0800 Subject: [PATCH 124/183] Add Pascal CUDA architectures to Makefile.config.example --- Makefile.config.example | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile.config.example b/Makefile.config.example index b590bd16..d552b38a 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -31,13 +31,17 @@ CUDA_DIR := /usr/local/cuda # CUDA_DIR := /usr # CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. +# For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility. +# For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility. 
CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ -gencode arch=compute_20,code=sm_21 \ -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ -gencode arch=compute_50,code=sm_50 \ - -gencode arch=compute_50,code=compute_50 + -gencode arch=compute_52,code=sm_52 \ + -gencode arch=compute_60,code=sm_60 \ + -gencode arch=compute_61,code=sm_61 \ + -gencode arch=compute_61,code=compute_61 # BLAS choice: # atlas for ATLAS (default) From 23fca12e579731cf21c783b4a82de3d0a8b6e2cf Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 16 Feb 2017 16:40:18 -0800 Subject: [PATCH 125/183] version bump: rc5 --- CMakeLists.txt | 4 ++-- Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15a7fe46..32b5bcb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "1.0.0-rc4" CACHE STRING "Caffe logical version") -set(CAFFE_TARGET_SOVERSION "1.0.0-rc4" CACHE STRING "Caffe soname version") +set(CAFFE_TARGET_VERSION "1.0.0-rc5" CACHE STRING "Caffe logical version") +set(CAFFE_TARGET_SOVERSION "1.0.0-rc5" CACHE STRING "Caffe soname version") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # ---[ Using cmake scripts and modules diff --git a/Makefile b/Makefile index 1b73ae0f..77900b69 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 1 DYNAMIC_VERSION_MINOR := 0 -DYNAMIC_VERSION_REVISION := 0-rc4 +DYNAMIC_VERSION_REVISION := 0-rc5 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) From 85ab6100a122042c7dfd4adaf06f4c0b2e71148d Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Mon, 27 Feb 2017 11:54:37 -0800 Subject: [PATCH 126/183] fix broken link to hinge loss --- docs/tutorial/layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index a903d5ac..2faacc58 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -128,7 +128,7 @@ Layers: * [Infogain Loss](layers/infogainloss.html) - a generalization of MultinomialLogisticLossLayer. * [Softmax with Loss](layers/softmaxwithloss.html) - computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. * [Sum-of-Squares / Euclidean](layers/euclideanloss.html) - computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. -* [Hinge / Margin](layers/hiddenloss.html) - The hinge loss layer computes a one-vs-all hinge (L1) or squared hinge loss (L2). +* [Hinge / Margin](layers/hingeloss.html) - The hinge loss layer computes a one-vs-all hinge (L1) or squared hinge loss (L2). * [Sigmoid Cross-Entropy Loss](layers/sigmoidcrossentropyloss.html) - computes the cross-entropy (logistic) loss, often used for predicting targets interpreted as probabilities. * [Accuracy / Top-k layer](layers/accuracy.html) - scores the output as an accuracy with respect to target -- it is not actually a loss and has no backward step. 
* [Contrastive Loss](layers/contrastiveloss.html) From fe9e58d6360d99cde0a883a06590631bb11911e0 Mon Sep 17 00:00:00 2001 From: zhuyuanhao Date: Wed, 1 Mar 2017 20:42:30 +0800 Subject: [PATCH 127/183] Remove not used variable in base_conv_layer.cpp --- src/caffe/layers/base_conv_layer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 4a4c68e0..35c90145 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -19,7 +19,6 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const int num_axes = bottom[0]->num_axes(); num_spatial_axes_ = num_axes - first_spatial_axis; CHECK_GE(num_spatial_axes_, 0); - vector bottom_dim_blob_shape(1, num_spatial_axes_ + 1); vector spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1)); // Setup filter kernel dimensions (kernel_shape_). kernel_shape_.Reshape(spatial_dim_blob_shape); From 4529f12bdcd27d74655473b6665f5a23cd1214b1 Mon Sep 17 00:00:00 2001 From: gineshidalgo99 Date: Thu, 9 Mar 2017 19:24:06 -0500 Subject: [PATCH 128/183] =?UTF-8?q?Removed=20some=20'warning:=20extra=20?= =?UTF-8?q?=E2=80=98;=E2=80=99=20[-Wpedantic]'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/caffe/util/math_functions.hpp | 6 +++--- include/caffe/util/mkl_alternate.hpp | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 51068fe2..37abce5e 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -128,16 +128,16 @@ inline int8_t caffe_sign(Dtype val) { } // output is 1 for the positives, 0 for zero, and -1 for the negatives -DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) // This returns a nonzero value if the input has its sign bit set. // The name sngbit is meant to avoid conflicts with std::signbit in the macro. // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, // and we don't want that to expand here when CUDA headers are also included. DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); + y[i] = static_cast((std::signbit)(x[i]))) -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 95df0f93..79b2c32d 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -36,10 +36,10 @@ extern "C" { v##name(n, a, y); \ } -DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]); -DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])); -DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])); -DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); +DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) +DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) +DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) +DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) // A simple way to define the vsl unary functions with singular parameter b. // The operation should be in the form e.g. 
y[i] = pow(a[i], b) @@ -58,7 +58,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); v##name(n, a, b, y); \ } -DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); +DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)) // A simple way to define the vsl binary functions. The operation should // be in the form e.g. y[i] = a[i] + b[i] @@ -77,10 +77,10 @@ DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); v##name(n, a, b, y); \ } -DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); -DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]); -DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]); -DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); +DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]) +DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]) +DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]) +DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]) // In addition, MKL comes with an additional function axpby that is not present // in standard blas. We will simply use a two-step (inefficient, of course) way From 1d3e6e4522a95faf954e775b23a2f907e66caf31 Mon Sep 17 00:00:00 2001 From: folz Date: Mon, 13 Mar 2017 11:04:30 +0100 Subject: [PATCH 129/183] Solver_add_nccl accepts any kind of Solver --- python/caffe/_caffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 3589e476..be011699 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -288,7 +288,7 @@ void Solver_add_callback(Solver * solver, bp::object on_start, } // Seems boost cannot call the base method directly -void Solver_add_nccl(SGDSolver* solver +void Solver_add_nccl(Solver* solver #ifdef USE_NCCL , NCCL* nccl #endif From 93993a3c2b25ad683dbf13ef3085b0ea5912911f Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 14 Mar 2017 15:41:40 -0700 Subject: [PATCH 130/183] Init test net on all GPUs, allows parallel inference --- src/caffe/solver.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index fd4c0372..04426937 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -51,8 +51,8 @@ void Solver::Init(const SolverParameter& param) { } // Scaffolding code InitTrainNet(); + InitTestNets(); if (Caffe::root_solver()) { - InitTestNets(); LOG(INFO) << "Solver scaffolding done."; } iter_ = 0; @@ -102,7 +102,6 @@ void Solver::InitTrainNet() { template void Solver::InitTestNets() { - CHECK(Caffe::root_solver()); const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; From 802d90fe81f04e5e9c28c088da0f1b22e1b9fed2 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 16 Mar 2017 23:08:20 -0400 Subject: [PATCH 131/183] Added python 3 compatibility to cpp_lint.py --- scripts/cpp_lint.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index 6ec4fb76..b2016d4b 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. # @@ -52,6 +52,10 @@ import sys import unicodedata +import six + +from six import iteritems, itervalues +from six.moves import xrange _USAGE = """ Syntax: cpp_lint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] 
@@ -756,7 +760,7 @@ def IncrementErrorCount(self, category): def PrintErrorCounts(self): """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): + for category, count in iteritems(self.errors_by_category): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) sys.stderr.write('Total errors found: %d\n' % self.error_count) @@ -3444,16 +3448,16 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) + if six.PY2: + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + return len(line) def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, @@ -3774,7 +3778,7 @@ def _GetTextInside(text, start_pattern): # Give opening punctuations to get the matching close-punctuations. matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) + closing_punctuation = set(itervalues(matching_punctuation)) # Find the position to start extracting text. match = re.search(start_pattern, text, re.M) @@ -4851,10 +4855,11 @@ def main(): # Change stderr to write with replacement characters so we don't die # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') + if six.PY2: + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') _cpplint_state.ResetErrorCounts() for filename in filenames: From accd188d3241c27a6d24b95cd95a4dca4f4078bc Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 8 Mar 2017 15:04:29 +0000 Subject: [PATCH 132/183] sane h5df file type check for weights --- src/caffe/net.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 70d51806..353c2f95 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -769,8 +769,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { template void Net::CopyTrainedLayersFrom(const string trained_filename) { - if (trained_filename.size() >= 3 && - trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) { + if (H5Fis_hdf5(trained_filename.c_str())) { CopyTrainedLayersFromHDF5(trained_filename); } else { CopyTrainedLayersFromBinaryProto(trained_filename); From 11930f1416efb66795e1fabc5e362a568446d37d Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Wed, 22 Mar 2017 22:36:14 +0100 Subject: [PATCH 133/183] Clarify batch norm parameter documentation. --- src/caffe/proto/caffe.proto | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a145c541..02e0ddf5 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -502,11 +502,21 @@ message ConcatParameter { } message BatchNormParameter { - // If false, accumulate global mean/variance values via a moving average. 
If - // true, use those accumulated values instead of computing mean/variance - // across the batch. + // If false, normalization is performed over the current mini-batch + // and global statistics are accumulated (but not yet used) by a moving + // average. + // If true, those accumulated mean and variance values are used for the + // normalization. + // By default, it is set to false when the network is in the training + // phase and true when the network is in the testing phase. optional bool use_global_stats = 1; - // How much does the moving average decay each iteration? + // What fraction of the moving average remains each iteration? + // Smaller values make the moving average decay faster, giving more + // weight to the recent values. + // Each iteration updates the moving average @f$S_{t-1}@f$ with the + // current mean @f$ Y_t @f$ by + // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ + // is the moving_average_fraction parameter. optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. From 5c8e3545c650e9d3924f707334bde7cd67cf4e07 Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 22 Mar 2017 23:15:34 +0000 Subject: [PATCH 134/183] [caffe][build] added Atlas lapack Library name atllapack --- cmake/Modules/FindAtlas.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake index 9c665a47..7ffa6393 100644 --- a/cmake/Modules/FindAtlas.cmake +++ b/cmake/Modules/FindAtlas.cmake @@ -28,7 +28,7 @@ find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) -find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) +find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS}) set(LOOKED_FOR Atlas_CBLAS_INCLUDE_DIR @@ -47,6 +47,6 @@ if(ATLAS_FOUND) set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY}) mark_as_advanced(${LOOKED_FOR}) - message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})") + message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}") endif(ATLAS_FOUND) From 1e02d622da5aa01fbcf1185bced8e4b0daa0a50b Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 22 Mar 2017 23:24:13 +0000 Subject: [PATCH 135/183] [caffe][build] added ABS_TEST_DATA_DIR var. 
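
The moving-average update documented in the BatchNorm proto patch above can be sanity-checked in a few lines; `beta` (the `moving_average_fraction`) and the per-batch means here are made-up numbers:

```
beta = 0.999                           # moving_average_fraction
batch_means = [0.20, 0.25, 0.22, 0.30]  # hypothetical per-batch statistics Y_t

S = 0.0                                # accumulated statistic
for Y in batch_means:
    S = (1.0 - beta) * Y + beta * S    # S_t = (1 - beta) * Y_t + beta * S_{t-1}
print(S)  # used in place of the batch mean once use_global_stats is true
```
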
--- cmake/Templates/caffe_config.h.in | 15 ++++----------- include/caffe/test/test_caffe_main.hpp | 3 +-- src/caffe/test/test_gradient_based_solver.cpp | 2 +- src/caffe/test/test_hdf5_output_layer.cpp | 3 +-- src/caffe/test/test_hdf5data_layer.cpp | 3 +-- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 45465b98..2080c63d 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -4,16 +4,9 @@ /* Binaries directory */ #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" +/* This is an absolute path so that we can run test from any build + * directory */ +#define ABS_TEST_DATA_DIR "${PROJECT_SOURCE_DIR}/src/caffe/test/test_data/" + /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} - -/* Temporary (TODO: remove) */ -#if 1 - #define CMAKE_SOURCE_DIR SOURCE_FOLDER "/src/" - #define EXAMPLES_SOURCE_DIR BINARY_FOLDER "/examples/" - #define CMAKE_EXT ".gen.cmake" -#else - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" -#endif diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091..294f7e50 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -18,9 +18,8 @@ using std::endl; #include "caffe_config.h" #else #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" + #define ABS_TEST_DATA_DIR "src/caffe/test/test_data" #endif int main(int argc, char** argv); diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 6ad0d8f6..465140f2 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -28,7 +28,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { seed_(1701), num_(4), channels_(3), height_(10), width_(10), share_(false) { input_file_ = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT); + ABS_TEST_DATA_DIR "/solver_data_list.txt"); } ~GradientBasedSolverTest() { delete input_file_; diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index 2bc2de1e..f94dd57e 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -20,8 +20,7 @@ class HDF5OutputLayerTest : public MultiDeviceTest { protected: HDF5OutputLayerTest() - : input_file_name_( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"), + : input_file_name_(ABS_TEST_DATA_DIR "/sample_data.h5"), blob_data_(new Blob()), blob_label_(new Blob()), num_(5), diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 487f5176..3977c486 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -30,8 +30,7 @@ class HDF5DataLayerTest : public MultiDeviceTest { blob_top_vec_.push_back(blob_top_label2_); // Check out generate_sample_data.py in the same directory. 
- filename = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT); + filename = new string(ABS_TEST_DATA_DIR "/sample_data_list.txt"); LOG(INFO)<< "Using sample HDF5 data file " << filename; } From 8602a238a712d50ac5a2d7dffadee2f34d755e3f Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Mon, 27 Mar 2017 11:33:06 -0700 Subject: [PATCH 136/183] Expose share_weights to python to allow running test nets --- python/caffe/_caffe.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index be011699..276f21f8 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -298,6 +298,10 @@ void Solver_add_nccl(Solver* solver #endif } +void share_weights(Solver* solver, Net* net) { + net->ShareTrainedLayersWith(solver->net().get()); +} + template class NetCallback: public Net::Callback { public: @@ -459,6 +463,7 @@ BOOST_PYTHON_MODULE(_caffe) { .def("step", &Solver::Step) .def("restore", &Solver::Restore) .def("snapshot", &Solver::Snapshot) + .def("share_weights", &share_weights) .add_property("param", bp::make_function(&Solver::param, bp::return_value_policy())); BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); From 850ffd8d1cf18cabe36eb269b63d693db2b167ef Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Mon, 27 Mar 2017 13:15:18 -0700 Subject: [PATCH 137/183] Remove missed legacy parallel code --- include/caffe/layers/base_data_layer.hpp | 2 -- include/caffe/layers/data_layer.hpp | 2 -- include/caffe/layers/dummy_data_layer.hpp | 2 -- include/caffe/layers/hdf5_data_layer.hpp | 2 -- include/caffe/layers/hdf5_output_layer.hpp | 2 -- include/caffe/layers/input_layer.hpp | 2 -- include/caffe/layers/python_layer.hpp | 4 ---- src/caffe/proto/caffe.proto | 4 +--- 8 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 21d3ada5..c8b6998c 100644 --- a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -26,8 +26,6 @@ class BaseDataLayer : public Layer { // This method may not be overridden except by the BasePrefetchingDataLayer. virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top) {} // Data layers have no bottoms, so reshaping is trivial. 
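
A hedged usage sketch for the `share_weights` binding added in the patch above. The file names are hypothetical, and the call pattern follows from Boost.Python exposing the free function `share_weights(Solver*, Net*)` as a `Solver` method:

```
import caffe

solver = caffe.get_solver('solver.prototxt')            # hypothetical solver file
test_net = caffe.Net('train_val.prototxt', caffe.TEST)  # hypothetical net file
solver.share_weights(test_net)  # test_net now reuses the solver net's parameters
test_net.forward()              # inference with the current training weights
```
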
diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp index dec58180..667a4ae4 100644 --- a/include/caffe/layers/data_layer.hpp +++ b/include/caffe/layers/data_layer.hpp @@ -20,8 +20,6 @@ class DataLayer : public BasePrefetchingDataLayer { virtual ~DataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top); - // DataLayer uses DataReader instead for sharing for parallelism - virtual inline bool ShareInParallel() const { return false; } virtual inline const char* type() const { return "Data"; } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp index 4180f1d0..13a63d47 100644 --- a/include/caffe/layers/dummy_data_layer.hpp +++ b/include/caffe/layers/dummy_data_layer.hpp @@ -22,8 +22,6 @@ class DummyDataLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp index 650a3fb0..601b36c6 100644 --- a/include/caffe/layers/hdf5_data_layer.hpp +++ b/include/caffe/layers/hdf5_data_layer.hpp @@ -27,8 +27,6 @@ class HDF5DataLayer : public Layer { virtual ~HDF5DataLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp index 487d08fc..061e279d 100644 --- a/include/caffe/layers/hdf5_output_layer.hpp +++ b/include/caffe/layers/hdf5_output_layer.hpp @@ -28,8 +28,6 @@ class HDF5OutputLayer : public Layer { virtual ~HDF5OutputLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/input_layer.hpp b/include/caffe/layers/input_layer.hpp index f4472678..0ffdc724 100644 --- a/include/caffe/layers/input_layer.hpp +++ b/include/caffe/layers/input_layer.hpp @@ -22,8 +22,6 @@ class InputLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp index 10c4bfd0..1407d921 100644 --- a/include/caffe/layers/python_layer.hpp +++ b/include/caffe/layers/python_layer.hpp @@ -34,10 +34,6 @@ class PythonLayer : public Layer { self_.attr("reshape")(bottom, top); } - virtual inline bool ShareInParallel() const { - return this->layer_param_.python_param().share_in_parallel(); - } - virtual inline const char* type() const { return "Python"; } protected: diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 02e0ddf5..8e528e8e 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -937,9 +937,7 @@ message PythonParameter { // string, dictionary in Python dict format, JSON, etc. You may parse this // string in `setup` method and use it in `forward` and `backward`. optional string param_str = 3 [default = '']; - // Whether this PythonLayer is shared among worker solvers during data parallelism. - // If true, each worker solver sequentially run forward from this layer. - // This value should be set true if you are using it as a data layer. + // DEPRECATED optional bool share_in_parallel = 4 [default = false]; } From 9bd80b2f12649c6336b64c8ebcc2d1210755d1c7 Mon Sep 17 00:00:00 2001 From: Yuduo Wu Date: Wed, 29 Mar 2017 14:42:36 -0700 Subject: [PATCH 138/183] Fix typo in test_caffe_main.cpp: defice -> device --- src/caffe/test/test_caffe_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index 6473b74d..8f333bd7 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY - // Before starting testing, let's first print out a few cuda defice info. + // Before starting testing, let's first print out a few cuda device info. 
int device; cudaGetDeviceCount(&device); cout << "Cuda number of devices: " << device << endl; From a32114e6b2e098e2fdef47e397542b105eb58b66 Mon Sep 17 00:00:00 2001 From: Will Crichton Date: Fri, 31 Mar 2017 11:22:22 -0400 Subject: [PATCH 139/183] Fixed memory leaks in cudnn conv and relu --- src/caffe/layers/cudnn_conv_layer.cpp | 1 + src/caffe/layers/cudnn_relu_layer.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 1987fb09..efc9e04e 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -252,6 +252,7 @@ CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { } cudaFree(workspaceData); + delete [] workspace; delete [] stream_; delete [] handle_; delete [] fwd_algo_; diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp index 795e0a9e..687c9057 100644 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ b/src/caffe/layers/cudnn_relu_layer.cpp @@ -36,6 +36,7 @@ CuDNNReLULayer::~CuDNNReLULayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroyActivationDescriptor(this->activ_desc_); cudnnDestroy(this->handle_); } From a2601eddf65bab54429244e350899b6d994f4f37 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 31 Mar 2017 11:01:13 -0700 Subject: [PATCH 140/183] Revert "Fix Python net drawing script" This reverts commit db6cf0a728cad63c93b345f2203f3ad1f5d5c2f4. --- python/caffe/draw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index e4fd7aac..9eecf6d7 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -104,11 +104,11 @@ def get_layer_label(layer, rankdir): pooling_types_dict[layer.pooling_param.pool], layer.type, separator, - layer.pooling_param.kernel_size[0] if len(layer.pooling_param.kernel_size._values) else 1, + layer.pooling_param.kernel_size, separator, - layer.pooling_param.stride[0] if len(layer.pooling_param.stride._values) else 1, + layer.pooling_param.stride, separator, - layer.pooling_param.pad[0] if len(layer.pooling_param.pad._values) else 0) + layer.pooling_param.pad) else: node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type) return node_label From 0096fe3d270a4833479076e18492de8b28564c80 Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Fri, 31 Mar 2017 11:18:39 -0700 Subject: [PATCH 141/183] Add support for cuDNN v6 Support for cuDNN v4 and v5 is preserved. --- docs/installation.md | 4 ++-- include/caffe/util/cudnn.hpp | 10 ++++++++++ scripts/travis/install-deps.sh | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 2e558027..42f1d0ce 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -42,14 +42,14 @@ Optional dependencies: * [OpenCV](http://opencv.org/) >= 2.4 including 3.0 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`) -* cuDNN for GPU acceleration (v5) +* cuDNN for GPU acceleration (v6) Pycaffe and Matcaffe interfaces have their own natural needs. * For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python` * For MATLAB Caffe: MATLAB with the `mex` compiler. -**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). 
To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v5; older versions are supported in older Caffe. +**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe. **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment. diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index a7d8dbba..498cfe38 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -41,6 +41,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { return "CUDNN_STATUS_NOT_SUPPORTED"; case CUDNN_STATUS_LICENSE_ERROR: return "CUDNN_STATUS_LICENSE_ERROR"; +#if CUDNN_VERSION_MIN(6, 0, 0) + case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: + return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; +#endif } return "Unknown cudnn status"; } @@ -109,8 +113,14 @@ template inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, int pad_h, int pad_w, int stride_h, int stride_w) { +#if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION, + dataType::type)); +#else + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); +#endif } template diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 1900b16d..1593ed8b 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -104,7 +104,7 @@ if $WITH_CUDA ; then ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda if $WITH_CUDNN ; then - apt-get install -y --no-install-recommends libcudnn5-dev + apt-get install -y --no-install-recommends libcudnn6-dev fi fi From 179dafdb1a930cf86ff0956618bf8411b8dcd90e Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 31 Mar 2017 11:24:56 -0700 Subject: [PATCH 142/183] Add test for caffe.draw.draw_net() --- python/caffe/test/test_draw.py | 33 +++++++++++++++++++++++++++ scripts/travis/install-deps.sh | 2 ++ scripts/travis/install-python-deps.sh | 1 + 3 files changed, 36 insertions(+) create mode 100644 python/caffe/test/test_draw.py diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py new file mode 100644 index 00000000..1634145e --- /dev/null +++ b/python/caffe/test/test_draw.py @@ -0,0 +1,33 @@ +import os +import unittest + +from google import protobuf + +import caffe.draw +from caffe.proto import caffe_pb2 + +def getFilenames(): + """Yields files in the source tree which are Net prototxts.""" + result = [] + + root_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', '..', '..')) + assert os.path.exists(root_dir) + + for dirname in ('models', 'examples'): + dirname = os.path.join(root_dir, dirname) + assert os.path.exists(dirname) + for cwd, _, filenames in os.walk(dirname): + for filename in filenames: + filename = os.path.join(cwd, filename) + if filename.endswith('.prototxt') and 'solver' not in 
filename: + yield os.path.join(dirname, filename) + + +class TestDraw(unittest.TestCase): + def test_draw_net(self): + for filename in getFilenames(): + net = caffe_pb2.NetParameter() + with open(filename) as infile: + protobuf.text_format.Merge(infile.read(), net) + caffe.draw.draw_net(net, 'LR') diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 1900b16d..59a9163d 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -8,6 +8,7 @@ source $BASEDIR/defaults.sh apt-get -y update apt-get install -y --no-install-recommends \ build-essential \ + graphviz \ libboost-filesystem-dev \ libboost-python-dev \ libboost-system-dev \ @@ -31,6 +32,7 @@ if ! $WITH_PYTHON3 ; then python-dev \ python-numpy \ python-protobuf \ + python-pydot \ python-skimage else # Python3 diff --git a/scripts/travis/install-python-deps.sh b/scripts/travis/install-python-deps.sh index eeec3027..910d35a9 100755 --- a/scripts/travis/install-python-deps.sh +++ b/scripts/travis/install-python-deps.sh @@ -11,4 +11,5 @@ if ! $WITH_PYTHON3 ; then else # Python3 pip install --pre protobuf==3.0.0b3 + pip install pydot fi From 41e34c9061e9577c2b1dd56be65fd23ef26457fd Mon Sep 17 00:00:00 2001 From: Nitheesh Date: Tue, 4 Apr 2017 13:36:20 +0530 Subject: [PATCH 143/183] Minor fix for net drawing script --- python/caffe/draw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 9eecf6d7..8411a41d 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -91,11 +91,11 @@ def get_layer_label(layer, rankdir): separator, layer.type, separator, - layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1, + layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1, separator, - layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1, + layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1, separator, - layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0) + layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0) elif layer.type == 'Pooling': pooling_types_dict = get_pooling_types_dict() node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\ From 31bfe8fb498ea2e528da6463c9045b397992e028 Mon Sep 17 00:00:00 2001 From: Nitheesh Date: Tue, 4 Apr 2017 13:40:31 +0530 Subject: [PATCH 144/183] Add main() for draw_net unittest, fix import errors --- python/caffe/test/test_draw.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py index 1634145e..835bb5df 100644 --- a/python/caffe/test/test_draw.py +++ b/python/caffe/test/test_draw.py @@ -1,7 +1,7 @@ import os import unittest -from google import protobuf +from google.protobuf import text_format import caffe.draw from caffe.proto import caffe_pb2 @@ -29,5 +29,9 @@ def test_draw_net(self): for filename in getFilenames(): net = caffe_pb2.NetParameter() with open(filename) as infile: - protobuf.text_format.Merge(infile.read(), net) + text_format.Merge(infile.read(), net) caffe.draw.draw_net(net, 'LR') + + +if __name__ == "__main__": + unittest.main() From 5f1ca848f8c9daa73f61f64413e15ab2cd6602e7 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Wed, 5 Apr 2017 10:03:31 +0000 Subject: [PATCH 145/183] Add example and small blurb about sigmoid layer. 
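
The draw.py fix above relies on repeated protobuf fields supporting `len()` and indexing directly; a small check, assuming nothing beyond the generated `caffe_pb2` module:

```
from caffe.proto import caffe_pb2

layer = caffe_pb2.LayerParameter()
layer.type = 'Convolution'
# kernel_size is a repeated field, so len() and [] work on it as-is and the
# private ._values accessor removed by the revert/fix pair was never needed:
assert len(layer.convolution_param.kernel_size) == 0  # unset -> fall back to 1
layer.convolution_param.kernel_size.append(3)
assert layer.convolution_param.kernel_size[0] == 3
```
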
--- docs/tutorial/layers/sigmoid.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md index 50531835..f18ac4b8 100644 --- a/docs/tutorial/layers/sigmoid.md +++ b/docs/tutorial/layers/sigmoid.md @@ -9,6 +9,16 @@ title: Sigmoid Layer * Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp) * CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp) * CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu) +* Example (from [`./examples/mnist/mnist_autoencoder.prototxt`](https://github.com/BVLC/caffe/blob/master/examples/mnist/mnist_autoencoder.prototxt)): + + layer { + name: "encode1neuron" + bottom: "encode1" + top: "encode1neuron" + type: "Sigmoid" + } + +The `Sigmoid` layer computes `sigmoid(x)` for each element `x` in the bottom blob. ## Parameters From ce7193c7385298825c8cabebd20f664f3f93f06a Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Sat, 8 Apr 2017 12:59:24 -0400 Subject: [PATCH 146/183] Removed repeated import Layer, get_solver --- python/caffe/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 43a0c49b..80f51716 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, Layer, get_solver +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier From b2a95fa7fcba2089b981eb30b47d9aeba2b89ce9 Mon Sep 17 00:00:00 2001 From: Bruno Bowden Date: Sat, 8 Apr 2017 15:54:04 -0700 Subject: [PATCH 147/183] Log shape dimensions for eltwise layer shape mismatch When layer shapes mismatch for the eltwise layer, caffe will fail a check but doesn't give any information on how the shapes mismatch. This logging information will make it easier to debug. Additionally this reorders the variables to CHECK(expected == actual), matching the JUnit convention. BEFORE: Check failed: bottom[i]->shape() == bottom[0]->shape() AFTER: Check failed: bottom[0]->shape() == bottom[i]->shape() bottom[0]: 1 4 (4), bottom[3]: 1 6 (6) NOTE: This removes use of CHECK_EQ in an earlier version of this PR, which caused a build warning due to include of glog/stl_logging.h. 
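
A sketch of the checking convention the Eltwise patch described above adopts: the expected value (`bottom[0]`) is compared first, and both shapes are reported on mismatch. The helper below is illustrative only, not Caffe API:

```
import numpy as np

def check_same_shape(bottoms):
    # expected (bottoms[0]) first, then the offending input, with both shapes
    for i, b in enumerate(bottoms[1:], start=1):
        if bottoms[0].shape != b.shape:
            raise ValueError('bottom[0]: %s, bottom[%d]: %s'
                             % (bottoms[0].shape, i, b.shape))

check_same_shape([np.zeros((1, 4)), np.zeros((1, 4)), np.zeros((1, 6))])
# -> ValueError: bottom[0]: (1, 4), bottom[2]: (1, 6)
```
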
--- src/caffe/layers/eltwise_layer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 21256166..3d82b0e1 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -31,7 +31,9 @@ template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); + CHECK(bottom[0]->shape() == bottom[i]->shape()) + << "bottom[0]: " << bottom[0]->shape_string() + << ", bottom[" << i << "]: " << bottom[i]->shape_string(); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. From 51728d1532dbee2853acb89a8a9653e82219953b Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 12 Apr 2017 01:42:59 -0700 Subject: [PATCH 148/183] Fix log parsing #5422 --- tools/extra/parse_log.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extra/parse_log.sh b/tools/extra/parse_log.sh index 9892c897..122eb9e6 100755 --- a/tools/extra/parse_log.sh +++ b/tools/extra/parse_log.sh @@ -39,7 +39,7 @@ rm aux.txt aux0.txt aux1.txt aux2.txt aux3.txt aux4.txt grep '] Solving ' $1 > aux.txt grep ', loss = ' $1 >> aux.txt grep 'Iteration ' aux.txt | sed 's/.*Iteration \([[:digit:]]*\).*/\1/g' > aux0.txt -grep ', loss = ' $1 | awk '{print $9}' > aux1.txt +grep ', loss = ' $1 | awk -F = '{print $2}' > aux1.txt grep ', lr = ' $1 | awk '{print $9}' > aux2.txt # Extracting elapsed seconds From bac59bed485dfa195600b5b12031401613fade05 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 12 Apr 2017 02:05:34 -0700 Subject: [PATCH 149/183] Allow using env vars for glog init from python --- python/caffe/_caffe.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 276f21f8..01b34b84 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -51,14 +51,18 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } -void InitLog(int level) { - FLAGS_logtostderr = 1; - FLAGS_minloglevel = level; +void InitLog() { ::google::InitGoogleLogging(""); ::google::InstallFailureSignalHandler(); } -void InitLogInfo() { - InitLog(google::INFO); +void InitLogLevel(int level) { + FLAGS_minloglevel = level; + InitLog(); +} +void InitLogLevelPipe(int level, bool stderr) { + FLAGS_minloglevel = level; + FLAGS_logtostderr = stderr; + InitLog(); } void Log(const string& s) { LOG(INFO) << s; @@ -353,7 +357,8 @@ BOOST_PYTHON_MODULE(_caffe) { // Caffe utility functions bp::def("init_log", &InitLog); - bp::def("init_log", &InitLogInfo); + bp::def("init_log", &InitLogLevel); + bp::def("init_log", &InitLogLevelPipe); bp::def("log", &Log); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); From 35a7b87ad87457291dfc79bf8a7e7cf7ef278cbb Mon Sep 17 00:00:00 2001 From: Noiredd Date: Wed, 12 Apr 2017 11:59:06 +0200 Subject: [PATCH 150/183] fixes pycaffe forward() and backward() behavior for nets whose layer names do not match respective tops --- python/caffe/pycaffe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 63606591..4a7b5a24 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -113,7 +113,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): 
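
The three `init_log` overloads exposed in the glog patch above can be driven from Python as follows. glog may only be initialized once per process, so the commented lines show the alternative forms rather than additional calls:

```
import caffe

caffe.init_log(0, True)  # minimum level 0 (INFO) and force logging to stderr
# caffe.init_log()       # defaults only; GLOG_* environment variables apply
# caffe.init_log(0)      # set just the minimum log level
caffe.log('logging initialized from Python')
```
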
if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + blobs) + outputs = set(self.top_names[end] + blobs) else: end_ind = len(self.layers) - 1 outputs = set(self.outputs + blobs) @@ -161,7 +161,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + diffs) + outputs = set(self.bottom_names[end] + diffs) else: end_ind = 0 outputs = set(self.inputs + diffs) From 3a987960d6a08b179eb6c0c526b27ab761ea2d6e Mon Sep 17 00:00:00 2001 From: Kang Kim Date: Thu, 13 Apr 2017 15:23:26 +0900 Subject: [PATCH 151/183] remove redundant check in LSTMUnitLayer --- src/caffe/layers/lstm_unit_layer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp index 277c031a..d1ab59c4 100644 --- a/src/caffe/layers/lstm_unit_layer.cpp +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -31,7 +31,6 @@ void LSTMUnitLayer::Reshape(const vector*>& bottom, CHECK_EQ(num_instances, bottom[i]->shape(1)); } hidden_dim_ = bottom[0]->shape(2); - CHECK_EQ(num_instances, bottom[1]->shape(1)); CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); top[0]->ReshapeLike(*bottom[0]); top[1]->ReshapeLike(*bottom[0]); From 96870628698090813d92a9b1f8af9a8311469354 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 13:15:24 -0400 Subject: [PATCH 152/183] Bump boost version to 1.55 in CMake build --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 02c81525..4a5bac47 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "") set(Caffe_COMPILE_OPTIONS "") # ---[ Boost -find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) +find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem) list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES}) From 0c9cc62379e4061b58b0dfa257d79c2ecaeb2be8 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Sat, 11 Mar 2017 20:12:40 -0500 Subject: [PATCH 153/183] Added support for python 3 and NCCL --- python/caffe/__init__.py | 2 +- python/caffe/_caffe.cpp | 32 +++++++++++++++++++++++++++++++- python/caffe/test/test_nccl.py | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 python/caffe/test/test_nccl.py diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 80f51716..776945ee 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, has_nccl from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 01b34b84..7fc06c08 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -347,6 +347,35 @@ class NCCL { }; #endif 
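
The pycaffe fix above matters for nets in which a layer's name differs from its top blob, e.g. an in-place ReLU named `relu1` writing to the blob `conv1`. A hedged sketch (the prototxt and layer names are assumptions):

```
import caffe

net = caffe.Net('deploy.prototxt', caffe.TEST)  # hypothetical deploy net
out = net.forward(start='conv1', end='relu1')
# With the fix, the result is keyed by the end layer's top blobs ('conv1');
# previously a blob literally named 'relu1' was assumed to exist.
print(list(out.keys()))
```
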
+bool HasNCCL() { +#ifdef USE_NCCL + return true; +#else + return false; +#endif +} + +#ifdef USE_NCCL +bp::object NCCL_New_Uid() { + std::string uid = NCCL::new_uid(); +#if PY_MAJOR_VERSION >= 3 + // Convert std::string to bytes so that Python does not + // try to decode the string using the current locale. + + // Since boost 1.53 boost.python will convert str and bytes + // to std::string but will convert std::string to str. Here we + // force a bytes object to be returned. When this object + // is passed back to the NCCL constructor boost.python will + // correctly convert the bytes to std::string automatically + PyObject* py_uid = PyBytes_FromString(uid.c_str()); + return bp::object(bp::handle<>(py_uid)); +#else + // automatic conversion is correct for python 2. + return uid; +#endif +} +#endif + BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); BOOST_PYTHON_MODULE(_caffe) { @@ -360,6 +389,7 @@ BOOST_PYTHON_MODULE(_caffe) { bp::def("init_log", &InitLogLevel); bp::def("init_log", &InitLogLevelPipe); bp::def("log", &Log); + bp::def("has_nccl", &HasNCCL); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); bp::def("set_random_seed", &set_random_seed); @@ -518,7 +548,7 @@ BOOST_PYTHON_MODULE(_caffe) { boost::noncopyable>("NCCL", bp::init >, const string&>()) #ifdef USE_NCCL - .def("new_uid", &NCCL::new_uid).staticmethod("new_uid") + .def("new_uid", NCCL_New_Uid).staticmethod("new_uid") .def("bcast", &NCCL::Broadcast) #endif /* NOLINT_NEXT_LINE(whitespace/semicolon) */ diff --git a/python/caffe/test/test_nccl.py b/python/caffe/test/test_nccl.py new file mode 100644 index 00000000..127a9337 --- /dev/null +++ b/python/caffe/test/test_nccl.py @@ -0,0 +1,19 @@ +import sys +import unittest + +import caffe + + +class TestNCCL(unittest.TestCase): + + def test_newuid(self): + """ + Test that NCCL uids are of the proper type + according to python version + """ + if caffe.has_nccl(): + uid = caffe.NCCL.new_uid() + if sys.version_info.major >= 3: + self.assertTrue(isinstance(uid, bytes)) + else: + self.assertTrue(isinstance(uid, str)) From e98023af4a570e3105486b661e4c4d1855c0dd79 Mon Sep 17 00:00:00 2001 From: Patrick Follmann Date: Thu, 29 Dec 2016 14:37:21 +0100 Subject: [PATCH 154/183] Add GPU sqrt functions --- include/caffe/util/math_functions.hpp | 3 +++ src/caffe/util/math_functions.cu | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 37abce5e..60a8404a 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -214,6 +214,9 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); +template +void caffe_gpu_sqrt(const int n, const Dtype* a, Dtype* y); + // caffe_gpu_rng_uniform with two arguments generates integers in the range // [0, UINT_MAX]. 
void caffe_gpu_rng_uniform(const int n, unsigned int* r); diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 6d001026..314e6ba0 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -387,6 +387,27 @@ void caffe_gpu_powx(const int N, const double* a, N, a, alpha, y); } +template +__global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = sqrt(a[index]); + } +} + +template <> +void caffe_gpu_sqrt(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + +template <> +void caffe_gpu_sqrt(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); From e93fcd267582888f960ca48d6e0c2e719d4ea09b Mon Sep 17 00:00:00 2001 From: Patrick Follmann Date: Thu, 29 Dec 2016 14:46:16 +0100 Subject: [PATCH 155/183] GPU BatchNormLayer: replace powx with mul and sqrt --- src/caffe/layers/batch_norm_layer.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu index c21713c8..a35e778e 100644 --- a/src/caffe/layers/batch_norm_layer.cu +++ b/src/caffe/layers/batch_norm_layer.cu @@ -48,14 +48,14 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), + caffe_gpu_mul(top[0]->count(), top[0]->gpu_data(), top[0]->gpu_data(), temp_.mutable_gpu_data()); // (X-EX)^2 caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. 
/ (num * spatial_dim), temp_.gpu_data(), spatial_sum_multiplier_.gpu_data(), 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., + caffe_gpu_gemv(CblasTrans, num, channels_, Dtype(1.), + num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), Dtype(0.), variance_.mutable_gpu_data()); // E((X_EX)^2) // compute and save moving average @@ -72,7 +72,7 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, // normalize variance caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + caffe_gpu_sqrt(variance_.count(), variance_.gpu_data(), variance_.mutable_gpu_data()); // replicate variance to input size From ab3398832964c1ff1bf6b78501e4e43a11f282a1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 13:25:16 -0700 Subject: [PATCH 156/183] Add CPU sqrt functions --- include/caffe/util/math_functions.hpp | 3 +++ src/caffe/util/math_functions.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 60a8404a..e549120a 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -52,6 +52,9 @@ void caffe_scal(const int N, const Dtype alpha, Dtype *X); template void caffe_sqr(const int N, const Dtype* a, Dtype* y); +template +void caffe_sqrt(const int N, const Dtype* a, Dtype* y); + template void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 71c02274..59625bc0 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -196,6 +196,16 @@ void caffe_sqr(const int n, const double* a, double* y) { vdSqr(n, a, y); } +template <> +void caffe_sqrt(const int n, const float* a, float* y) { + vsSqrt(n, a, y); +} + +template <> +void caffe_sqrt(const int n, const double* a, double* y) { + vdSqrt(n, a, y); +} + template <> void caffe_exp(const int n, const float* a, float* y) { vsExp(n, a, y); From 1c15d94f7da736945450e6ed321077f3045445b1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 13:26:16 -0700 Subject: [PATCH 157/183] CPU BatchNormLayer: replace powx with sqr and sqrt --- src/caffe/layers/batch_norm_layer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index 0a08ed4c..c6a1d5b1 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -124,8 +124,8 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_sqr(top[0]->count(), top_data, + temp_.mutable_cpu_data()); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. 
/ (num * spatial_dim), temp_.cpu_data(), spatial_sum_multiplier_.cpu_data(), 0., @@ -148,7 +148,7 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, // normalize variance caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), + caffe_sqrt(variance_.count(), variance_.cpu_data(), variance_.mutable_cpu_data()); // replicate variance to input size From 3d5bed06a9b6b8a5dfd3db8da33f2fa3bc9a1213 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 14:15:16 -0700 Subject: [PATCH 158/183] fix: add non-MKL sqrt (should have been included in ab33988) --- include/caffe/util/mkl_alternate.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 79b2c32d..8c2294c7 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -37,6 +37,7 @@ extern "C" { } DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) +DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i])) DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) From 2ec19b6177111526d2df362d29d0e08aa5645a22 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 13 Apr 2017 14:22:30 -0700 Subject: [PATCH 159/183] deprecate WindowData layer type --- include/caffe/layers/window_data_layer.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp index 35f41b80..b9b66b7c 100644 --- a/include/caffe/layers/window_data_layer.hpp +++ b/include/caffe/layers/window_data_layer.hpp @@ -16,7 +16,8 @@ namespace caffe { /** * @brief Provides data to the Net from windows of images files, specified - * by a window data file. + * by a window data file. This layer is *DEPRECATED* and only kept for + * archival purposes for use by the original R-CNN. * * TODO(dox): thorough documentation for Forward and proto params. 
*/ From e7163f650885b9f7b9cae1c3253aa97d9fe30d86 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 20:32:40 -0400 Subject: [PATCH 160/183] Updated Travis boost dependencies --- scripts/travis/install-deps.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 2fa2a74a..dac5d2f9 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -9,10 +9,10 @@ apt-get -y update apt-get install -y --no-install-recommends \ build-essential \ graphviz \ - libboost-filesystem-dev \ - libboost-python-dev \ - libboost-system-dev \ - libboost-thread-dev \ + libboost-filesystem1.55-dev \ + libboost-python1.55-dev \ + libboost-system1.55-dev \ + libboost-thread1.55-dev \ libgflags-dev \ libgoogle-glog-dev \ libhdf5-serial-dev \ From 8bc82c635914676d51ecd2849cc69f6fb6042496 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 13 Apr 2017 19:14:57 -0700 Subject: [PATCH 161/183] [examples] switch cifar-10 back to proto instead of h5 serialization (it's more common) --- examples/cifar10/cifar10_quick_solver.prototxt | 1 - examples/cifar10/train_full.sh | 4 ++-- examples/cifar10/train_quick.sh | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt index 5de276f7..14b4401b 100644 --- a/examples/cifar10/cifar10_quick_solver.prototxt +++ b/examples/cifar10/cifar10_quick_solver.prototxt @@ -20,7 +20,6 @@ display: 100 max_iter: 4000 # snapshot intermediate results snapshot: 4000 -snapshot_format: HDF5 snapshot_prefix: "examples/cifar10/cifar10_quick" # solver mode: CPU or GPU solver_mode: GPU diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh index 06ecc2dc..fe46e60d 100755 --- a/examples/cifar10/train_full.sh +++ b/examples/cifar10/train_full.sh @@ -9,9 +9,9 @@ $TOOLS/caffe train \ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate $@ diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh index d2b87534..257479e0 100755 --- a/examples/cifar10/train_quick.sh +++ b/examples/cifar10/train_quick.sh @@ -9,4 +9,4 @@ $TOOLS/caffe train \ # reduce learning rate by factor of 10 after 8 epochs $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate $@ From aa29eba26b781349174cb856b6ea96360ebbb3f2 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 22:37:13 -0400 Subject: [PATCH 162/183] Explicit std::string to bp::object conversion --- python/caffe/_caffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 7fc06c08..d7f43fff 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -371,7 +371,7 @@ bp::object NCCL_New_Uid() { return bp::object(bp::handle<>(py_uid)); #else // automatic conversion is 
correct for python 2. - return uid; + return bp::object(uid); #endif } #endif From c19c9602d031274ce77eb6a94ce2a9e8d843d98f Mon Sep 17 00:00:00 2001 From: Carl Doersch Date: Tue, 25 Aug 2015 11:26:14 -0700 Subject: [PATCH 163/183] Test for python forward and backward with start and end layer. --- python/caffe/test/test_net.py | 45 +++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 24391cc5..afd27690 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -25,11 +25,11 @@ def simple_net_file(num_output): bias_filler { type: 'constant' value: 2 } } param { decay_mult: 1 } param { decay_mult: 0 } } - layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip' + layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob' inner_product_param { num_output: """ + str(num_output) + """ weight_filler { type: 'gaussian' std: 2.5 } bias_filler { type: 'constant' value: -3 } } } - layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label' + layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label' top: 'loss' }""") f.close() return f.name @@ -71,6 +71,43 @@ def test_forward_backward(self): self.net.forward() self.net.backward() + def test_forward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=conv_blob.data.shape); + sample_data=sample_data.astype(np.float32); + conv_blob.data[:]=sample_data; + forward_blob=self.net.forward(start='ip',end='ip'); + self.assertIn('ip_blob',forward_blob); + + manual_forward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data, + conv_blob.data[i].reshape(-1)); + manual_forward.append(dot+self.net.params['ip'][1].data); + manual_forward=np.array(manual_forward); + + np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3); + + def test_backward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=ip_blob.data.shape) + sample_data=sample_data.astype(np.float32); + ip_blob.diff[:]=sample_data; + backward_blob=self.net.backward(start='ip',end='ip'); + self.assertIn('conv',backward_blob); + + manual_backward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data.transpose(), + sample_data[i].reshape(-1)); + manual_backward.append(dot); + manual_backward=np.array(manual_backward); + manual_backward=manual_backward.reshape(conv_blob.data.shape); + + np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3); + def test_clear_param_diffs(self): # Run a forward/backward step to have non-zero diffs self.net.forward() @@ -90,13 +127,13 @@ def test_top_bottom_names(self): self.assertEqual(self.net.top_names, OrderedDict([('data', ['data', 'label']), ('conv', ['conv']), - ('ip', ['ip']), + ('ip', ['ip_blob']), ('loss', ['loss'])])) self.assertEqual(self.net.bottom_names, OrderedDict([('data', []), ('conv', ['data']), ('ip', ['conv']), - ('loss', ['ip', 'label'])])) + ('loss', ['ip_blob', 'label'])])) def test_save_and_read(self): f = tempfile.NamedTemporaryFile(mode='w+', delete=False) From 451944333510e1ea9b0bdac11e4ec201e5284714 Mon Sep 17 00:00:00 2001 From: jgyllinsky Date: Fri, 14 Apr 2017 03:11:59 -0400 Subject: [PATCH 164/183] [docs] added apt command to install OpenBLAS (#4718) --- docs/install_apt.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index bc1566b0..ee2cd287 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -14,7 +14,7 @@ The NVIDIA package tends to follow more recent library and driver versions, but If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date. This can be skipped for CPU-only installation. -**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance. +**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS by `sudo apt-get install libopenblas-dev` or MKL for better CPU performance. **Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface. From 80073497045d3101492a28a8a2c87dff65d64ff4 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 10:17:52 -0700 Subject: [PATCH 165/183] fix lint errors that snuck in by #4566 --- src/caffe/test/test_gradient_based_solver.cpp | 12 ++++++++---- src/caffe/test/test_neuron_layer.cpp | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 05cab909..f4395f53 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -558,9 +558,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], + params[i]->cpu_data()[j]) << "param " << i << " data differed at dim " << j; - EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], + params[i]->cpu_diff()[j]) << "param " << i << " diff differed at dim " << j; } } @@ -569,9 +571,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector > >& history = solver_->history(); for (int i = 0; i < history.size(); ++i) { for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], + history[i]->cpu_data()[j]) << "history blob " << i << " data differed at dim " << j; - EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], + history[i]->cpu_diff()[j]) << "history blob " << i << " diff differed at dim " << j; } } diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 57bd47b3..180871a2 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -791,13 +791,16 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); // Check numbers for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], + blob_bottom_2->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + 
EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], + ip2.blobs()[0]->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], + ip2.blobs()[1]->cpu_diff()[s]); } for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s], From 4db619aec9cd384b11a1c55fac257d14b704bb15 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 14 Apr 2017 12:30:50 -0700 Subject: [PATCH 166/183] Docker update to cuDNN 6 --- docker/cpu/Dockerfile | 3 ++- docker/gpu/Dockerfile | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index af6c03c6..67e2e61b 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT # FIXME: use ARG instead of ENV once DockerHub supports this -ENV CLONE_TAG=rc4 +# https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ pip install --upgrade pip && \ diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile index 0785b10f..dcdbdf32 100644 --- a/docker/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT # FIXME: use ARG instead of ENV once DockerHub supports this -ENV CLONE_TAG=rc4 +# https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ pip install --upgrade pip && \ From 44da39f662a24de746fa83b92bd670fe41b3a7da Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:36:41 -0700 Subject: [PATCH 167/183] BVLC -> BAIR Berkeley AI Research (BAIR) is the the successor to the Berkeley Vision and Learning Center (BVLC). --- CONTRIBUTORS.md | 2 +- README.md | 6 +++--- docs/_layouts/default.html | 2 +- docs/development.md | 4 ++-- docs/index.md | 10 +++++----- docs/model_zoo.md | 18 +++++++++--------- docs/multigpu.md | 4 ++-- docs/performance_hardware.md | 2 +- docs/tutorial/interfaces.md | 4 ++-- examples/finetune_flickr_style/readme.md | 2 +- models/bvlc_alexnet/readme.md | 2 +- models/bvlc_googlenet/readme.md | 2 +- models/bvlc_reference_caffenet/readme.md | 2 +- models/bvlc_reference_rcnn_ilsvrc13/readme.md | 2 +- 14 files changed, 31 insertions(+), 31 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8db66ea8..3fd76781 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,6 +1,6 @@ # Contributors -Caffe is developed by a core set of BVLC members and the open-source community. +Caffe is developed by a core set of BAIR members and the open-source community. We thank all of our [contributors](https://github.com/BVLC/caffe/graphs/contributors)! diff --git a/README.md b/README.md index 44b9e62c..0ae3616b 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE) Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. 
+It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu))/The Berkeley Vision and Learning Center (BVLC) and community contributors. Check out the [project site](http://caffe.berkeleyvision.org) for all the details like - [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) - [Tutorial Documentation](http://caffe.berkeleyvision.org/tutorial/) -- [BVLC reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) +- [BAIR reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) - [Installation instructions](http://caffe.berkeleyvision.org/installation.html) and step-by-step examples. @@ -25,7 +25,7 @@ Happy brewing! ## License and Citation Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). -The BVLC reference models are released for unrestricted use. +The BAIR/BVLC reference models are released for unrestricted use. Please cite Caffe in your publications if it helps your research: diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index b8efe60b..3799e95a 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -36,7 +36,7 @@

Caffe

- Deep learning framework by the BVLC
+ Deep learning framework by BAIR

Created by diff --git a/docs/development.md b/docs/development.md index 107c2c3b..ec05bbee 100644 --- a/docs/development.md +++ b/docs/development.md @@ -4,7 +4,7 @@ title: Developing and Contributing # Development and Contributing Caffe is developed with active participation of the community.
-The [BVLC](http://bvlc.eecs.berkeley.edu/) brewers welcome all contributions! +The [BAIR](http://bair.berkeley.edu/)/BVLC brewers welcome all contributions! The exact details of contributions are recorded by versioning and cited in our [acknowledgements](http://caffe.berkeleyvision.org/#acknowledgements). This method is impartial and always up-to-date. @@ -37,7 +37,7 @@ We absolutely appreciate any contribution to this effort! The `master` branch receives all new development including community contributions. We try to keep it in a reliable state, but it is the bleeding edge, and things do get broken every now and then. -BVLC maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. +BAIR maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. #### Issues & Pull Request Protocol diff --git a/docs/index.md b/docs/index.md index 932b3b58..302a7d56 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,7 @@ title: Deep Learning Framework # Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and by community contributors. +It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu)) and by community contributors. [Yangqing Jia](http://daggerfs.com) created the project during his PhD at UC Berkeley. Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). @@ -45,7 +45,7 @@ A 4-page report for the ACM Multimedia Open Source competition (arXiv:1408.5093v - [Installation instructions](/installation.html)
Tested on Ubuntu, Red Hat, OS X.
* [Model Zoo](/model_zoo.html)
-BVLC suggests a standard distribution format for Caffe models, and provides trained models.
+BAIR suggests a standard distribution format for Caffe models, and provides trained models.
* [Developing & Contributing](/development.html)
Guidelines for development and contributing to Caffe.
* [API Documentation](/doxygen/annotated.html)
@@ -92,9 +92,9 @@ The core Caffe developers offer [consulting services](mailto:caffe-coldpress@goo ## Acknowledgements -The BVLC Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. +The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. -The BVLC members who have contributed to Caffe are (alphabetical by first name): +The BAIR members who have contributed to Caffe are (alphabetical by first name): [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/). The open-source community plays an important and growing role in Caffe's development. @@ -103,4 +103,4 @@ Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for re We sincerely appreciate your interest and contributions! If you'd like to contribute, please read the [developing & contributing](development.html) guide. -Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. +Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 06dc0a49..f9078718 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -14,15 +14,15 @@ To help share these models, we introduce the model zoo framework: ## Where to get trained models -First of all, we bundle BVLC-trained models for unrestricted, out of the box use. +First of all, we bundle BAIR-trained models for unrestricted, out of the box use.
-See the [BVLC model license](#bvlc-model-license) for details. +See the [BAIR model license](#bair-model-license) for details. Each one of these can be downloaded by running `scripts/download_model_binary.py ` where `` is specified below: -- **BVLC Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue) -- **BVLC AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) -- **BVLC Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) -- **BVLC GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) +- **BAIR Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue) +- **BAIR AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) +- **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) +- **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) **Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). These models are subject to conditions of their respective authors such as citation and license. @@ -55,14 +55,14 @@ Downloading model info is done just as easily with `scripts/download_model_from_ ### Hosting trained models It is up to the user where to host the `.caffemodel` file. -We host our BVLC-provided models on our own server. +We host our BAIR-provided models on our own server. Dropbox also works fine (tip: make sure that `?dl=1` is appended to the end of the URL). `scripts/download_model_binary.py ` downloads the `.caffemodel` from the URL specified in the `/readme.md` frontmatter and confirms SHA1. 
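The SHA1 confirmation amounts to hashing the downloaded file and comparing digests; a simplified sketch of that check (the helper name is illustrative, not the bundled script itself):

    import hashlib

    # Stream the downloaded .caffemodel and compare its SHA1 against the
    # digest recorded in the model's readme.md frontmatter.
    def model_checks_out(path, expected_sha1):
        sha1 = hashlib.sha1()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                sha1.update(chunk)  # 1 MB chunks; model files are large
        return sha1.hexdigest() == expected_sha1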
-## BVLC model license +## BAIR model license -The Caffe models bundled by the BVLC are released for unrestricted use. +The Caffe models bundled by the BAIR are released for unrestricted use. These models are trained on data from the [ImageNet project](http://www.image-net.org/) and training data includes internet photos that may be subject to copyright. diff --git a/docs/multigpu.md b/docs/multigpu.md index d91acef9..e04ebb0b 100644 --- a/docs/multigpu.md +++ b/docs/multigpu.md @@ -13,7 +13,7 @@ The GPUs to be used for training can be set with the "-gpu" flag on the command # Hardware Configuration Assumptions The current implementation uses a tree reduction strategy. e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate -updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. +updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced. @@ -23,4 +23,4 @@ Current implementation has a "soft" assumption that the devices being used are h # Scaling Performance -Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. \ No newline at end of file +Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md index cdd4b361..fbf25684 100644 --- a/docs/performance_hardware.md +++ b/docs/performance_hardware.md @@ -8,7 +8,7 @@ To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe refer For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified. 
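For reference, the exchange schedule that docs/multigpu.md (above) describes in prose can be written out directly; a small sketch for the stated 4-GPU example (the helper name `tree_pairs` is illustrative, not part of Caffe):

    # Gradient exchanges go up a binary tree: (1,0) and (3,2) in parallel,
    # then (2,0); reversing the list gives the model broadcast back down,
    # i.e. 0->2, then 0->1 and 2->3, matching the schedule quoted above.
    def tree_pairs(n_gpus):
        pairs, stride = [], 1
        while stride < n_gpus:
            for i in range(0, n_gpus, 2 * stride):
                pairs.append((i + stride, i))  # child sends to parent
            stride *= 2
        return pairs

    assert tree_pairs(4) == [(1, 0), (3, 2), (2, 0)]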
-**Acknowledgements**: BVLC members are very grateful to NVIDIA for providing several GPUs to conduct this research. +**Acknowledgements**: BAIR members are very grateful to NVIDIA for providing several GPUs to conduct this research. ## NVIDIA K40 diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md index d7ff3782..b5a4f1ad 100644 --- a/docs/tutorial/interfaces.md +++ b/docs/tutorial/interfaces.md @@ -91,7 +91,7 @@ In MatCaffe, you can * Run for a certain number of iterations and give back control to Matlab * Intermingle arbitrary Matlab code with gradient steps -An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). +An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). ### Build MatCaffe @@ -114,7 +114,7 @@ You can save your Matlab search PATH by running `savepath` so that you don't hav MatCaffe is very similar to PyCaffe in usage. -Examples below shows detailed usages and assumes you have downloaded BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. +Examples below shows detailed usages and assumes you have downloaded BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. model = './models/bvlc_reference_caffenet/deploy.prototxt'; weights = './models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'; diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md index 188dedf1..dacfd01c 100644 --- a/examples/finetune_flickr_style/readme.md +++ b/examples/finetune_flickr_style/readme.md @@ -9,7 +9,7 @@ priority: 5 # Fine-tuning CaffeNet for Style Recognition on "Flickr Style" Data Fine-tuning takes an already learned model, adapts the architecture, and resumes training from the already learned model weights. -Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. +Let's fine-tune the BAIR-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. 
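A minimal sketch of the same workflow through pycaffe (paths are the ones shipped in this repository; `copy_from` matches parameters by layer name, so layers renamed for the new task keep their fresh initialization):

    import caffe

    caffe.set_mode_gpu()
    solver = caffe.SGDSolver('models/finetune_flickr_style/solver.prototxt')
    # Overwrite matching layers with the pretrained CaffeNet weights; the
    # renamed classifier layer (fc8_flickr) stays randomly initialized.
    solver.net.copy_from(
        'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
    solver.step(1000)  # resume training from the copied weights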
## Explanation diff --git a/models/bvlc_alexnet/readme.md b/models/bvlc_alexnet/readme.md index 008d690f..a83e3d4e 100644 --- a/models/bvlc_alexnet/readme.md +++ b/models/bvlc_alexnet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC AlexNet Model +name: BAIR/BVLC AlexNet Model caffemodel: bvlc_alexnet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel license: unrestricted diff --git a/models/bvlc_googlenet/readme.md b/models/bvlc_googlenet/readme.md index 061b6d74..ef04db62 100644 --- a/models/bvlc_googlenet/readme.md +++ b/models/bvlc_googlenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC GoogleNet Model +name: BAIR/BVLC GoogleNet Model caffemodel: bvlc_googlenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel license: unrestricted diff --git a/models/bvlc_reference_caffenet/readme.md b/models/bvlc_reference_caffenet/readme.md index 671e47a5..5352e536 100644 --- a/models/bvlc_reference_caffenet/readme.md +++ b/models/bvlc_reference_caffenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC CaffeNet Model +name: BAIR/BVLC CaffeNet Model caffemodel: bvlc_reference_caffenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel license: unrestricted diff --git a/models/bvlc_reference_rcnn_ilsvrc13/readme.md b/models/bvlc_reference_rcnn_ilsvrc13/readme.md index 9a11a24d..12543b2b 100644 --- a/models/bvlc_reference_rcnn_ilsvrc13/readme.md +++ b/models/bvlc_reference_rcnn_ilsvrc13/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC Reference RCNN ILSVRC13 Model +name: BAIR/BVLC Reference RCNN ILSVRC13 Model caffemodel: bvlc_reference_rcnn_ilsvrc13.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_rcnn_ilsvrc13.caffemodel license: unrestricted From 3562698afb4b1f12f51eca752740e279f85714c4 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:45:21 -0700 Subject: [PATCH 168/183] drop performance + hardware page and switch to sheet simpler to read and update --- docs/index.md | 9 +++-- docs/performance_hardware.md | 73 ------------------------------------ 2 files changed, 5 insertions(+), 77 deletions(-) delete mode 100644 docs/performance_hardware.md diff --git a/docs/index.md b/docs/index.md index 302a7d56..bbfd91fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,15 +23,14 @@ Thanks to these contributors the framework tracks the state-of-the-art in both c **Speed** makes Caffe perfect for research experiments and industry deployment. Caffe can process **over 60M images per day** with a single NVIDIA K40 GPU\*. -That's 1 ms/image for inference and 4 ms/image for learning. -We believe that Caffe is the fastest convnet implementation available. +That's 1 ms/image for inference and 4 ms/image for learning and more recent library versions and hardware are faster still. +We believe that Caffe is among the fastest convnet implementations available. **Community**: Caffe already powers academic research projects, startup prototypes, and even large-scale industrial applications in vision, speech, and multimedia. Join our community of brewers on the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) and [Github](https://github.com/BVLC/caffe/).

-\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and caching IO.
-Consult performance [details](/performance_hardware.html).
+\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and prefetching IO.

## Documentation @@ -50,6 +49,8 @@ BAIR suggests a standard distribution format for Caffe models, and provides trai Guidelines for development and contributing to Caffe. * [API Documentation](/doxygen/annotated.html)
Developer documentation automagically generated from code comments.
+* [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)
+Comparison of inference and learning for different networks and GPUs. ### Examples diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md deleted file mode 100644 index fbf25684..00000000 --- a/docs/performance_hardware.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -title: Performance and Hardware Configuration ---- - -# Performance and Hardware Configuration - -To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model. - -For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified. - -**Acknowledgements**: BAIR members are very grateful to NVIDIA for providing several GPUs to conduct this research. - -## NVIDIA K40 - -Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory. - -Best settings with ECC off and maximum clock speed in standard Caffe: - -* Training is 26.5 secs / 20 iterations (5,120 images) -* Testing is 100 secs / validation set (50,000 images) - -Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn): - -* Training is 19.2 secs / 20 iterations (5,120 images) -* Testing is 60.7 secs / validation set (50,000 images) - -Other settings: - -* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set -* ECC on, default speed: training 31 secs / 20 iterations, test 117 secs / validation set -* ECC off, default speed: training 31 secs / 20 iterations, test 118 secs / validation set - -### K40 configuration tips - -For maximum K40 performance, turn off ECC and boost the clock speed (at your own risk). - -To turn off ECC, do - - sudo nvidia-smi -i 0 --ecc-config=0 # repeat with -i x for each GPU ID - -then reboot. - -Set the "persistence" mode of the GPU settings by - - sudo nvidia-smi -pm 1 - -and then set the clock speed with - - sudo nvidia-smi -i 0 -ac 3004,875 # repeat with -i x for each GPU ID - -but note that this configuration resets across driver reloading / rebooting. Include these commands in a boot script to initialize these settings. For a simple fix, add these commands to `/etc/rc.local` (on Ubuntu). - -## NVIDIA Titan - -Training: 26.26 secs / 20 iterations (5,120 images). -Testing: 100 secs / validation set (50,000 images). - -cuDNN Training: 20.25 secs / 20 iterations (5,120 images). -cuDNN Testing: 66.3 secs / validation set (50,000 images). - - -## NVIDIA K20 - -Training: 36.0 secs / 20 iterations (5,120 images). -Testing: 133 secs / validation set (50,000 images). - -## NVIDIA GTX 770 - -Training: 33.0 secs / 20 iterations (5,120 images). -Testing: 129 secs / validation set (50,000 images). - -cuDNN Training: 24.3 secs / 20 iterations (5,120 images). -cuDNN Testing: 104 secs / validation set (50,000 images). From 0f5bfc34e0b37b9ab3437d6755eb04a8dc9e8656 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:46:56 -0700 Subject: [PATCH 169/183] favor notebook examples as more clear and popular --- docs/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/index.md b/docs/index.md index bbfd91fc..82eb059e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -52,13 +52,6 @@ Developer documentation automagically generated from code comments. * [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)
Comparison of inference and learning for different networks and GPUs. -### Examples - -{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} -{% for page in examples %} --
{{page.title}}
{{page.description}}
-{% endfor %} - ### Notebook Examples {% assign notebooks = site.pages | where:'category','notebook' | sort: 'priority' %} @@ -66,6 +59,13 @@ Comparison of inference and learning for different networks and GPUs. -
{{page.title}}
{{page.description}}
{% endfor %} +### Command Line Examples + +{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} +{% for page in examples %} +-
{{page.title}}
{{page.description}}
+{% endfor %} + ## Citing Caffe Please cite Caffe in your publications if it helps your research: From 2158bbb2151049dec2486b720c0a351164a0eb6b Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:50:19 -0700 Subject: [PATCH 170/183] model zoo: point out wiki link immediately, explain manual editing --- docs/model_zoo.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/model_zoo.md b/docs/model_zoo.md index f9078718..3f77e825 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -3,7 +3,7 @@ title: Model Zoo --- # Caffe Model Zoo -Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data. +Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data: check out the [model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)! These models are learned and applied for problems ranging from simple regression, to large-scale visual classification, to Siamese networks for image similarity, to speech and robotics applications. To help share these models, we introduce the model zoo framework: @@ -24,7 +24,7 @@ Each one of these can be downloaded by running `scripts/download_model_binary.py - **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) - **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) -**Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). +**Community models** made by Caffe users are posted to a publicly editable [model zoo wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). These models are subject to conditions of their respective authors such as citation and license. Thank you for sharing your models! @@ -42,6 +42,8 @@ A caffe model is distributed as a directory containing: - License information. - [optional] Other helpful scripts. +This simple format can be handled through bundled scripts or manually if need be. + ### Hosting model info Github Gist is a good format for model info distribution because it can contain multiple files, is versionable, and has in-browser syntax highlighting and markdown rendering. From 414b74c06038c17924745b68954ef10827fe1edd Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:19:53 -0700 Subject: [PATCH 171/183] add missing names to BAIR roster --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 82eb059e..db8eaffb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -96,7 +96,7 @@ The core Caffe developers offer [consulting services](mailto:caffe-coldpress@goo The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. 
The BAIR members who have contributed to Caffe are (alphabetical by first name): -[Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/). +[Carl Doersch](http://www.carldoersch.com/), [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Philipp Krähenbühl](http://www.philkr.net/), [Ronghang Hu](http://ronghanghu.com/), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), [Takuya Narihira](https://github.com/tnarihi), and [Yangqing Jia](http://daggerfs.com/). The open-source community plays an important and growing role in Caffe's development. Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for recent activity and the [contributors](https://github.com/BVLC/caffe/graphs/contributors) for the full list. From e90a6a6ca29423afb15f39adb1157bff9e6f8655 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:24:30 -0700 Subject: [PATCH 172/183] retire caffe-dev and caffe-coldpress dev has diffused into the community from the original Caffe core --- docs/index.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index db8eaffb..0e21ae82 100644 --- a/docs/index.md +++ b/docs/index.md @@ -86,11 +86,6 @@ Join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users Framework development discussions and thorough bug reports are collected on [Issues](https://github.com/BVLC/caffe/issues). -Contact [caffe-dev](mailto:caffe-dev@googlegroups.com) if you have a confidential proposal for the framework *and the ability to act on it*. -Requests for features, explanations, or personal help will be ignored; post to [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) instead. - -The core Caffe developers offer [consulting services](mailto:caffe-coldpress@googlegroups.com) for appropriate projects. - ## Acknowledgements The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. From 8985818e4fbb5fc207e4f383c63c28d80fd286f2 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:28:24 -0700 Subject: [PATCH 173/183] track publications by google scholar and not the wiki --- docs/index.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 0e21ae82..3385747c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -77,8 +77,7 @@ Please cite Caffe in your publications if it helps your research: Year = {2014} } -If you do publish a paper where Caffe helped your research, we encourage you to update the [publications wiki](https://github.com/BVLC/caffe/wiki/Publications). -Citations are also tracked automatically by [Google Scholar](http://scholar.google.com/scholar?oi=bibs&hl=en&cites=17333247995453974016). 
+If you do publish a paper where Caffe helped your research, we encourage you to cite the framework for tracking by [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-ltRSM0AAAAJ:u5HHmVD_uO8C). ## Contacting Us From 8b8f2dd40ba87543f066cb157c6d65dd8187253f Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 15:26:30 -0700 Subject: [PATCH 174/183] link to new full-day crash course --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 3385747c..b633f7cf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,8 +35,8 @@ Join our community of brewers on the [caffe-users group](https://groups.google.c ## Documentation -- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)
-Tutorial presentation.
+- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) and [Caffe in a Day](https://docs.google.com/presentation/d/1HxGdeq8MPktHaPb-rlmYYQ723iWzq9ur6Gjo71YiG0Y/edit#slide=id.gc2fcdcce7_216_0)
+Tutorial presentation of the framework and a full-day crash course.
- [Tutorial Documentation](/tutorial)
Practical guide and framework reference.
- [arXiv / ACM MM '14 paper](http://arxiv.org/abs/1408.5093)
From 49761d34d18b7063af995b13ecca0fee1bdaf02c Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 15:32:50 -0700 Subject: [PATCH 175/183] Caffe 1.0 --- CMakeLists.txt | 4 ++-- Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c52ff466..08f56a33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "1.0.0-rc5" CACHE STRING "Caffe logical version") -set(CAFFE_TARGET_SOVERSION "1.0.0-rc5" CACHE STRING "Caffe soname version") +set(CAFFE_TARGET_VERSION "1.0.0" CACHE STRING "Caffe logical version") +set(CAFFE_TARGET_SOVERSION "1.0.0" CACHE STRING "Caffe soname version") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # ---[ Using cmake scripts and modules diff --git a/Makefile b/Makefile index 77900b69..4d324160 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 1 DYNAMIC_VERSION_MINOR := 0 -DYNAMIC_VERSION_REVISION := 0-rc5 +DYNAMIC_VERSION_REVISION := 0 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) From 33f86122970392fcda19ef80ed5cd349279b896d Mon Sep 17 00:00:00 2001 From: Eric Tzeng Date: Tue, 18 Apr 2017 18:22:38 -0700 Subject: [PATCH 176/183] Rewrite crop cuda kernel --- include/caffe/layers/crop_layer.hpp | 6 +- src/caffe/layers/crop_layer.cpp | 21 +++-- src/caffe/layers/crop_layer.cu | 122 +++++++++++----------------- 3 files changed, 69 insertions(+), 80 deletions(-) diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp index c4fda122..5219fa5c 100644 --- a/include/caffe/layers/crop_layer.hpp +++ b/include/caffe/layers/crop_layer.hpp @@ -41,13 +41,15 @@ class CropLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - vector offsets; + Blob offsets; + Blob src_strides_; + Blob dest_strides_; private: // Recursive copy function. void crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index ef8c177c..65ea8f8b 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -40,8 +40,10 @@ void CropLayer::Reshape(const vector*>& bottom, const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis()); // Initialize offsets to 0 and the new shape to the current shape of the data. - offsets = vector(input_dim, 0); vector new_shape(bottom[0]->shape()); + vector offsets_shape(1, input_dim); + offsets.Reshape(offsets_shape); + int* offset_data = offsets.mutable_cpu_data(); // Determine crop offsets and the new shape post-crop. 
From 33f86122970392fcda19ef80ed5cd349279b896d Mon Sep 17 00:00:00 2001
From: Eric Tzeng
Date: Tue, 18 Apr 2017 18:22:38 -0700
Subject: [PATCH 176/183] Rewrite crop cuda kernel

---
 include/caffe/layers/crop_layer.hpp |   6 +-
 src/caffe/layers/crop_layer.cpp     |  21 +++--
 src/caffe/layers/crop_layer.cu      | 122 +++++++++++-----------------
 3 files changed, 69 insertions(+), 80 deletions(-)

diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp
index c4fda122..5219fa5c 100644
--- a/include/caffe/layers/crop_layer.hpp
+++ b/include/caffe/layers/crop_layer.hpp
@@ -41,13 +41,15 @@ class CropLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-  vector<int> offsets;
+  Blob<int> offsets;
+  Blob<int> src_strides_;
+  Blob<int> dest_strides_;
 
  private:
   // Recursive copy function.
   void crop_copy(const vector<Blob<Dtype>*>& bottom,
                const vector<Blob<Dtype>*>& top,
-               const vector<int>& offsets,
+               const int* offsets,
                vector<int> indices,
                int cur_dim,
                const Dtype* src_data,
diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp
index ef8c177c..65ea8f8b 100644
--- a/src/caffe/layers/crop_layer.cpp
+++ b/src/caffe/layers/crop_layer.cpp
@@ -40,8 +40,10 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis());
 
   // Initialize offsets to 0 and the new shape to the current shape of the data.
-  offsets = vector<int>(input_dim, 0);
   vector<int> new_shape(bottom[0]->shape());
+  vector<int> offsets_shape(1, input_dim);
+  offsets.Reshape(offsets_shape);
+  int* offset_data = offsets.mutable_cpu_data();
 
   // Determine crop offsets and the new shape post-crop.
   for (int i = 0; i < input_dim; ++i) {
@@ -63,15 +65,22 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
         << "size " << bottom[1]->shape(i) << " and offset " << crop_offset;
     }
     new_shape[i] = new_size;
-    offsets[i] = crop_offset;
+    offset_data[i] = crop_offset;
   }
   top[0]->Reshape(new_shape);
+  // Compute strides
+  src_strides_.Reshape(offsets_shape);
+  dest_strides_.Reshape(offsets_shape);
+  for (int i = 0; i < input_dim; ++i) {
+    src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim);
+    dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim);
+  }
 }
 
 template <typename Dtype>
 void CropLayer<Dtype>::crop_copy(const vector<Blob<Dtype>*>& bottom,
              const vector<Blob<Dtype>*>& top,
-             const vector<int>& offsets,
+             const int* offsets,
              vector<int> indices,
              int cur_dim,
              const Dtype* src_data,
@@ -115,7 +124,8 @@ void CropLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   std::vector<int> indices(top[0]->num_axes(), 0);
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
-  crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data,
+      true);
 }
 
 template <typename Dtype>
@@ -127,7 +137,8 @@ void CropLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   if (propagate_down[0]) {
     caffe_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
     std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false);
+    crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff,
+        bottom_diff, false);
   }
 }
 
diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu
index 677077cd..a400f333 100644
--- a/src/caffe/layers/crop_layer.cu
+++ b/src/caffe/layers/crop_layer.cu
@@ -4,90 +4,62 @@
 
 namespace caffe {
 
-// Copy (one line per thread) from one array to another, with arbitrary
-// strides in the last two dimensions.
+__device__ int compute_uncropped_index(
+    int index,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets) {
+  int dest_index = index;
+  int src_index = 0;
+  for (int i = 0; i < ndims; ++i) {
+    int coord = dest_index / dest_strides[i];
+    dest_index -= coord * dest_strides[i];
+    src_index += src_strides[i] * (coord + offsets[i]);
+  }
+  return src_index;
+}
+
 template <typename Dtype>
-__global__ void copy_kernel(const int n, const int height, const int width,
-    const int src_inner_stride,
-    const int dest_inner_stride,
+__global__ void crop_kernel_forward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
     const Dtype* src, Dtype* dest) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int src_start = index * src_inner_stride;
-    int dest_start = index * dest_inner_stride;
-    for (int i = 0; i < width; ++i) {
-      dest[dest_start + i] = src[src_start + i];
-    }
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    dest[index] = src[src_index];
   }
 }
 
 template <typename Dtype>
-void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
-             const vector<Blob<Dtype>*>& top,
-             const vector<int>& offsets,
-             vector<int> indices,
-             int cur_dim,
-             const Dtype* src_data,
-             Dtype* dest_data,
-             bool is_forward) {
-  if (cur_dim + 2 < top[0]->num_axes()) {
-    // We are not yet at the final dimension, call copy recursivley
-    for (int i = 0; i < top[0]->shape(cur_dim); ++i) {
-      indices[cur_dim] = i;
-      crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1,
-                    src_data, dest_data, is_forward);
-    }
-  } else {
-    // We are at the last two dimensions, which are stored continuously in
-    // memory. With (N,C,H,W)
-    //      (0,1,2,3) cur_dim -> H
-    //                cur_dim+1 -> W
-    const int lines = top[0]->shape(cur_dim);
-    const int height = top[0]->shape(cur_dim);
-    const int width = top[0]->shape(cur_dim+1);
-    std::vector<int> ind_off(cur_dim+2, 0);
-    for (int j = 0; j < cur_dim; ++j) {
-      ind_off[j] = indices[j] + offsets[j];
-    }
-    ind_off[cur_dim] = offsets[cur_dim];
-    ind_off[cur_dim+1] = offsets[cur_dim+1];
-    // Compute copy strides
-    const int src_inner_stride = bottom[0]->shape(cur_dim+1);
-    const int dest_inner_stride = top[0]->shape(cur_dim+1);
-
-    if (is_forward) {
-      const Dtype* bottom_data = bottom[0]->gpu_data() +
-          bottom[0]->offset(ind_off);
-      Dtype* top_data = top[0]->mutable_gpu_data() +
-          top[0]->offset(indices);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          src_inner_stride,
-          dest_inner_stride,
-          bottom_data, top_data);
-
-    } else {
-      const Dtype* top_diff = top[0]->gpu_diff() +
-          top[0]->offset(indices);
-      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() +
-          bottom[0]->offset(ind_off);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          dest_inner_stride,
-          src_inner_stride,
-          top_diff, bottom_diff);
-    }
+__global__ void crop_kernel_backward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
+    Dtype* src, const Dtype* dest) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    src[src_index] = dest[index];
   }
 }
 
 template <typename Dtype>
 void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  std::vector<int> indices(top[0]->num_axes(), 0);
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  int n = top[0]->count();
+  crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+      bottom[0]->num_axes(),
+      src_strides_.gpu_data(),
+      dest_strides_.gpu_data(),
+      offsets.gpu_data(),
+      bottom_data, top_data);
 }
 
 template <typename Dtype>
@@ -95,12 +67,16 @@ void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* top_diff = top[0]->gpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  int n = top[0]->count();
   if (propagate_down[0]) {
     caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
-    std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff,
-        false);
+    crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+        bottom[0]->num_axes(),
+        src_strides_.gpu_data(),
+        dest_strides_.gpu_data(),
+        offsets.gpu_data(),
+        bottom_diff, top_diff);
   }
 }
 

From cd1696d00b995a1d8567cb6f3ad7f65ec4df4176 Mon Sep 17 00:00:00 2001
From: Eric Tzeng
Date: Tue, 18 Apr 2017 18:48:26 -0700
Subject: [PATCH 177/183] Fix crop layer lint errors

---
 src/caffe/layers/crop_layer.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu
index a400f333..4ece9cd1 100644
--- a/src/caffe/layers/crop_layer.cu
+++ b/src/caffe/layers/crop_layer.cu
@@ -54,6 +54,7 @@ void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
   int n = top[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
   crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
       bottom[0]->num_axes(),
       src_strides_.gpu_data(),
       dest_strides_.gpu_data(),
@@ -71,6 +72,7 @@ void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (propagate_down[0]) {
     caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
+    // NOLINT_NEXT_LINE(whitespace/operators)
    crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
        bottom[0]->num_axes(),
        src_strides_.gpu_data(),
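The rewrite in the two patches above replaces the recursive, two-innermost-axes copy with one flat kernel whose `compute_uncropped_index` performs mixed-radix decoding: each flattened output index is peeled into per-axis coordinates with the destination strides, shifted by the crop offsets, and re-encoded with the source strides. A host-side sketch of the same arithmetic, using hypothetical 2D shapes rather than anything from the patch:

```
// Host-side mirror of the device function above, checked on a small case:
// crop a 2x3 window out of a 4x5 source starting at offset (row 1, col 2).
#include <iostream>

int compute_uncropped_index(int index, int ndims, const int* src_strides,
                            const int* dest_strides, const int* offsets) {
  int dest_index = index;
  int src_index = 0;
  for (int i = 0; i < ndims; ++i) {
    int coord = dest_index / dest_strides[i];  // coordinate along axis i
    dest_index -= coord * dest_strides[i];     // remainder for later axes
    src_index += src_strides[i] * (coord + offsets[i]);
  }
  return src_index;
}

int main() {
  const int src_strides[] = {5, 1};   // 4x5 source
  const int dest_strides[] = {3, 1};  // 2x3 crop
  const int offsets[] = {1, 2};       // window starts at row 1, col 2
  // Crop index 4 -> coords (1, 1) -> source coords (2, 3) -> index 13.
  std::cout << compute_uncropped_index(4, 2, src_strides, dest_strides,
                                       offsets) << std::endl;  // prints 13
  return 0;
}
```

One thread per output element then suffices for any number of axes, which is why the recursive `crop_copy_gpu` and its per-line `copy_kernel` could be deleted.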
From ec35395e131a0d5e7c55cbd74dadbd46a49a645c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Malte=20St=C3=A6r=20Nissen?=
Date: Thu, 4 May 2017 14:33:40 +0200
Subject: [PATCH 178/183] Handling destruction of empty Net objects

---
 matlab/+caffe/Net.m | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m
index 349e060e..bb99ec89 100644
--- a/matlab/+caffe/Net.m
+++ b/matlab/+caffe/Net.m
@@ -69,7 +69,9 @@
       self.blob_names = self.attributes.blob_names;
     end
     function delete (self)
-      caffe_('delete_net', self.hNet_self);
+      if ~isempty(self.hNet_self)
+        caffe_('delete_net', self.hNet_self);
+      end
     end
     function layer = layers(self, layer_name)
       CHECK(ischar(layer_name), 'layer_name must be a string');

From b7e2b99c7f0aeeb8e24046f8cbf5212065b9ccdf Mon Sep 17 00:00:00 2001
From: Luke Yeager
Date: Fri, 12 May 2017 10:06:51 -0700
Subject: [PATCH 179/183] Downgrade boost requirement from 1.55 to 1.54

---
 cmake/Dependencies.cmake       | 2 +-
 scripts/travis/install-deps.sh | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 4a5bac47..c48255c8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "")
 set(Caffe_COMPILE_OPTIONS "")
 
 # ---[ Boost
-find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem)
+find_package(Boost 1.54 REQUIRED COMPONENTS system thread filesystem)
 list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS})
 list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES})
 
diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh
index dac5d2f9..2fa2a74a 100755
--- a/scripts/travis/install-deps.sh
+++ b/scripts/travis/install-deps.sh
@@ -9,10 +9,10 @@ apt-get -y update
 apt-get install -y --no-install-recommends \
   build-essential \
   graphviz \
-  libboost-filesystem1.55-dev \
-  libboost-python1.55-dev \
-  libboost-system1.55-dev \
-  libboost-thread1.55-dev \
+  libboost-filesystem-dev \
+  libboost-python-dev \
+  libboost-system-dev \
+  libboost-thread-dev \
   libgflags-dev \
  libgoogle-glog-dev \
  libhdf5-serial-dev \
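The lowered `find_package(Boost 1.54 REQUIRED ...)` floor above can also be asserted from C++. A hedged sketch using only the header-only `<boost/version.hpp>` (`BOOST_VERSION` encodes major*100000 + minor*100 + patch, so 1.54.0 is 105400); compile with `-std=c++11` or later:

```
// boost_check.cpp: fail compilation if Boost is older than the 1.54 floor.
#include <iostream>
#include <boost/version.hpp>

int main() {
  static_assert(BOOST_VERSION >= 105400, "Caffe needs Boost 1.54 or newer");
  std::cout << "Boost " << BOOST_VERSION / 100000 << "."        // major
            << BOOST_VERSION / 100 % 1000 << "."                // minor
            << BOOST_VERSION % 100 << std::endl;                // patch
  return 0;
}
```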
From 30a2ab7e50430911f37ddf981e67e4f36f662f14 Mon Sep 17 00:00:00 2001
From: Zhou Mo
Date: Mon, 15 May 2017 02:16:19 +0000
Subject: [PATCH 180/183] cmake: rename libproto.a -> libcaffeproto.a

---
 cmake/ConfigGen.cmake    |  2 +-
 src/caffe/CMakeLists.txt | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
index ad91f542..09bb09b4 100644
--- a/cmake/ConfigGen.cmake
+++ b/cmake/ConfigGen.cmake
@@ -33,7 +33,7 @@ function(caffe_generate_export_configs)
   configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)
 
   # Add targets to the build-tree export set
-  export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
+  export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
   export(PACKAGE Caffe)
 
   # ---[ Configure install-tree CaffeConfig.cmake file ]---
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
index b9152e92..4a805568 100644
--- a/src/caffe/CMakeLists.txt
+++ b/src/caffe/CMakeLists.txt
@@ -3,12 +3,12 @@ file(GLOB proto_files proto/*.proto)
 caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python
                                ${proto_files})
 
 # include python files either to force generation
-add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-caffe_default_properties(proto)
-target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES})
-target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR})
+add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+caffe_default_properties(caffeproto)
+target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES})
+target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR})
 
-list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend!
+list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend!
 
 # --[ Caffe library
 
@@ -42,7 +42,7 @@ set_target_properties(caffe PROPERTIES
 # ---[ Install
 install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto)
-install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
+install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
 list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)

From 83814da36d5a44039ddc35f58f9b341e9d1bd935 Mon Sep 17 00:00:00 2001
From: Zhou Mo
Date: Mon, 15 May 2017 03:04:47 +0000
Subject: [PATCH 181/183] docs/debian guide: update compiler combination table

---
 docs/install_apt_debian.md | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
index 65fe7092..bd91124a 100644
--- a/docs/install_apt_debian.md
+++ b/docs/install_apt_debian.md
@@ -96,18 +96,22 @@ Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
 Some users may find their favorite compiler doesn't work with CUDA.
 
 ```
-CXX compiler | CUDA 7.5   | CUDA 8.0   |
--------------+------------+------------+-
-GCC-7        | ?          | ?          |
-GCC-6        | ✘          | ✘          |
-GCC-5        | ✔ [1]      | ✔          |
-CLANG-4.0    | ?          | ?          |
-CLANG-3.9    | ✘          | ✘          |
-CLANG-3.8    | ?          | ✔          |
+CXX compiler | CUDA 7.5   | CUDA 8.0   | CUDA 9.0   |
+-------------+------------+------------+------------+
+GCC-8        | ?          | ?          | ?          |
+GCC-7        | ?          | ?          | ?          |
+GCC-6        | ✘          | ✘          | ✔          |
+GCC-5        | ✔ [1]      | ✔          | ✔          |
+-------------+------------+------------+------------+
+CLANG-4.0    | ?          | ?          | ?          |
+CLANG-3.9    | ✘          | ✘          | ✔          |
+CLANG-3.8    | ?          | ✔          | ✔          |
 ```
 
 `[1]` CUDA 7.5 's `host_config.h` must be patched before working with GCC-5.
 
+`[2]` CUDA 9.0: https://devblogs.nvidia.com/parallelforall/cuda-9-features-revealed/
+
 BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not
 compatible with GCC-5's. You may encounter failure linking GCC-4.X
 object files against GCC-5 libraries. (See https://wiki.debian.org/GCC5 )
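Footnote `[1]` in the table above alludes to the `#error` guards that nvcc's `host_config.h` places on unsupported host compilers. A simplified guard of the same shape, illustrative only (the real header's version bounds and wording differ per CUDA release):

```
// Illustrative approximation of a host_config.h compiler guard; the exact
// conditions in the real CUDA header are release-specific.
#if defined(__GNUC__) && (__GNUC__ > 5)
#error "unsupported GNU version! gcc versions later than 5 are not supported"
#endif
```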
From 264cf199e4e8bc44bb97762b1018137704157c2c Mon Sep 17 00:00:00 2001
From: Cyprien Noel
Date: Tue, 13 Jun 2017 11:59:26 -0700
Subject: [PATCH 182/183] List branches in readme

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 0ae3616b..c40aee65 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,14 @@ Check out the [project site](http://caffe.berkeleyvision.org) for all the detail
 and step-by-step examples.
 
+## Custom distributions
+
+- [Intel optimized branch](https://github.com/BVLC/caffe/tree/intel) for CPU, in particular Xeon processors (HSW, BDW, Xeon Phi).
+- [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices.
+- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows)
+
+## Community
+
 [![Join the chat at https://gitter.im/BVLC/caffe](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/BVLC/caffe?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
 Please join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) or [gitter chat](https://gitter.im/BVLC/caffe) to ask questions and talk about methods and models.

From 4efdf7ee49cffefdd7ea099c00dc5ea327640f04 Mon Sep 17 00:00:00 2001
From: Cyprien Noel
Date: Tue, 20 Jun 2017 14:20:42 -0700
Subject: [PATCH 183/183] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c40aee65..5148c69d 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ and step-by-step examples.
 
 ## Custom distributions
 
-- [Intel optimized branch](https://github.com/BVLC/caffe/tree/intel) for CPU, in particular Xeon processors (HSW, BDW, Xeon Phi).
+- [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, Xeon Phi).
 - [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices.
 - [Windows Caffe](https://github.com/BVLC/caffe/tree/windows)