From 6965746a6323dc0cfe71590c6a38f2c36843a610 Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Mon, 11 Aug 2014 21:38:59 -0700 Subject: [PATCH 01/26] zero-init param diffs and accumulate gradients (With layers whose backwards accumulate gradients), this effectively decouples the computational batch from the SGD minibatch. Each iteration accumulates gradients over iter_size batches, then parameters are updated. --- src/caffe/proto/caffe.proto | 3 ++- src/caffe/solver.cpp | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5b21cf20028..fc118fa66d8 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -88,7 +88,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 36 (last added: clip_gradients) +// SolverParameter next available ID: 37 (last added: iter_size) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -141,6 +141,7 @@ message SolverParameter { // Display the loss averaged over the last average_loss iterations optional int32 average_loss = 33 [default = 1]; optional int32 max_iter = 7; // the maximum number of iterations + optional int32 iter_size = 36 [default = 1]; optional string lr_policy = 8; // The learning rate decay policy. optional float gamma = 9; // The parameter to compute the learning rate. optional float power = 10; // The parameter to compute the learning rate. 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 096980dd7af..538fad9c3ab 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -168,6 +168,25 @@ void Solver::Step(int iters) { Dtype smoothed_loss = 0; for (; iter_ < stop_iter; ++iter_) { + // zero-init the params + for (int i = 0; i < net_->params().size(); ++i) { + shared_ptr > blob = net_->params()[i]; + switch(Caffe::mode()) { + case Caffe::CPU: + caffe_set(blob->count(), static_cast(0), + blob->mutable_cpu_diff()); + break; + case Caffe::GPU: +#ifndef CPU_ONLY + caffe_gpu_set(blob->count(), static_cast(0), + blob->mutable_gpu_diff()); +#else + NO_GPU; +#endif + break; + } + } + if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization())) { TestAll(); @@ -175,7 +194,13 @@ void Solver::Step(int iters) { const bool display = param_.display() && iter_ % param_.display() == 0; net_->set_debug_info(display && param_.debug_info()); - Dtype loss = net_->ForwardBackward(bottom_vec); + // accumulate the loss and gradient + Dtype loss = 0; + for (int i = 0; i < param_.iter_size(); ++i) { + loss += net_->ForwardBackward(bottom_vec); + } + loss /= param_.iter_size(); + // average the loss across iterations for smoothed reporting if (losses.size() < average_loss) { losses.push_back(loss); int size = losses.size(); @@ -471,7 +496,8 @@ void SGDSolver::ComputeUpdateValue() { case Caffe::CPU: for (int param_id = 0; param_id < net_params.size(); ++param_id) { // Compute the value to history, and then copy them to the blob's diff. 
- Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_rate = rate * net_params_lr[param_id] + / this->param_.iter_size(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { @@ -507,7 +533,8 @@ void SGDSolver::ComputeUpdateValue() { #ifndef CPU_ONLY for (int param_id = 0; param_id < net_params.size(); ++param_id) { // Compute the value to history, and then copy them to the blob's diff. - Dtype local_rate = rate * net_params_lr[param_id]; + Dtype local_rate = rate * net_params_lr[param_id] + / this->param_.iter_size(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; if (local_decay) { From 37bddae5017e8778d26d60f4c7deb0eea54dae8d Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Tue, 30 Dec 2014 22:52:07 -0800 Subject: [PATCH 02/26] zero-init param diffs in gradient checker --- include/caffe/test/test_gradient_check_util.hpp | 7 +++++-- src/caffe/solver.cpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index 22937711b58..cc5dcbad0ee 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -80,11 +80,14 @@ void GradientChecker::CheckGradientSingle(Layer* layer, CHECK_EQ(top_count, bottom[blob_id]->count()); } } - // First, figure out what blobs we need to check against. + // First, figure out what blobs we need to check against, and zero init + // parameter blobs. 
vector*> blobs_to_check; vector propagate_down(bottom.size(), check_bottom < 0); for (int i = 0; i < layer->blobs().size(); ++i) { - blobs_to_check.push_back(layer->blobs()[i].get()); + Blob* blob = layer->blobs()[i].get(); + caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); + blobs_to_check.push_back(blob); } if (check_bottom < 0) { for (int i = 0; i < bottom.size(); ++i) { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 538fad9c3ab..8e2d2a83a1f 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -171,7 +171,7 @@ void Solver::Step(int iters) { // zero-init the params for (int i = 0; i < net_->params().size(); ++i) { shared_ptr > blob = net_->params()[i]; - switch(Caffe::mode()) { + switch (Caffe::mode()) { case Caffe::CPU: caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); From 05185e26013db9cb9eae7b85beec5701e8a71995 Mon Sep 17 00:00:00 2001 From: Sergio Date: Fri, 26 Sep 2014 23:03:26 -0700 Subject: [PATCH 03/26] accumulate gradients in inner product layer --- src/caffe/layers/inner_product_layer.cpp | 4 ++-- src/caffe/layers/inner_product_layer.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index 89e0c8fbad7..83c3235eb71 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -101,13 +101,13 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff()); + top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bias caffe_cpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - 
bias_multiplier_.cpu_data(), (Dtype)0., + bias_multiplier_.cpu_data(), (Dtype)1., this->blobs_[1]->mutable_cpu_diff()); } if (propagate_down[0]) { diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu index a9e1784a205..dd90cac12a8 100644 --- a/src/caffe/layers/inner_product_layer.cu +++ b/src/caffe/layers/inner_product_layer.cu @@ -33,13 +33,13 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff()); + top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bias caffe_gpu_gemv(CblasTrans, M_, N_, (Dtype)1., top_diff, - bias_multiplier_.gpu_data(), (Dtype)0., + bias_multiplier_.gpu_data(), (Dtype)1., this->blobs_[1]->mutable_gpu_diff()); } if (propagate_down[0]) { From c8f0bbe3812fa7a8d6f6f2424d23591fbcb4acdf Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Tue, 30 Dec 2014 22:29:35 -0800 Subject: [PATCH 04/26] accumulate gradients in (de)conv layers --- src/caffe/layers/conv_layer.cpp | 7 ------- src/caffe/layers/conv_layer.cu | 7 ------- src/caffe/layers/deconv_layer.cpp | 7 ------- src/caffe/layers/deconv_layer.cu | 7 ------- 4 files changed, 28 deletions(-) diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp index c0c9f6f3371..928ef5ee468 100644 --- a/src/caffe/layers/conv_layer.cpp +++ b/src/caffe/layers/conv_layer.cpp @@ -39,13 +39,6 @@ void ConvolutionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - if (this->param_propagate_down_[0]) { - 
caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_cpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); const Dtype* bottom_data = bottom[i]->cpu_data(); diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu index 3902fdf3930..b8a98ff7cc9 100644 --- a/src/caffe/layers/conv_layer.cu +++ b/src/caffe/layers/conv_layer.cu @@ -31,13 +31,6 @@ void ConvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_gpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); // Bias gradient, if necessary. 
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index e6d65ab526b..a4612963b6b 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -39,13 +39,6 @@ void DeconvolutionLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->cpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_cpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->cpu_diff(); const Dtype* bottom_data = bottom[i]->cpu_data(); diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu index 9198dd64c72..39bc4de8c66 100644 --- a/src/caffe/layers/deconv_layer.cu +++ b/src/caffe/layers/deconv_layer.cu @@ -31,13 +31,6 @@ void DeconvolutionLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* weight = this->blobs_[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); - if (this->param_propagate_down_[0]) { - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); - } - if (this->bias_term_ && this->param_propagate_down_[1]) { - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), - this->blobs_[1]->mutable_gpu_diff()); - } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); const Dtype* bottom_data = bottom[i]->gpu_data(); From 2983572eba08517fb65c58d40cd82521ddf605ad Mon Sep 17 00:00:00 2001 From: Jonathan L Long Date: Sat, 13 Sep 2014 17:41:59 -0700 Subject: [PATCH 05/26] accumulate gradients in cudnn conv layer --- src/caffe/layers/cudnn_conv_layer.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/caffe/layers/cudnn_conv_layer.cu 
b/src/caffe/layers/cudnn_conv_layer.cu index 071014e1b48..b5bfdb098e0 100644 --- a/src/caffe/layers/cudnn_conv_layer.cu +++ b/src/caffe/layers/cudnn_conv_layer.cu @@ -54,12 +54,10 @@ void CuDNNConvolutionLayer::Backward_gpu(const vector*>& top, if (this->param_propagate_down_[0]) { weight = this->blobs_[0]->gpu_data(); weight_diff = this->blobs_[0]->mutable_gpu_diff(); - caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff); } Dtype* bias_diff = NULL; if (this->bias_term_ && this->param_propagate_down_[1]) { bias_diff = this->blobs_[1]->mutable_gpu_diff(); - caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff); } for (int i = 0; i < top.size(); ++i) { const Dtype* top_diff = top[i]->gpu_diff(); From ea39cb54df13f220cfa53de983a685ad7ad4eab7 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 22:21:13 -0800 Subject: [PATCH 06/26] Add gpu_util.cuh, with caffe_gpu_atomic_add (double impl from NVIDIA dev docs; float impl included in CUDA as "atomicAdd") --- include/caffe/util/gpu_util.cuh | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/caffe/util/gpu_util.cuh diff --git a/include/caffe/util/gpu_util.cuh b/include/caffe/util/gpu_util.cuh new file mode 100644 index 00000000000..994202f2a1a --- /dev/null +++ b/include/caffe/util/gpu_util.cuh @@ -0,0 +1,35 @@ +#ifndef CAFFE_UTIL_GPU_UTIL_H_ +#define CAFFE_UTIL_GPU_UTIL_H_ + +namespace caffe { + +template +inline __device__ Dtype caffe_gpu_atomic_add(const Dtype val, Dtype* address); + +template <> +inline __device__ +float caffe_gpu_atomic_add(const float val, float* address) { + return atomicAdd(address, val); +} + +// double atomicAdd implementation taken from: +// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3PVCpVsEG +template <> +inline __device__ +double caffe_gpu_atomic_add(const double val, double* address) { + unsigned long long int* address_as_ull = // NOLINT(runtime/int) + // NOLINT_NEXT_LINE(runtime/int) + 
reinterpret_cast(address); + unsigned long long int old = *address_as_ull; // NOLINT(runtime/int) + unsigned long long int assumed; // NOLINT(runtime/int) + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +} // namespace caffe + +#endif // CAFFE_UTIL_GPU_UTIL_H_ From 0c19b6b94e63d6fe43831a236f28e81d277bd220 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 14:23:34 -0800 Subject: [PATCH 07/26] test_gradient_check_util: check_bottom < -1 only checks params --- include/caffe/test/test_gradient_check_util.hpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp index cc5dcbad0ee..25f35d1589e 100644 --- a/include/caffe/test/test_gradient_check_util.hpp +++ b/include/caffe/test/test_gradient_check_util.hpp @@ -45,6 +45,10 @@ class GradientChecker { void CheckGradientEltwise(Layer* layer, const vector*>& bottom, const vector*>& top); + // Checks the gradient of a single output with respect to particular input + // blob(s). If check_bottom = i >= 0, check only the ith bottom Blob. + // If check_bottom == -1, check everything -- all bottom Blobs and all + // param Blobs. Otherwise (if check_bottom < -1), check only param Blobs. void CheckGradientSingle(Layer* layer, const vector*>& bottom, const vector*>& top, int check_bottom, int top_id, int top_data_id, bool element_wise = false); @@ -83,21 +87,22 @@ void GradientChecker::CheckGradientSingle(Layer* layer, // First, figure out what blobs we need to check against, and zero init // parameter blobs. 
vector*> blobs_to_check; - vector propagate_down(bottom.size(), check_bottom < 0); + vector propagate_down(bottom.size(), check_bottom == -1); for (int i = 0; i < layer->blobs().size(); ++i) { Blob* blob = layer->blobs()[i].get(); caffe_set(blob->count(), static_cast(0), blob->mutable_cpu_diff()); blobs_to_check.push_back(blob); } - if (check_bottom < 0) { + if (check_bottom == -1) { for (int i = 0; i < bottom.size(); ++i) { blobs_to_check.push_back(bottom[i]); } - } else { + } else if (check_bottom >= 0) { CHECK_LT(check_bottom, bottom.size()); blobs_to_check.push_back(bottom[check_bottom]); propagate_down[check_bottom] = true; } + CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check."; // Compute the gradient analytically using Backward Caffe::set_random_seed(seed_); // Ignore the loss from the layer (it's just the weighted sum of the losses From bb8a91d3a917e2746d49b1fae431c82be4ba4d27 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 16:00:04 -0800 Subject: [PATCH 08/26] Add EmbedLayer for inner products with sparse input (one-hot vectors), with unit tests --- include/caffe/common_layers.hpp | 38 ++++++ src/caffe/layers/embed_layer.cpp | 122 +++++++++++++++++++ src/caffe/layers/embed_layer.cu | 80 ++++++++++++ src/caffe/proto/caffe.proto | 17 ++- src/caffe/test/test_embed_layer.cpp | 183 ++++++++++++++++++++++++++++ 5 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 src/caffe/layers/embed_layer.cpp create mode 100644 src/caffe/layers/embed_layer.cu create mode 100644 src/caffe/test/test_embed_layer.cpp diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index b1ac3a93eff..7e0046107c3 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -180,6 +180,44 @@ class EltwiseLayer : public Layer { bool stable_prod_grad_; }; +/** + * @brief A layer for learning "embeddings" of one-hot vector input. 
+ * Equivalent to an InnerProductLayer with one-hot vectors as input, but + * for efficiency the input is the "hot" index of each column itself. + * + * TODO(dox): thorough documentation for Forward, Backward, and proto params. + */ +template +class EmbedLayer : public Layer { + public: + explicit EmbedLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Embed"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int M_; + int K_; + int N_; + bool bias_term_; + Blob bias_multiplier_; +}; + /** * @brief Reshapes the input Blob into flat vectors. 
* diff --git a/src/caffe/layers/embed_layer.cpp b/src/caffe/layers/embed_layer.cpp new file mode 100644 index 00000000000..be6b2cd2727 --- /dev/null +++ b/src/caffe/layers/embed_layer.cpp @@ -0,0 +1,122 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/common_layers.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void EmbedLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + N_ = this->layer_param_.embed_param().num_output(); + CHECK_GT(N_, 0) << "EmbedLayer num_output must be positive."; + K_ = this->layer_param_.embed_param().input_dim(); + CHECK_GT(K_, 0) << "EmbedLayer input_dim must be positive."; + bias_term_ = this->layer_param_.embed_param().bias_term(); + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + if (bias_term_) { + this->blobs_.resize(2); + } else { + this->blobs_.resize(1); + } + // Initialize the weights -- + // transposed from InnerProductLayer for spatial locality. 
+ vector weight_shape(2); + weight_shape[0] = K_; + weight_shape[1] = N_; + this->blobs_[0].reset(new Blob(weight_shape)); + // fill the weights + shared_ptr > weight_filler(GetFiller( + this->layer_param_.embed_param().weight_filler())); + weight_filler->Fill(this->blobs_[0].get()); + // If necessary, initialize and fill the bias term + if (bias_term_) { + vector bias_shape(1, N_); + this->blobs_[1].reset(new Blob(bias_shape)); + shared_ptr > bias_filler(GetFiller( + this->layer_param_.embed_param().bias_filler())); + bias_filler->Fill(this->blobs_[1].get()); + } + } // parameter initialization + this->param_propagate_down_.resize(this->blobs_.size(), true); +} + +template +void EmbedLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + // Figure out the dimensions + M_ = bottom[0]->count(); + vector top_shape = bottom[0]->shape(); + top_shape.push_back(N_); + top[0]->Reshape(top_shape); + // Set up the bias multiplier + if (bias_term_) { + vector bias_shape(1, M_); + bias_multiplier_.Reshape(bias_shape); + caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data()); + } +} + +template +void EmbedLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* weight = this->blobs_[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int index; + for (int n = 0; n < M_; ++n) { + index = static_cast(bottom_data[n]); + DCHECK_GE(index, 0); + DCHECK_LT(index, K_); + DCHECK_EQ(static_cast(index), bottom_data[n]) << "non-integer input"; + caffe_copy(N_, weight + index * N_, top_data + n * N_); + } + if (bias_term_) { + const Dtype* bias = this->blobs_[1]->cpu_data(); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), + bias_multiplier_.cpu_data(), bias, Dtype(1), top_data); + } +} + +template +void EmbedLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[0]) << "Can't backpropagate to 
EmbedLayer input."; + if (this->param_propagate_down_[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + // Gradient with respect to weight + Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff(); + int index; + for (int n = 0; n < M_; ++n) { + index = static_cast(bottom_data[n]); + DCHECK_GE(index, 0); + DCHECK_LT(index, K_); + DCHECK_EQ(static_cast(index), bottom_data[n]) + << "non-integer input"; + caffe_axpy(N_, Dtype(1), top_diff + n * N_, weight_diff + index * N_); + } + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff(); + caffe_cpu_gemv(CblasTrans, M_, N_, Dtype(1), top_diff, + bias_multiplier_.cpu_data(), Dtype(1), bias_diff); + } +} + +#ifdef CPU_ONLY +STUB_GPU(EmbedLayer); +#endif + +INSTANTIATE_CLASS(EmbedLayer); +REGISTER_LAYER_CLASS(Embed); + +} // namespace caffe diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu new file mode 100644 index 00000000000..37a4f7e35cd --- /dev/null +++ b/src/caffe/layers/embed_layer.cu @@ -0,0 +1,80 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/common_layers.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +__global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, + const Dtype* weight, const int M, const int N, const int K, + Dtype* top_data) { + CUDA_KERNEL_LOOP(top_index, nthreads) { + const int n = top_index / N; + const int d = top_index % N; + const int index = static_cast(bottom_data[n]); + const int weight_index = index * N + d; + top_data[top_index] = weight[weight_index]; + } +} + +template +__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, + const Dtype* top_diff, const int M, const int N, const int K, + Dtype* weight_diff) { + 
CUDA_KERNEL_LOOP(weight_index, nthreads) { + const int index = weight_index / N; + const int output_index = weight_index % N; + for (int n = 0; n < M; ++n) { + if (static_cast(bottom_data[n]) == index) { + weight_diff[weight_index] += top_diff[n * N + output_index]; + } + } + } +} + +template +void EmbedLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* weight = this->blobs_[0]->gpu_data(); + const int count = top[0]->count(); + EmbedForward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, bottom_data, weight, M_, N_, K_, top_data); + if (bias_term_) { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1), + bias_multiplier_.gpu_data(), + this->blobs_[1]->gpu_data(), Dtype(1), top_data); + } +} + +template +void EmbedLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; + if (this->param_propagate_down_[0]) { + const int count = this->blobs_[0]->count(); + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); + EmbedBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, bottom_data, top_diff, M_, N_, K_, weight_diff); + } + if (bias_term_ && this->param_propagate_down_[1]) { + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff(); + caffe_gpu_gemv(CblasTrans, M_, N_, Dtype(1), top_diff, + bias_multiplier_.gpu_data(), Dtype(1), bias_diff); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(EmbedLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index fc118fa66d8..46dda408d1d 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // 
Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 132 (last added: prelu_param) +// LayerParameter next available layer-specific ID: 133 (last added: embed_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -312,6 +312,7 @@ message LayerParameter { optional DropoutParameter dropout_param = 108; optional DummyDataParameter dummy_data_param = 109; optional EltwiseParameter eltwise_param = 110; + optional EmbedParameter embed_param = 132; optional ExpParameter exp_param = 111; optional HDF5DataParameter hdf5_data_param = 112; optional HDF5OutputParameter hdf5_output_param = 113; @@ -503,6 +504,20 @@ message EltwiseParameter { optional bool stable_prod_grad = 3 [default = true]; } +// Message that stores parameters used by EmbedLayer +message EmbedParameter { + optional uint32 num_output = 1; // The number of outputs for the layer + // The input is given as integers to be interpreted as one-hot + // vector indices with dimension input_dim. Hence input_dim should be + // 1 greater than the maximum possible input value. + optional uint32 input_dim = 2; + + optional bool bias_term = 3 [default = true]; // Whether to use a bias term + optional FillerParameter weight_filler = 4; // The filler for the weight + optional FillerParameter bias_filler = 5; // The filler for the bias + +} + // Message that stores parameters used by ExpLayer message ExpParameter { // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0. 
diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp new file mode 100644 index 00000000000..7a4fb9800f2 --- /dev/null +++ b/src/caffe/test/test_embed_layer.cpp @@ -0,0 +1,183 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +#ifndef CPU_ONLY +extern cudaDeviceProp CAFFE_TEST_CUDA_PROP; +#endif + +template +class EmbedLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + protected: + EmbedLayerTest() + : blob_bottom_(new Blob(4, 1, 1, 1)), + blob_top_(new Blob()) { + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~EmbedLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(EmbedLayerTest, TestDtypesAndDevices); + +TYPED_TEST(EmbedLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + shared_ptr > layer(new EmbedLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_top_->num_axes(), 5); + EXPECT_EQ(this->blob_top_->shape(0), 4); + EXPECT_EQ(this->blob_top_->shape(1), 1); + EXPECT_EQ(this->blob_top_->shape(2), 1); + EXPECT_EQ(this->blob_top_->shape(3), 1); + EXPECT_EQ(this->blob_top_->shape(4), 10); +} + +TYPED_TEST(EmbedLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* 
embed_param = layer_param.mutable_embed_param(); + const int kNumOutput = 10; + const int kInputDim = 5; + embed_param->set_num_output(kNumOutput); + embed_param->set_input_dim(kInputDim); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->set_bias_term(false); + shared_ptr > layer(new EmbedLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer->blobs().size()); + vector weight_shape(2); + weight_shape[0] = kInputDim; + weight_shape[1] = kNumOutput; + ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; + } + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector weight_offset(2, 0); + vector top_offset(5, 0); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); + weight_offset[1] = 0; + top_offset[0] = i; + top_offset[4] = 0; + for (int j = 0; j < kNumOutput; ++j) { + EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset), + this->blob_top_->data_at(top_offset)); + ++top_offset[4]; + ++weight_offset[1]; + } + } +} + +TYPED_TEST(EmbedLayerTest, TestForwardWithBias) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + const int kNumOutput = 10; + const int kInputDim = 5; + embed_param->set_num_output(kNumOutput); + embed_param->set_input_dim(kInputDim); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); + embed_param->set_bias_term(true); + shared_ptr > layer(new EmbedLayer(layer_param)); 
+ layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(2, layer->blobs().size()); + vector weight_shape(2); + weight_shape[0] = kInputDim; + weight_shape[1] = kNumOutput; + ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape()); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim; + } + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector bias_offset(1, 0); + vector weight_offset(2, 0); + vector top_offset(5, 0); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + weight_offset[0] = static_cast(this->blob_bottom_->cpu_data()[i]); + weight_offset[1] = 0; + top_offset[0] = i; + top_offset[4] = 0; + bias_offset[0] = 0; + for (int j = 0; j < kNumOutput; ++j) { + EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) + + layer->blobs()[1]->data_at(bias_offset), + this->blob_top_->data_at(top_offset)); + ++top_offset[4]; + ++weight_offset[1]; + ++bias_offset[0]; + } + } +} + +TYPED_TEST(EmbedLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + embed_param->set_bias_term(false); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + EmbedLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + this->blob_bottom_->mutable_cpu_data()[0] = 4; + this->blob_bottom_->mutable_cpu_data()[1] = 2; + this->blob_bottom_->mutable_cpu_data()[2] = 2; + this->blob_bottom_->mutable_cpu_data()[3] = 3; + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, -2); +} + +TYPED_TEST(EmbedLayerTest, TestGradientWithBias) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EmbedParameter* embed_param = 
layer_param.mutable_embed_param(); + embed_param->set_num_output(10); + embed_param->set_input_dim(5); + embed_param->set_bias_term(true); + embed_param->mutable_weight_filler()->set_type("uniform"); + embed_param->mutable_weight_filler()->set_min(-10); + embed_param->mutable_weight_filler()->set_max(10); + embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler()); + EmbedLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + this->blob_bottom_->mutable_cpu_data()[0] = 4; + this->blob_bottom_->mutable_cpu_data()[1] = 2; + this->blob_bottom_->mutable_cpu_data()[2] = 2; + this->blob_bottom_->mutable_cpu_data()[3] = 3; + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, -2); +} + +} // namespace caffe From 9271abf6768b69362cdfc0043965db60881da08b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 21 Jan 2015 16:12:12 -0800 Subject: [PATCH 09/26] EmbedBackward with no loops -- use caffe_gpu_atomic_add instead --- src/caffe/layers/embed_layer.cu | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu index 37a4f7e35cd..672fb9c608c 100644 --- a/src/caffe/layers/embed_layer.cu +++ b/src/caffe/layers/embed_layer.cu @@ -5,6 +5,7 @@ #include "caffe/common_layers.hpp" #include "caffe/filler.hpp" #include "caffe/layer.hpp" +#include "caffe/util/gpu_util.cuh" #include "caffe/util/math_functions.hpp" namespace caffe { @@ -22,18 +23,21 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, } } +template +__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, + const Dtype* top_diff, const int M, const int N, const int K, + Dtype* weight_diff); + template __global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data, const Dtype* top_diff, const int M, const int N, const int K, Dtype* weight_diff) { - CUDA_KERNEL_LOOP(weight_index, nthreads) { - const 
int index = weight_index / N; - const int output_index = weight_index % N; - for (int n = 0; n < M; ++n) { - if (static_cast(bottom_data[n]) == index) { - weight_diff[weight_index] += top_diff[n * N + output_index]; - } - } + CUDA_KERNEL_LOOP(top_index, nthreads) { + const int n = top_index / N; + const int d = top_index % N; + const int index = static_cast(bottom_data[n]); + const int weight_index = index * N + d; + caffe_gpu_atomic_add(top_diff[top_index], weight_diff + weight_index); } } @@ -59,13 +63,14 @@ void EmbedLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input."; if (this->param_propagate_down_[0]) { + const int top_count = top[0]->count(); const int count = this->blobs_[0]->count(); const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff(); EmbedBackward // NOLINT_NEXT_LINE(whitespace/operators) - <<>>( - count, bottom_data, top_diff, M_, N_, K_, weight_diff); + <<>>( + top_count, bottom_data, top_diff, M_, N_, K_, weight_diff); } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); From 23d5c03163e28f008a7b1613eaefa89806579cba Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 30 Jan 2015 23:19:34 -0800 Subject: [PATCH 10/26] Add (very simple version of) ReshapeLayer --- include/caffe/common_layers.hpp | 47 +++++++++ src/caffe/layers/reshape_layer.cpp | 21 ++++ src/caffe/proto/caffe.proto | 10 +- src/caffe/test/test_reshape_layer.cpp | 138 ++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 src/caffe/layers/reshape_layer.cpp create mode 100644 src/caffe/test/test_reshape_layer.cpp diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 7e0046107c3..cadd6b20e5e 100644 --- a/include/caffe/common_layers.hpp +++ 
b/include/caffe/common_layers.hpp @@ -335,6 +335,53 @@ class MVNLayer : public Layer { Blob sum_multiplier_; }; +/** + * @brief Reshapes an input Blob. + */ +template +class ReshapeLayer : public Layer { + public: + explicit ReshapeLayer(const LayerParameter& param) + : Layer(param) {} + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Reshape"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (D_1 \times D_2 \times ... \times D_m) @f$ + * the inputs + * @param top output Blob vector (length 1) + * -# @f$ (d_1 \times d_2 \times ... \times d_n) @f$, + * the outputs -- i.e., the (virtually) copied inputs. + * The shape is specified by reshape_param.shape(), and the + * product of the dimensions in the new shape must match that of the + * input shape; i.e., @f$ d_1 d_2 ... d_n = D_1 D_2 ... D_m @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top) {} + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) {} + + /** + * @brief Computes the error gradient w.r.t. the concatenate inputs. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length K), into which the top error + * gradient is (virtually) copied + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} +}; + /** * @brief Ignores bottom blobs while producing no top blobs. (This is useful * to suppress outputs during testing.) 
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp new file mode 100644 index 00000000000..89f5d3433f6 --- /dev/null +++ b/src/caffe/layers/reshape_layer.cpp @@ -0,0 +1,21 @@ +#include + +#include "caffe/common_layers.hpp" +#include "caffe/layer.hpp" + +namespace caffe { + +template +void ReshapeLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + top[0]->Reshape(this->layer_param_.reshape_param().shape()); + CHECK_EQ(top[0]->count(), bottom[0]->count()) + << "new shape must have the same count as input"; + top[0]->ShareData(*bottom[0]); + top[0]->ShareDiff(*bottom[0]); +} + +INSTANTIATE_CLASS(ReshapeLayer); +REGISTER_LAYER_CLASS(Reshape); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 46dda408d1d..a4485736858 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 133 (last added: embed_param) +// LayerParameter next available layer-specific ID: 134 (last added: reshape_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -328,6 +328,7 @@ message LayerParameter { optional PReLUParameter prelu_param = 131; optional PythonParameter python_param = 130; optional ReLUParameter relu_param = 123; + optional ReshapeParameter reshape_param = 133; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; optional SliceParameter slice_param = 126; @@ -682,6 +683,13 @@ message PythonParameter { optional string layer = 2; } +// Message that stores parameters used by ReshapeLayer +message ReshapeParameter { + // The new shape of the Blob. Must have the same "count" (product of + // dimensions) as the input Blob. 
+ optional BlobShape shape = 1; +} + // Message that stores parameters used by ReLULayer message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization diff --git a/src/caffe/test/test_reshape_layer.cpp b/src/caffe/test/test_reshape_layer.cpp new file mode 100644 index 00000000000..78f157b81a7 --- /dev/null +++ b/src/caffe/test/test_reshape_layer.cpp @@ -0,0 +1,138 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class ReshapeLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + protected: + ReshapeLayerTest() + : blob_bottom_(new Blob(2, 3, 6, 5)), + blob_top_(new Blob()) { + Caffe::set_random_seed(1701); + // fill the values + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~ReshapeLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(ReshapeLayerTest, TestDtypesAndDevices); + +TYPED_TEST(ReshapeLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); + shared_ptr > layer; + + shape->Clear(); + shape->add_dim(2 * 3 * 6 * 5); + layer.reset(new ReshapeLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_top_->num_axes(), 1); + EXPECT_EQ(this->blob_top_->shape(0), 2 * 3 * 6 * 5); + + shape->Clear(); + shape->add_dim(2 * 3 * 6); + shape->add_dim(5); + layer.reset(new ReshapeLayer(layer_param)); + 
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_top_->num_axes(), 2); + EXPECT_EQ(this->blob_top_->shape(0), 2 * 3 * 6); + EXPECT_EQ(this->blob_top_->shape(1), 5); + + shape->Clear(); + shape->add_dim(6); + shape->add_dim(1); + shape->add_dim(2); + shape->add_dim(3); + shape->add_dim(1); + shape->add_dim(5); + layer.reset(new ReshapeLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(this->blob_top_->num_axes(), 6); + EXPECT_EQ(this->blob_top_->shape(0), 6); + EXPECT_EQ(this->blob_top_->shape(1), 1); + EXPECT_EQ(this->blob_top_->shape(2), 2); + EXPECT_EQ(this->blob_top_->shape(3), 3); + EXPECT_EQ(this->blob_top_->shape(4), 1); + EXPECT_EQ(this->blob_top_->shape(5), 5); +} + +TYPED_TEST(ReshapeLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); + shape->add_dim(6); + shape->add_dim(2); + shape->add_dim(3); + shape->add_dim(5); + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_EQ(this->blob_top_->cpu_data()[i], + this->blob_bottom_->cpu_data()[i]); + } +} + +TYPED_TEST(ReshapeLayerTest, TestForwardAfterReshape) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); + shape->add_dim(6); + shape->add_dim(2); + shape->add_dim(3); + shape->add_dim(5); + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // We know the above produced the correct result from TestForward. + // Reshape the bottom and call layer.Reshape, then try again. 
+ vector new_bottom_shape(1, 2 * 3 * 6 * 5); + this->blob_bottom_->Reshape(new_bottom_shape); + layer.Reshape(this->blob_bottom_vec_, this->blob_top_vec_); + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_EQ(this->blob_top_->cpu_data()[i], + this->blob_bottom_->cpu_data()[i]); + } +} + +TYPED_TEST(ReshapeLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); + shape->add_dim(6); + shape->add_dim(2); + shape->add_dim(3); + shape->add_dim(5); + ReshapeLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-2); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + + +} // namespace caffe From d2c304c53dc99ce0af559b9094e89b2b28c06e19 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 30 Oct 2014 17:54:35 -0700 Subject: [PATCH 11/26] FlattenLayer fix -- top should always Share* from bottom (and do everything in Reshape) --- include/caffe/common_layers.hpp | 8 ++++++-- src/caffe/layers/flatten_layer.cpp | 12 +----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index cadd6b20e5e..1991d58206d 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -250,7 +250,9 @@ class FlattenLayer : public Layer { * the outputs -- i.e., the (virtually) copied, flattened inputs */ virtual void Forward_cpu(const vector*>& bottom, - const vector*>& top); + const vector*>& top) {} + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) {} /** * @brief Computes the error gradient w.r.t. the concatenate inputs. 
@@ -262,7 +264,9 @@ class FlattenLayer : public Layer { * gradient is (virtually) copied */ virtual void Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom); + const vector& propagate_down, const vector*>& bottom) {} + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) {} }; /** diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp index 745f271ea45..3a078fabb81 100644 --- a/src/caffe/layers/flatten_layer.cpp +++ b/src/caffe/layers/flatten_layer.cpp @@ -14,18 +14,8 @@ void FlattenLayer::Reshape(const vector*>& bottom, top_shape[1] = bottom[0]->count() / bottom[0]->num(); top[0]->Reshape(top_shape); CHECK_EQ(top[0]->count(), bottom[0]->count()); -} - -template -void FlattenLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { top[0]->ShareData(*bottom[0]); -} - -template -void FlattenLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - bottom[0]->ShareDiff(*top[0]); + top[0]->ShareDiff(*bottom[0]); } INSTANTIATE_CLASS(FlattenLayer); From 8bafc354bd2cf55e7903f50af8e3bb2103bb42c4 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Wed, 5 Nov 2014 13:12:53 -0800 Subject: [PATCH 12/26] AccuracyLayer: add 'denominator' param --- include/caffe/loss_layers.hpp | 2 ++ src/caffe/layers/accuracy_layer.cpp | 7 ++++++- src/caffe/proto/caffe.proto | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index d3eecd2e510..bb78001b305 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -86,6 +86,8 @@ class AccuracyLayer : public Layer { bool has_ignore_label_; /// The label indicating that an instance should be ignored. 
int ignore_label_; + + Dtype denominator_; }; /** diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 90aad675ed3..73ba83fc9e9 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -20,6 +20,10 @@ void AccuracyLayer::LayerSetUp( if (has_ignore_label_) { ignore_label_ = this->layer_param_.accuracy_param().ignore_label(); } + + denominator_ = this->layer_param_.accuracy_param().denominator(); + CHECK_GE(denominator_, 0) + << "Denominator must be positive; or 0, for the batch size."; } template @@ -81,7 +85,8 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - top[0]->mutable_cpu_data()[0] = accuracy / count; + const Dtype denominator = (denominator_ == 0) ? count : denominator_; + top[0]->mutable_cpu_data()[0] = accuracy / denominator; // Accuracy layer should not be used as a loss function. } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a4485736858..21332007ee8 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -381,6 +381,12 @@ message AccuracyParameter { // If specified, ignore instances with the given label. optional int32 ignore_label = 3; + + // Controls the denominator in the computed accuracy = #correct / denominator. + // Must be a positive number, or the default of 0, for the total input weight. + // If no input weights are used, the denominator is the batch size, as the + // weights each default to 1. 
+ optional float denominator = 4 [default = 0]; } // Message that stores parameters used by ArgMaxLayer From 654f5df9e4945bcb89e9aa18630d0957c1e5897e Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 30 Sep 2014 17:08:18 -0700 Subject: [PATCH 13/26] EltwiseLayer can take a blob of per-num coefficients --- include/caffe/common_layers.hpp | 1 + src/caffe/layers/eltwise_layer.cpp | 65 +++++++++++++++++++++------ src/caffe/layers/eltwise_layer.cu | 30 ++++++++++--- src/caffe/proto/caffe.proto | 4 ++ src/caffe/test/test_eltwise_layer.cpp | 59 +++++++++++++++++++++++- 5 files changed, 140 insertions(+), 19 deletions(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 1991d58206d..f8a68f0228e 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -176,6 +176,7 @@ class EltwiseLayer : public Layer { EltwiseParameter_EltwiseOp op_; vector coeffs_; Blob max_idx_; + bool coeff_blob_; bool stable_prod_grad_; }; diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index a80700736bd..5c159ac3d64 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -10,18 +10,23 @@ namespace caffe { template void EltwiseLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { - CHECK(this->layer_param().eltwise_param().coeff_size() == 0 - || this->layer_param().eltwise_param().coeff_size() == bottom.size()) << + op_ = this->layer_param_.eltwise_param().operation(); + coeff_blob_ = this->layer_param().eltwise_param().coeff_blob(); + if (coeff_blob_) { + CHECK_EQ(op_, EltwiseParameter_EltwiseOp_SUM) + << "coeff_blob option only implemented for the SUM operation"; + } + const int coeff_size = this->layer_param().eltwise_param().coeff_size(); + CHECK(coeff_size == 0 || (!coeff_blob_ && coeff_size == bottom.size()) + || (coeff_blob_ && coeff_size == bottom.size() - 1)) << "Eltwise Layer takes one coefficient per bottom blob."; - 
CHECK(!(this->layer_param().eltwise_param().operation() - == EltwiseParameter_EltwiseOp_PROD - && this->layer_param().eltwise_param().coeff_size())) << + CHECK(op_ == EltwiseParameter_EltwiseOp_SUM + || this->layer_param().eltwise_param().coeff_size() == 0) << "Eltwise layer only takes coefficients for summation."; - op_ = this->layer_param_.eltwise_param().operation(); // Blob-wise coefficients for the elementwise operation. coeffs_ = vector(bottom.size(), 1); - if (this->layer_param().eltwise_param().coeff_size()) { - for (int i = 0; i < bottom.size(); ++i) { + if (coeff_size) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); } } @@ -32,7 +37,19 @@ template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); + if (coeff_blob_ && i == bottom.size() - 1) { + CHECK_EQ(i, bottom[i]->shape(0)) + << "Dimension of coeff blob axis 0 must equal the number of bottom " + << "blobs (not including the coeff blob itself)."; + for (int input_axis = 0, coeff_axis = 1; + coeff_axis < bottom[i]->num_axes(); ++input_axis, ++coeff_axis) { + CHECK_EQ(bottom[0]->shape(input_axis), bottom[i]->shape(coeff_axis)) + << "Each axis i >= 1 of the coeff blob must match the (i-1)th " + << "axis of the input."; + } + } else { + CHECK(bottom[i]->shape() == bottom[0]->shape()); + } } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. @@ -60,8 +77,21 @@ void EltwiseLayer::Forward_cpu( case EltwiseParameter_EltwiseOp_SUM: caffe_set(count, Dtype(0), top_data); // TODO(shelhamer) does BLAS optimize to sum for coeff = 1? 
- for (int i = 0; i < bottom.size(); ++i) { - caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { + if (coeff_blob_) { + const int num = bottom[bottom.size() - 1]->count() / + (bottom.size() - 1); + const int dim = bottom[i]->count() / num; + const Dtype* bottom_data = bottom[i]->cpu_data(); + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_axpy(dim, coeff, bottom_data, top_data); + } + top_data = top[0]->mutable_cpu_data(); + } else { + caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data); + } } break; case EltwiseParameter_EltwiseOp_MAX: @@ -104,7 +134,7 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, const int count = top[0]->count(); const Dtype* top_data = top[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->cpu_data(); Dtype* bottom_diff = bottom[i]->mutable_cpu_diff(); @@ -128,7 +158,16 @@ void EltwiseLayer::Backward_cpu(const vector*>& top, caffe_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1)) { + if (coeff_blob_) { + const int num = bottom[bottom.size() - 1]->count() / + (bottom.size() - 1); + const int dim = bottom[i]->count() / num; + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_cpu_scale(dim, coeff, top_diff, bottom_diff); + } + } else if (coeffs_[i] == Dtype(1.)) { caffe_copy(count, top_diff, bottom_diff); } else { caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff); diff --git 
a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index 2247870d97f..3abc582919e 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -48,8 +48,20 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, case EltwiseParameter_EltwiseOp_SUM: caffe_gpu_set(count, Dtype(0.), top_data); // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? - for (int i = 0; i < bottom.size(); ++i) { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { + if (coeff_blob_) { + const int num = bottom[i]->num(); + const int dim = bottom[i]->count() / num; + const Dtype* bottom_data = bottom[i]->gpu_data(); + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_gpu_axpy(dim, coeff, bottom_data, top_data); + } + top_data = top[0]->mutable_gpu_data(); + } else { + caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); + } } break; case EltwiseParameter_EltwiseOp_MAX: @@ -86,10 +98,10 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, const int* mask = NULL; const int count = top[0]->count(); const Dtype* top_data = top[0]->gpu_data(); - const Dtype* top_diff = top[0]->gpu_diff(); - for (int i = 0; i < bottom.size(); ++i) { + for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); switch (op_) { case EltwiseParameter_EltwiseOp_PROD: @@ -111,7 +123,15 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeffs_[i] == Dtype(1.)) { + if (coeff_blob_) { + const int num = bottom[i]->num(); + const int dim = 
bottom[i]->count() / num; + const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); + for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { + const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; + caffe_gpu_scale(dim, coeff, top_diff, bottom_diff); + } + } else if (coeffs_[i] == Dtype(1.)) { caffe_copy(count, top_diff, bottom_diff); } else { caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 21332007ee8..c40e4c75716 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -509,6 +509,10 @@ message EltwiseParameter { // Whether to use an asymptotically slower (for >2 inputs) but stabler method // of computing the gradient for the PROD operation. (No effect for SUM op.) optional bool stable_prod_grad = 3 [default = true]; + + // If true and the EltwiseOp is SUM, the last bottom blob is a singleton + // coefficient for the first N-1 bottom blobs, with shape (N-1, 1, 1, 1). 
+ optional bool coeff_blob = 4 [default = false]; } // Message that stores parameters used by EmbedLayer diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp index be0c1347709..85c11341abd 100644 --- a/src/caffe/test/test_eltwise_layer.cpp +++ b/src/caffe/test/test_eltwise_layer.cpp @@ -22,14 +22,18 @@ class EltwiseLayerTest : public MultiDeviceTest { : blob_bottom_a_(new Blob(2, 3, 4, 5)), blob_bottom_b_(new Blob(2, 3, 4, 5)), blob_bottom_c_(new Blob(2, 3, 4, 5)), + blob_bottom_coeff_(new Blob()), blob_top_(new Blob()) { - // fill the values + vector coeff_shape(2); + coeff_shape[0] = 3; coeff_shape[1] = 2; + blob_bottom_coeff_->Reshape(coeff_shape); Caffe::set_random_seed(1701); FillerParameter filler_param; UniformFiller filler(filler_param); filler.Fill(this->blob_bottom_a_); filler.Fill(this->blob_bottom_b_); filler.Fill(this->blob_bottom_c_); + filler.Fill(this->blob_bottom_coeff_); blob_bottom_vec_.push_back(blob_bottom_a_); blob_bottom_vec_.push_back(blob_bottom_b_); blob_bottom_vec_.push_back(blob_bottom_c_); @@ -39,11 +43,13 @@ class EltwiseLayerTest : public MultiDeviceTest { delete blob_bottom_a_; delete blob_bottom_b_; delete blob_bottom_c_; + delete blob_bottom_coeff_; delete blob_top_; } Blob* const blob_bottom_a_; Blob* const blob_bottom_b_; Blob* const blob_bottom_c_; + Blob* const blob_bottom_coeff_; Blob* const blob_top_; vector*> blob_bottom_vec_; vector*> blob_top_vec_; @@ -126,6 +132,37 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeff) { } } +TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeff) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); + eltwise_param->set_coeff_blob(true); + eltwise_param->add_coeff(1); + eltwise_param->add_coeff(-0.5); + eltwise_param->add_coeff(2); + shared_ptr > layer( + new EltwiseLayer(layer_param)); + 
this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const Dtype* data = this->blob_top_->cpu_data(); + const int count = this->blob_top_->count(); + const int num = this->blob_top_->num(); + const int dim = count / num; + const Dtype* coeff_data = this->blob_bottom_coeff_->cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < dim; ++d) { + Dtype sum = 0; + for (int i = 0; i < this->blob_bottom_vec_.size() - 1; ++i) { + const Dtype coeff = coeff_data[i * num + n] * eltwise_param->coeff(i); + sum += coeff * this->blob_bottom_vec_[i]->cpu_data()[n * dim + d]; + } + EXPECT_NEAR(data[n * dim + d], sum, 1e-4); + } + } +} + TYPED_TEST(EltwiseLayerTest, TestStableProdGradient) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; @@ -175,6 +212,26 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeffGradient) { this->blob_top_vec_); } +TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeffGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param(); + eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM); + eltwise_param->set_coeff_blob(true); + eltwise_param->add_coeff(1); + eltwise_param->add_coeff(-0.5); + eltwise_param->add_coeff(2); + EltwiseLayer layer(layer_param); + this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 1); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 2); +} + TYPED_TEST(EltwiseLayerTest, TestMax) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; From f6ef1ce8cff8173b49de79085aee7f8c0fb63f3f Mon Sep 17 00:00:00 2001 
From: Jeff Donahue Date: Sun, 2 Nov 2014 00:41:50 -0700 Subject: [PATCH 14/26] EltwiseLayer with coeff blob GPU kernel --- src/caffe/layers/eltwise_layer.cpp | 2 +- src/caffe/layers/eltwise_layer.cu | 69 ++++++++++++++++++------------ 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 5c159ac3d64..55d2d955c7c 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -24,7 +24,7 @@ void EltwiseLayer::LayerSetUp(const vector*>& bottom, || this->layer_param().eltwise_param().coeff_size() == 0) << "Eltwise layer only takes coefficients for summation."; // Blob-wise coefficients for the elementwise operation. - coeffs_ = vector(bottom.size(), 1); + coeffs_.resize(bottom.size(), 1); if (coeff_size) { for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { coeffs_[i] = this->layer_param().eltwise_param().coeff(i); diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu index 3abc582919e..97f52079108 100644 --- a/src/caffe/layers/eltwise_layer.cu +++ b/src/caffe/layers/eltwise_layer.cu @@ -31,12 +31,33 @@ __global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a, } } +template +__global__ void CoeffSum(const int count, const int dim, + const int num_offset, const Dtype coeff, const Dtype* coeff_data, + const bool backward, const Dtype* in, Dtype* out) { + CUDA_KERNEL_LOOP(index, count) { + const int n = num_offset + index / dim; + const Dtype other_coeff = coeff_data ? 
coeff_data[n] : Dtype(1); + const Dtype final_coeff = coeff * other_coeff; + const Dtype result = in[index] * final_coeff; + if (num_offset == 0 || backward) { + out[index] = result; + } else { + out[index] += result; + } + } +} + template void EltwiseLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { int* mask = NULL; const int count = top[0]->count(); + const int num = top[0]->num(); + const int dim = count / num; Dtype* top_data = top[0]->mutable_gpu_data(); + const Dtype* coeff_data = NULL; + const bool kBackward = false; switch (op_) { case EltwiseParameter_EltwiseOp_PROD: caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(), @@ -46,22 +67,17 @@ void EltwiseLayer::Forward_gpu(const vector*>& bottom, } break; case EltwiseParameter_EltwiseOp_SUM: - caffe_gpu_set(count, Dtype(0.), top_data); // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1? + if (coeff_blob_) { + coeff_data = bottom[bottom.size() - 1]->gpu_data(); + } for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { - if (coeff_blob_) { - const int num = bottom[i]->num(); - const int dim = bottom[i]->count() / num; - const Dtype* bottom_data = bottom[i]->gpu_data(); - const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); - for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) { - const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; - caffe_gpu_axpy(dim, coeff, bottom_data, top_data); - } - top_data = top[0]->mutable_gpu_data(); - } else { - caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data); - } + const Dtype* bottom_data = bottom[i]->gpu_data(); + CoeffSum // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, dim, i * num, coeffs_[i], coeff_data, + kBackward, bottom_data, top_data); + CUDA_POST_KERNEL_CHECK; } break; case EltwiseParameter_EltwiseOp_MAX: @@ -97,7 +113,14 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const int* mask = NULL; const int 
count = top[0]->count(); + const int num = top[0]->num(); + const int dim = count / num; const Dtype* top_data = top[0]->gpu_data(); + const Dtype* coeff_data = NULL; + if (coeff_blob_) { + coeff_data = bottom[bottom.size() - 1]->gpu_data(); + } + const bool kBackward = true; for (int i = 0; i < bottom.size() - coeff_blob_; ++i) { if (propagate_down[i]) { const Dtype* bottom_data = bottom[i]->gpu_data(); @@ -123,19 +146,11 @@ void EltwiseLayer::Backward_gpu(const vector*>& top, caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff); break; case EltwiseParameter_EltwiseOp_SUM: - if (coeff_blob_) { - const int num = bottom[i]->num(); - const int dim = bottom[i]->count() / num; - const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data(); - for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) { - const Dtype coeff = coeffs_[i] * coeff_data[i * num + j]; - caffe_gpu_scale(dim, coeff, top_diff, bottom_diff); - } - } else if (coeffs_[i] == Dtype(1.)) { - caffe_copy(count, top_diff, bottom_diff); - } else { - caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff); - } + CoeffSum // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + count, dim, i * num, coeffs_[i], coeff_data, + kBackward, top_diff, bottom_diff); + CUDA_POST_KERNEL_CHECK; break; case EltwiseParameter_EltwiseOp_MAX: mask = max_idx_.gpu_data(); From f0d036fff41029f7f3dfbc5e4024c1a40aa2938c Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Tue, 7 Oct 2014 11:55:54 -0700 Subject: [PATCH 15/26] Allow SliceLayer to have a single top Blob (for testing) --- include/caffe/common_layers.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index f8a68f0228e..d694f4381c3 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -530,7 +530,7 @@ class SliceLayer : public Layer { virtual inline const char* type() const { return "Slice"; } virtual inline int ExactNumBottomBlobs() const { 
return 1; } - virtual inline int MinTopBlobs() const { return 2; } + virtual inline int MinTopBlobs() const { return 1; } protected: virtual void Forward_cpu(const vector*>& bottom, From a10bf5bfee67658b90c94ce3363bf8710d89e70e Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Fri, 23 Jan 2015 12:52:44 -0800 Subject: [PATCH 16/26] Allow ConcatLayer to take a single bottom Blob (for testing) --- include/caffe/common_layers.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index d694f4381c3..a23c671b7e9 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -85,7 +85,7 @@ class ConcatLayer : public Layer { const vector*>& top); virtual inline const char* type() const { return "Concat"; } - virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MinBottomBlobs() const { return 1; } virtual inline int ExactNumTopBlobs() const { return 1; } protected: From a7eaaf5cad93d9fe632d9bdb7042591b607b157b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 15:18:56 -0800 Subject: [PATCH 17/26] Modifications to Net to facilitate unrolled recurrent networks --- include/caffe/net.hpp | 10 ++++++++++ src/caffe/net.cpp | 42 +++++++++++------------------------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index 075afebc9b0..10fc1705192 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -84,6 +84,13 @@ class Net { /// @brief Updates the network weights based on the diff values computed. void Update(); + /** + * @brief Shares weight data of owner blobs with shared blobs. + * + * Note: this is called by Net::Init, and thus should normally not be + * called manually. 
+ */ + void ShareWeightData(); /** * @brief For an already initialized net, implicitly copies (i.e., using no @@ -150,6 +157,9 @@ class Net { return param_names_index_; } inline const vector& param_owners() const { return param_owners_; } + inline const vector& param_display_names() const { + return param_display_names_; + } /// @brief Input and output blob numbers inline int num_inputs() const { return net_input_blobs_.size(); } inline int num_outputs() const { return net_output_blobs_.size(); } diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index fd00b122630..14c5bfc195e 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -213,6 +213,7 @@ void Net::Init(const NetParameter& in_param) { layer_names_index_[layer_names_[layer_id]] = layer_id; } GetLearningRateAndWeightDecay(); + ShareWeightData(); debug_info_ = param.debug_info(); LOG(INFO) << "Network initialization done."; LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype); @@ -437,8 +438,6 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // Strict dimension checking -- all dims must be the same. CHECK(this_blob->shape() == owner_blob->shape()); } - layers_[layer_id]->blobs()[param_id]->ShareData( - *layers_[owner_layer_id]->blobs()[owner_param_id]); } } @@ -741,35 +740,7 @@ void Net::ToProto(NetParameter* param, bool write_diff) const { template void Net::Update() { - // First, accumulate the diffs of any shared parameters into their owner's - // diff. (Assumes that the learning rate, weight decay, etc. have already been - // accounted for in the current diff.) 
- for (int i = 0; i < params_.size(); ++i) { - if (param_owners_[i] < 0) { continue; } - if (debug_info_) { UpdateDebugInfo(i); } - const int count = params_[i]->count(); - const Dtype* this_diff; - Dtype* owner_diff; - switch (Caffe::mode()) { - case Caffe::CPU: - this_diff = params_[i]->cpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_cpu_diff(); - caffe_add(count, this_diff, owner_diff, owner_diff); - break; -#ifndef CPU_ONLY - case Caffe::GPU: - this_diff = params_[i]->gpu_diff(); - owner_diff = params_[param_owners_[i]]->mutable_gpu_diff(); - caffe_gpu_add(count, this_diff, owner_diff, owner_diff); - break; -#else - NO_GPU; -#endif - default: - LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); - } - } - // Now, update the owned parameters. + // Update only the owned parameters. for (int i = 0; i < params_.size(); ++i) { if (param_owners_[i] >= 0) { continue; } if (debug_info_) { UpdateDebugInfo(i); } @@ -777,6 +748,15 @@ void Net::Update() { } } +template +void Net::ShareWeightData() { + for (int i = 0; i < params_.size(); ++i) { + if (param_owners_[i] < 0) { continue; } + params_[i]->ShareData(*params_[param_owners_[i]]); + params_[i]->ShareDiff(*params_[param_owners_[i]]); + } +} + template bool Net::has_blob(const string& blob_name) const { return blob_names_index_.find(blob_name) != blob_names_index_.end(); From 49fbab9ad931e22d5aa0f7a7139fae8a1721392b Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 17:35:52 -0800 Subject: [PATCH 18/26] TestNet fixes for Net weight sharing modifications --- src/caffe/test/test_net.cpp | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp index 08106e79274..c65116d7eb1 100644 --- a/src/caffe/test/test_net.cpp +++ b/src/caffe/test/test_net.cpp @@ -1007,11 +1007,10 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights 
= this->net_->layers()[1]->blobs()[0].get(); Blob* ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->Forward(bottom); this->net_->Backward(); // Compute the expected update as the data minus the two diffs. @@ -1024,11 +1023,7 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { // Make sure the diffs are non-trivial. for (int i = 0; i < count; ++i) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); - EXPECT_NE(0, ip2_weights->cpu_diff()[i]); - EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); } - caffe_axpy(count, Dtype(1), ip2_weights->cpu_diff(), - shared_params.mutable_cpu_diff()); caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(), shared_params.mutable_cpu_data()); const Dtype* expected_updated_params = shared_params.cpu_data(); @@ -1065,8 +1060,8 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) { EXPECT_NE(0, ip1_weights->cpu_diff()[i]); EXPECT_NE(0, ip2_weights->cpu_diff()[i]); EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]); - EXPECT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], - shared_params.cpu_diff()[i]); + EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i], + shared_params.cpu_diff()[i]); } caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(), unshared_params1.mutable_cpu_data()); @@ -1096,11 +1091,10 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2"); Blob* ip1_weights = this->net_->layers()[1]->blobs()[0].get(); Blob* 
ip2_weights = this->net_->layers()[2]->blobs()[0].get(); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) - EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); this->net_->ForwardBackward(bottom); this->net_->Update(); Blob shared_params; @@ -1123,14 +1117,13 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) { ASSERT_FALSE(NULL == ip1_weights); ASSERT_FALSE(NULL == ip2_weights); EXPECT_NE(ip1_weights, ip2_weights); - // Check that data blobs of shared weights share the same location in memory. + // Check that data and diff blobs of shared weights share the same memory + // locations. EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data()); + EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); for (int i = 0; i < count; ++i) { EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]); } - // Check that diff blobs of shared weights are at different locations in - // memory. (The diffs should be accumulated at update time.) 
- EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff()); } TYPED_TEST(NetTest, TestParamPropagateDown) { From f2724eabdd20d1649041078e6f8baebbeb11a72a Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:28:01 -0800 Subject: [PATCH 19/26] Add RecurrentLayer: an abstract superclass for other recurrent layer types --- include/caffe/sequence_layers.hpp | 154 +++++++++++++++++++ src/caffe/layers/recurrent_layer.cpp | 222 +++++++++++++++++++++++++++ src/caffe/layers/recurrent_layer.cu | 35 +++++ src/caffe/proto/caffe.proto | 16 +- 4 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 include/caffe/sequence_layers.hpp create mode 100644 src/caffe/layers/recurrent_layer.cpp create mode 100644 src/caffe/layers/recurrent_layer.cu diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp new file mode 100644 index 00000000000..d064136bf15 --- /dev/null +++ b/include/caffe/sequence_layers.hpp @@ -0,0 +1,154 @@ +#ifndef CAFFE_SEQUENCE_LAYERS_HPP_ +#define CAFFE_SEQUENCE_LAYERS_HPP_ + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/net.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +template class RecurrentLayer; + +/** + * @brief An abstract class for implementing recurrent behavior inside of an + * unrolled network. This Layer type cannot be instantiated -- instead, + * you should use one of its implementations which defines the recurrent + * architecture, such as RNNLayer or LSTMLayer. 
+ */ +template +class RecurrentLayer : public Layer { + public: + explicit RecurrentLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual void Reset(); + + virtual inline const char* type() const { return "Recurrent"; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 1; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. + return bottom_index != 1; + } + + protected: + /** + * @brief Fills net_param with the recurrent network architecture. Subclasses + * should define this -- see RNNLayer and LSTMLayer for examples. + */ + virtual void FillUnrolledNet(NetParameter* net_param) const = 0; + + /** + * @brief Fills names with the names of the 0th timestep recurrent input + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentInputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the Tth timestep recurrent output + * Blob&s. Subclasses should define this -- see RNNLayer and LSTMLayer + * for examples. + */ + virtual void RecurrentOutputBlobNames(vector* names) const = 0; + + /** + * @brief Fills names with the names of the output blobs, concatenated across + * all timesteps. Should return a name for each top Blob. + * Subclasses should define this -- see RNNLayer and LSTMLayer for + * examples. + */ + virtual void OutputBlobNames(vector* names) const = 0; + + /** + * @param bottom input Blob vector (length 2-3) + * + * -# @f$ (T \times N \times ...) @f$ + * the time-varying input @f$ x @f$. 
After the first two axes, whose + * dimensions must correspond to the number of timesteps @f$ T @f$ and + * the number of independent streams @f$ N @f$, respectively, its + * dimensions may be arbitrary. Note that the ordering of dimensions -- + * @f$ (T \times N \times ...) @f$, rather than + * @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$ + * independent input streams must be "interleaved". + * + * -# @f$ (T \times N) @f$ + * the sequence continuation indicators @f$ \delta @f$. + * These inputs should be binary (0 or 1) indicators, where + * @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream + * @f$ n @f$ is the beginning of a new sequence, and hence the previous + * hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$ + * and has no effect on the cell's output at timestep @f$ t @f$, and + * a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of + * stream @f$ n @f$ is a continuation from the previous timestep + * @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the + * updated hidden state and output. + * + * -# @f$ (N \times ...) @f$ (optional) + * the static (non-time-varying) input @f$ x_{static} @f$. + * After the first axis, whose dimension must be the number of + * independent streams, its dimensions may be arbitrary. + * This is mathematically equivalent to using a time-varying input of + * @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input + * across the @f$ T @f$ timesteps and concatenating with the time-varying + * input. Note that if this input is used, all timesteps in a single + * batch within a particular one of the @f$ N @f$ streams must share the + * same static input, even if the sequence continuation indicators + * suggest that different sequences are ending and beginning within a + * single batch. This may require padding and/or truncation for uniform + * length. 
+ * + * @param top output Blob vector (length 1) + * -# @f$ (T \times N \times D) @f$ + * the time-varying output @f$ y @f$, where @f$ D @f$ is + * recurrent_param.num_output(). + * Refer to documentation for particular RecurrentLayer implementations + * (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief A helper function, useful for stringifying timestep indices. + virtual string int_to_str(const int t) const; + + /// @brief A Net to implement the Recurrent functionality. + shared_ptr > unrolled_net_; + + /// @brief The number of independent streams to process simultaneously. + int N_; + + /** + * @brief The number of timesteps in the layer's input, and the number of + * timesteps over which to backpropagate through time. + */ + int T_; + + /// @brief Whether the layer has a "static" input copied across all timesteps. 
+ bool static_input_; + + vector* > recur_input_blobs_; + vector* > recur_output_blobs_; + vector* > output_blobs_; + Blob* x_input_blob_; + Blob* x_static_input_blob_; + Blob* cont_input_blob_; +}; + +} // namespace caffe + +#endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp new file mode 100644 index 00000000000..7dc38fec901 --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cpp @@ -0,0 +1,222 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +string RecurrentLayer::int_to_str(const int t) const { + ostringstream num; + num << t; + return num.str(); +} + +template +void RecurrentLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + CHECK_GE(bottom[0]->num_axes(), 2) + << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)"; + T_ = bottom[0]->shape(0); + N_ = bottom[0]->shape(1); + LOG(INFO) << "Initializing recurrent layer: assuming input batch contains " + << T_ << " timesteps of " << N_ << " independent streams."; + + CHECK_EQ(bottom[1]->num_axes(), 2) + << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)"; + CHECK_EQ(T_, bottom[1]->shape(0)); + CHECK_EQ(N_, bottom[1]->shape(1)); + + // If provided, bottom[2] is a static input to the recurrent net. + static_input_ = (bottom.size() > 2); + if (static_input_) { + CHECK_GE(bottom[2]->num_axes(), 1); + CHECK_EQ(N_, bottom[2]->shape(0)); + } + + // Create a NetParameter; setup the inputs that aren't unique to particular + // recurrent architectures. 
+ NetParameter net_param; + net_param.set_force_backward(true); + + net_param.add_input("x"); + BlobShape input_shape; + for (int i = 0; i < bottom[0]->num_axes(); ++i) { + input_shape.add_dim(bottom[0]->shape(i)); + } + net_param.add_input_shape()->CopyFrom(input_shape); + + input_shape.Clear(); + input_shape.add_dim(1); + for (int i = 0; i < bottom[1]->num_axes(); ++i) { + input_shape.add_dim(bottom[1]->shape(i)); + } + net_param.add_input("cont"); + net_param.add_input_shape()->CopyFrom(input_shape); + + if (static_input_) { + input_shape.Clear(); + for (int i = 0; i < bottom[2]->num_axes(); ++i) { + input_shape.add_dim(bottom[2]->shape(i)); + } + net_param.add_input("x_static"); + net_param.add_input_shape()->CopyFrom(input_shape); + } + + // Call the child's FillUnrolledNet implementation to specify the unrolled + // recurrent architecture. + this->FillUnrolledNet(&net_param); + + // Prepend this layer's name to the names of each layer in the unrolled net. + const string& layer_name = this->layer_param_.name(); + if (layer_name.size() > 0) { + for (int i = 0; i < net_param.layer_size(); ++i) { + LayerParameter* layer = net_param.mutable_layer(i); + layer->set_name(layer_name + "_" + layer->name()); + } + } + + // Create the unrolled net. + unrolled_net_.reset(new Net(net_param)); + unrolled_net_->set_debug_info( + this->layer_param_.recurrent_param().debug_info()); + + // Setup pointers to the inputs. + x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get()); + cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get()); + if (static_input_) { + x_static_input_blob_ = + CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get()); + } + + // Setup pointers to paired recurrent inputs/outputs. 
+ vector recur_input_names; + RecurrentInputBlobNames(&recur_input_names); + vector recur_output_names; + RecurrentOutputBlobNames(&recur_output_names); + const int num_recur_blobs = recur_input_names.size(); + CHECK_EQ(num_recur_blobs, recur_output_names.size()); + recur_input_blobs_.resize(num_recur_blobs); + recur_output_blobs_.resize(num_recur_blobs); + for (int i = 0; i < recur_input_names.size(); ++i) { + recur_input_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get()); + recur_output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get()); + } + + // Setup pointers to outputs. + vector output_names; + OutputBlobNames(&output_names); + CHECK_EQ(top.size(), output_names.size()) + << "OutputBlobNames must provide an output blob name for each top."; + output_blobs_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); ++i) { + output_blobs_[i] = + CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get()); + } + + // We should have 2 inputs (x and cont), plus a number of recurrent inputs, + // plus maybe a static input. + CHECK_EQ(2 + num_recur_blobs + static_input_, + unrolled_net_->input_blobs().size()); + + // This layer's parameters are any parameters in the layers of the unrolled + // net. We only want one copy of each parameter, so check that the parameter + // is "owned" by the layer, rather than shared with another. + this->blobs_.clear(); + for (int i = 0; i < unrolled_net_->params().size(); ++i) { + if (unrolled_net_->param_owners()[i] == -1) { + LOG(INFO) << "Adding parameter " << i << ": " + << unrolled_net_->param_display_names()[i]; + this->blobs_.push_back(unrolled_net_->params()[i]); + } + } + // Check that param_propagate_down is set for all of the parameters in the + // unrolled net; set param_propagate_down to true in this layer. 
+ for (int i = 0; i < unrolled_net_->layers().size(); ++i) { + for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) { + CHECK(unrolled_net_->layers()[i]->param_propagate_down(j)) + << "param_propagate_down not set for layer " << i << ", param " << j; + } + } + this->param_propagate_down_.clear(); + this->param_propagate_down_.resize(this->blobs_.size(), true); + + // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across + // batches. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_diff()); + } +} + +template +void RecurrentLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(top.size(), output_blobs_.size()); + for (int i = 0; i < top.size(); ++i) { + top[i]->ReshapeLike(*output_blobs_[i]); + output_blobs_[i]->ShareData(*top[i]); + output_blobs_[i]->ShareDiff(*top[i]); + } + x_input_blob_->ShareData(*bottom[0]); + x_input_blob_->ShareDiff(*bottom[0]); + cont_input_blob_->ShareData(*bottom[1]); + if (static_input_) { + x_static_input_blob_->ShareData(*bottom[2]); + x_static_input_blob_->ShareDiff(*bottom[2]); + } +} + +template +void RecurrentLayer::Reset() { + // "Reset" the hidden state of the net by zeroing out all recurrent outputs. + for (int i = 0; i < recur_output_blobs_.size(); ++i) { + caffe_set(recur_output_blobs_[i]->count(), Dtype(0), + recur_output_blobs_[i]->mutable_cpu_data()); + } +} + +template +void RecurrentLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. 
+ if (this->phase_ == TEST) { + unrolled_net_->ShareWeightData(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + + unrolled_net_->ForwardPrefilled(); +} + +template +void RecurrentLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators."; + // Always backprop: either the inputs or the parameters need gradients. + + unrolled_net_->Backward(); +} + +#ifdef CPU_ONLY +STUB_GPU_FORWARD(RecurrentLayer, Forward); +#endif + +INSTANTIATE_CLASS(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu new file mode 100644 index 00000000000..ce4b2f9b77c --- /dev/null +++ b/src/caffe/layers/recurrent_layer.cu @@ -0,0 +1,35 @@ +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RecurrentLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + // Hacky fix for test time... reshare all the shared blobs. + // TODO: somehow make this work non-hackily. 
+ if (this->phase_ == TEST) { + unrolled_net_->ShareWeightData(); + } + + DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size()); + for (int i = 0; i < recur_input_blobs_.size(); ++i) { + const int count = recur_input_blobs_[i]->count(); + DCHECK_EQ(count, recur_output_blobs_[i]->count()); + const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data(); + Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data(); + caffe_copy(count, timestep_T_data, timestep_0_data); + } + + unrolled_net_->ForwardPrefilled(); +} + +INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index c40e4c75716..915b0af5d25 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -260,7 +260,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 134 (last added: reshape_param) +// LayerParameter next available layer-specific ID: 135 (last added: recurrent_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -327,6 +327,7 @@ message LayerParameter { optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; optional PythonParameter python_param = 130; + optional RecurrentParameter recurrent_param = 134; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 133; optional SigmoidParameter sigmoid_param = 124; @@ -700,6 +701,19 @@ message ReshapeParameter { optional BlobShape shape = 1; } +// Message that stores parameters used by RecurrentLayer +message RecurrentParameter { + // The dimension of the output (and usually hidden state) representation -- + // must be explicitly set to non-zero. 
+ optional uint32 num_output = 1 [default = 0]; + + optional FillerParameter weight_filler = 2; // The filler for the weight + optional FillerParameter bias_filler = 3; // The filler for the bias + + // Whether to enable displaying debug_info in the unrolled recurrent net. + optional bool debug_info = 4 [default = false]; +} + // Message that stores parameters used by ReLULayer message ReLUParameter { // Allow non-zero slope for negative inputs to speed up optimization From 69e875cb79a59f6607d26af2c1e407d07067c00a Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:56:50 -0800 Subject: [PATCH 20/26] Add RNNLayer, with tests --- include/caffe/sequence_layers.hpp | 25 ++++ src/caffe/layers/rnn_layer.cpp | 217 ++++++++++++++++++++++++++++++ src/caffe/test/test_rnn_layer.cpp | 196 +++++++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 src/caffe/layers/rnn_layer.cpp create mode 100644 src/caffe/test/test_rnn_layer.cpp diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp index d064136bf15..708f6380d93 100644 --- a/include/caffe/sequence_layers.hpp +++ b/include/caffe/sequence_layers.hpp @@ -149,6 +149,31 @@ class RecurrentLayer : public Layer { Blob* cont_input_blob_; }; +/** + * @brief Processes time-varying inputs using a simple recurrent neural network + * (RNN). Implemented as a network unrolling the RNN computation in time. + * + * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$ + * h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ] + * @f$, and outputs @f$ + * o_t := \tanh[ W_{ho} h_t + b_o ] + * @f$. 
+ */ +template +class RNNLayer : public RecurrentLayer { + public: + explicit RNNLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "RNN"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void OutputBlobNames(vector* names) const; +}; + } // namespace caffe #endif // CAFFE_SEQUENCE_LAYERS_HPP_ diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp new file mode 100644 index 00000000000..a2a22f62819 --- /dev/null +++ b/src/caffe/layers/rnn_layer.cpp @@ -0,0 +1,217 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void RNNLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_0"; +} + +template +void RNNLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h_" + this->int_to_str(this->T_); +} + +template +void RNNLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "o"; +} + +template +void RNNLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. 
+ LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter tanh_param; + tanh_param.set_type("TanH"); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + BlobShape input_shape; + input_shape.add_dim(1); // h_0 is a single timestep + input_shape.add_dim(this->N_); + input_shape.add_dim(num_output); + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(1); + + // Add layer to transform all timesteps of x to the hidden state dimension. + // W_xh_x = W_xh * x + b_h + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xh"); + x_transform_param->add_param()->set_name("b_h"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xh_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the hidden state dimension. 
+ // W_xh_x_static = W_xh_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xh_x_static"); + x_static_transform_param->add_param()->set_name("W_xh_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xh_x_static"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. + new_shape->add_dim(this->N_); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->set_name("W_xh_x_static_reshape"); + reshape_param->add_bottom("W_xh_x_static"); + reshape_param->add_top("W_xh_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->set_name("W_xh_x_slice"); + x_slice_param->add_bottom("W_xh_x"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("o_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("o"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xh_x_" + ts); + + // Add layer to flush the hidden state when beginning a new sequence, + // as indicated by cont_t. 
+ // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(sum_param); + cont_h_param->mutable_eltwise_param()->set_coeff_blob(true); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hh_h_{t-1} := W_hh * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("W_hh_h_" + tm1s); + w_param->add_param()->set_name("W_hh"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hh_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h ) + // = \tanh( W_hh_h_{t-1} + W_xh_x_t ) + { + LayerParameter* h_input_sum_param = net_param->add_layer(); + h_input_sum_param->CopyFrom(sum_param); + h_input_sum_param->set_name("h_input_sum_" + ts); + h_input_sum_param->add_bottom("W_hh_h_" + tm1s); + h_input_sum_param->add_bottom("W_xh_x_" + ts); + if (this->static_input_) { + h_input_sum_param->add_bottom("W_xh_x_static"); + } + h_input_sum_param->add_top("h_neuron_input_" + ts); + } + { + LayerParameter* h_neuron_param = net_param->add_layer(); + h_neuron_param->CopyFrom(tanh_param); + h_neuron_param->set_name("h_neuron_" + ts); + h_neuron_param->add_bottom("h_neuron_input_" + ts); + h_neuron_param->add_top("h_" + ts); + } + + // Add layer to compute + // W_ho_h_t := W_ho * h_t + b_o + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(biased_hidden_param); + w_param->set_name("W_ho_h_" + ts); + w_param->add_param()->set_name("W_ho"); + w_param->add_param()->set_name("b_o"); + w_param->add_bottom("h_" + ts); + w_param->add_top("W_ho_h_" 
+ ts); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add layers to compute + // o_t := \tanh( W_ho h_t + b_o) + // = \tanh( W_ho_h_t ) + { + LayerParameter* o_neuron_param = net_param->add_layer(); + o_neuron_param->CopyFrom(tanh_param); + o_neuron_param->set_name("o_neuron_" + ts); + o_neuron_param->add_bottom("W_ho_h_" + ts); + o_neuron_param->add_top("o_" + ts); + } + output_concat_layer.add_bottom("o_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(RNNLayer); +REGISTER_LAYER_CLASS(RNN); + +} // namespace caffe diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp new file mode 100644 index 00000000000..eab9269ce77 --- /dev/null +++ b/src/caffe/test/test_rnn_layer.cpp @@ -0,0 +1,196 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class RNNLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + RNNLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 
3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices); + +TYPED_TEST(RNNLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(RNNLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. + FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence RNN"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. 
+ Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new RNNLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new RNNLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for RNN timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(RNNLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + RNNLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + 
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe From cea7bc28fe97903ef2a4b535e6f3df83ad9cd019 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Sun, 15 Feb 2015 14:57:22 -0800 Subject: [PATCH 21/26] Add LSTMLayer and LSTMUnitLayer, with tests --- include/caffe/sequence_layers.hpp | 133 ++++++++++++++ src/caffe/layers/lstm_layer.cpp | 221 ++++++++++++++++++++++ src/caffe/layers/lstm_unit_layer.cpp | 128 +++++++++++++ src/caffe/layers/lstm_unit_layer.cu | 154 ++++++++++++++++ src/caffe/test/test_lstm_layer.cpp | 265 +++++++++++++++++++++++++++ 5 files changed, 901 insertions(+) create mode 100644 src/caffe/layers/lstm_layer.cpp create mode 100644 src/caffe/layers/lstm_unit_layer.cpp create mode 100644 src/caffe/layers/lstm_unit_layer.cu create mode 100644 src/caffe/test/test_lstm_layer.cpp diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp index 708f6380d93..8ac735435a4 100644 --- a/include/caffe/sequence_layers.hpp +++ b/include/caffe/sequence_layers.hpp @@ -149,6 +149,139 @@ class RecurrentLayer : public Layer { Blob* cont_input_blob_; }; +/** + * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM) + * [1] style recurrent neural network (RNN). Implemented as a network + * unrolling the LSTM computation in time. + * + * + * The specific architecture used in this implementation is as described in + * "Learning to Execute" [2], reproduced below: + * i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ] + * f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ] + * o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ] + * g_t := \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ] + * c_t := (f_t .* c_{t-1}) + (i_t .* g_t) + * h_t := o_t .* \tanh[c_t] + * In the implementation, the i, f, o, and g computations are performed as a + * single inner product. 
+ * + * Notably, this implementation lacks the "diagonal" gates, as used in the + * LSTM architectures described by Alex Graves [3] and others. + * + * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory." + * Neural Computation 9, no. 8 (1997): 1735-1780. + * + * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute." + * arXiv preprint arXiv:1410.4615 (2014). + * + * [3] Graves, Alex. "Generating sequences with recurrent neural networks." + * arXiv preprint arXiv:1308.0850 (2013). + */ +template +class LSTMLayer : public RecurrentLayer { + public: + explicit LSTMLayer(const LayerParameter& param) + : RecurrentLayer(param) {} + + virtual inline const char* type() const { return "LSTM"; } + + protected: + virtual void FillUnrolledNet(NetParameter* net_param) const; + virtual void RecurrentInputBlobNames(vector* names) const; + virtual void RecurrentOutputBlobNames(vector* names) const; + virtual void OutputBlobNames(vector* names) const; +}; + +/** + * @brief A helper for LSTMLayer: computes a single timestep of the + * non-linearity of the LSTM, producing the updated cell and hidden + * states. + */ +template +class LSTMUnitLayer : public Layer { + public: + explicit LSTMUnitLayer(const LayerParameter& param) + : Layer(param) {} + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "LSTMUnit"; } + virtual inline int ExactNumBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + virtual inline bool AllowForceBackward(const int bottom_index) const { + // Can't propagate to sequence continuation indicators. 
+ return bottom_index != 2; + } + + protected: + /** + * @param bottom input Blob vector (length 3) + * -# @f$ (1 \times N \times D) @f$ + * the previous timestep cell state @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the sequence continuation indicators @f$ \delta_t @f$ + * @param top output Blob vector (length 2) + * -# @f$ (1 \times N \times D) @f$ + * the updated cell state @f$ c_t @f$, computed as: + * i_t := \sigmoid[i_t'] + * f_t := \sigmoid[f_t'] + * o_t := \sigmoid[o_t'] + * g_t := \tanh[g_t'] + * c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + * -# @f$ (1 \times N \times D) @f$ + * the updated hidden state @f$ h_t @f$, computed as: + * h_t := o_t .* \tanh[c_t] + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the LSTMUnit inputs. + * + * @param top output Blob vector (length 2), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$ + * with respect to the updated cell state @f$ c_t @f$ + * -# @f$ (1 \times N \times D) @f$: + * containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$ + * with respect to the updated hidden state @f$ h_t @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 3), into which the error gradients + * with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate + * inputs are computed. Computation of the error gradients w.r.t. + * the sequence indicators is not implemented. + * -# @f$ (1 \times N \times D) @f$ + * the error gradient w.r.t. the previous timestep cell state + * @f$ c_{t-1} @f$ + * -# @f$ (1 \times N \times 4D) @f$ + * the error gradient w.r.t. 
the "gate inputs" + * @f$ [ + * \frac{\partial E}{\partial i_t} + * \frac{\partial E}{\partial f_t} + * \frac{\partial E}{\partial o_t} + * \frac{\partial E}{\partial g_t} + * ] @f$ + * -# @f$ (1 \times 1 \times N) @f$ + * the gradient w.r.t. the sequence continuation indicators + * @f$ \delta_t @f$ is currently not computed. + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// @brief The hidden and output dimension. + int hidden_dim_; + Blob X_acts_; +}; + /** * @brief Processes time-varying inputs using a simple recurrent neural network * (RNN). Implemented as a network unrolling the RNN computation in time. diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp new file mode 100644 index 00000000000..91543f73f71 --- /dev/null +++ b/src/caffe/layers/lstm_layer.cpp @@ -0,0 +1,221 @@ +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void LSTMLayer::RecurrentInputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_0"; + (*names)[1] = "c_0"; +} + +template +void LSTMLayer::RecurrentOutputBlobNames(vector* names) const { + names->resize(2); + (*names)[0] = "h_" + this->int_to_str(this->T_); + (*names)[1] = "c_T"; +} + +template +void LSTMLayer::OutputBlobNames(vector* names) const { + names->resize(1); + (*names)[0] = "h"; +} + +template +void LSTMLayer::FillUnrolledNet(NetParameter* net_param) const { + const int num_output = this->layer_param_.recurrent_param().num_output(); + CHECK_GT(num_output, 0) << "num_output must be positive"; + const FillerParameter& weight_filler = + this->layer_param_.recurrent_param().weight_filler(); + const FillerParameter& 
bias_filler = + this->layer_param_.recurrent_param().bias_filler(); + + // Add generic LayerParameter's (without bottoms/tops) of layer types we'll + // use to save redundant code. + LayerParameter hidden_param; + hidden_param.set_type("InnerProduct"); + hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4); + hidden_param.mutable_inner_product_param()->set_bias_term(false); + hidden_param.mutable_inner_product_param()->set_axis(2); + hidden_param.mutable_inner_product_param()-> + mutable_weight_filler()->CopyFrom(weight_filler); + + LayerParameter biased_hidden_param(hidden_param); + biased_hidden_param.mutable_inner_product_param()->set_bias_term(true); + biased_hidden_param.mutable_inner_product_param()-> + mutable_bias_filler()->CopyFrom(bias_filler); + + LayerParameter sum_param; + sum_param.set_type("Eltwise"); + sum_param.mutable_eltwise_param()->set_operation( + EltwiseParameter_EltwiseOp_SUM); + + LayerParameter slice_param; + slice_param.set_type("Slice"); + slice_param.mutable_slice_param()->set_axis(0); + + LayerParameter split_param; + split_param.set_type("Split"); + + BlobShape input_shape; + input_shape.add_dim(1); // c_0 and h_0 are a single timestep + input_shape.add_dim(this->N_); + input_shape.add_dim(num_output); + + net_param->add_input("c_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + net_param->add_input("h_0"); + net_param->add_input_shape()->CopyFrom(input_shape); + + LayerParameter* cont_slice_param = net_param->add_layer(); + cont_slice_param->CopyFrom(slice_param); + cont_slice_param->set_name("cont_slice"); + cont_slice_param->add_bottom("cont"); + cont_slice_param->mutable_slice_param()->set_axis(1); + + // Add layer to transform all timesteps of x to the hidden state dimension. 
+ // W_xc_x = W_xc * x + b_c + { + LayerParameter* x_transform_param = net_param->add_layer(); + x_transform_param->CopyFrom(biased_hidden_param); + x_transform_param->set_name("x_transform"); + x_transform_param->add_param()->set_name("W_xc"); + x_transform_param->add_param()->set_name("b_c"); + x_transform_param->add_bottom("x"); + x_transform_param->add_top("W_xc_x"); + } + + if (this->static_input_) { + // Add layer to transform x_static to the gate dimension. + // W_xc_x_static = W_xc_static * x_static + LayerParameter* x_static_transform_param = net_param->add_layer(); + x_static_transform_param->CopyFrom(hidden_param); + x_static_transform_param->mutable_inner_product_param()->set_axis(1); + x_static_transform_param->set_name("W_xc_x_static"); + x_static_transform_param->add_param()->set_name("W_xc_static"); + x_static_transform_param->add_bottom("x_static"); + x_static_transform_param->add_top("W_xc_x_static"); + + LayerParameter* reshape_param = net_param->add_layer(); + reshape_param->set_type("Reshape"); + BlobShape* new_shape = + reshape_param->mutable_reshape_param()->mutable_shape(); + new_shape->add_dim(1); // One timestep. 
+ new_shape->add_dim(this->N_); + new_shape->add_dim( + x_static_transform_param->inner_product_param().num_output()); + reshape_param->add_bottom("W_xc_x_static"); + reshape_param->add_top("W_xc_x_static"); + } + + LayerParameter* x_slice_param = net_param->add_layer(); + x_slice_param->CopyFrom(slice_param); + x_slice_param->add_bottom("W_xc_x"); + x_slice_param->set_name("W_xc_x_slice"); + + LayerParameter output_concat_layer; + output_concat_layer.set_name("h_concat"); + output_concat_layer.set_type("Concat"); + output_concat_layer.add_top("h"); + output_concat_layer.mutable_concat_param()->set_axis(0); + + for (int t = 1; t <= this->T_; ++t) { + string tm1s = this->int_to_str(t - 1); + string ts = this->int_to_str(t); + + cont_slice_param->add_top("cont_" + ts); + x_slice_param->add_top("W_xc_x_" + ts); + + // Add layers to flush the hidden state when beginning a new + // sequence, as indicated by cont_t. + // h_conted_{t-1} := cont_t * h_{t-1} + // + // Normally, cont_t is binary (i.e., 0 or 1), so: + // h_conted_{t-1} := h_{t-1} if cont_t == 1 + // 0 otherwise + { + LayerParameter* cont_h_param = net_param->add_layer(); + cont_h_param->CopyFrom(sum_param); + cont_h_param->mutable_eltwise_param()->set_coeff_blob(true); + cont_h_param->set_name("h_conted_" + tm1s); + cont_h_param->add_bottom("h_" + tm1s); + cont_h_param->add_bottom("cont_" + ts); + cont_h_param->add_top("h_conted_" + tm1s); + } + + // Add layer to compute + // W_hc_h_{t-1} := W_hc * h_conted_{t-1} + { + LayerParameter* w_param = net_param->add_layer(); + w_param->CopyFrom(hidden_param); + w_param->set_name("transform_" + ts); + w_param->add_param()->set_name("W_hc"); + w_param->add_bottom("h_conted_" + tm1s); + w_param->add_top("W_hc_h_" + tm1s); + w_param->mutable_inner_product_param()->set_axis(2); + } + + // Add the outputs of the linear transformations to compute the gate input. 
+ // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c + // = W_hc_h_{t-1} + W_xc_x_t + b_c + { + LayerParameter* input_sum_layer = net_param->add_layer(); + input_sum_layer->CopyFrom(sum_param); + input_sum_layer->set_name("gate_input_" + ts); + input_sum_layer->add_bottom("W_hc_h_" + tm1s); + input_sum_layer->add_bottom("W_xc_x_" + ts); + if (this->static_input_) { + input_sum_layer->add_bottom("W_xc_x_static"); + } + input_sum_layer->add_top("gate_input_" + ts); + } + + // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t. + // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t + // Outputs: c_t, h_t + // [ i_t' ] + // [ f_t' ] := gate_input_t + // [ o_t' ] + // [ g_t' ] + // i_t := \sigmoid[i_t'] + // f_t := \sigmoid[f_t'] + // o_t := \sigmoid[o_t'] + // g_t := \tanh[g_t'] + // c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t) + // h_t := o_t .* \tanh[c_t] + { + LayerParameter* lstm_unit_param = net_param->add_layer(); + lstm_unit_param->set_type("LSTMUnit"); + lstm_unit_param->add_bottom("c_" + tm1s); + lstm_unit_param->add_bottom("gate_input_" + ts); + lstm_unit_param->add_bottom("cont_" + ts); + lstm_unit_param->add_top("c_" + ts); + lstm_unit_param->add_top("h_" + ts); + lstm_unit_param->set_name("unit_" + ts); + } + output_concat_layer.add_bottom("h_" + ts); + } // for (int t = 1; t <= this->T_; ++t) + + { + LayerParameter* c_T_copy_param = net_param->add_layer(); + c_T_copy_param->CopyFrom(split_param); + c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_)); + c_T_copy_param->add_top("c_T"); + } + net_param->add_layer()->CopyFrom(output_concat_layer); +} + +INSTANTIATE_CLASS(LSTMLayer); +REGISTER_LAYER_CLASS(LSTM); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp new file mode 100644 index 00000000000..74078d264f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "caffe/layer.hpp" 
+#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template +inline Dtype sigmoid(Dtype x) { + return 1. / (1. + exp(-x)); +} + +template +inline Dtype tanh(Dtype x) { + return 2. * sigmoid(2. * x) - 1.; +} + +template +void LSTMUnitLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + for (int i = 0; i < bottom.size(); ++i) { + CHECK_EQ(3, bottom[i]->num_axes()); + CHECK_EQ(1, bottom[i]->shape(0)); + } + const int num_instances = bottom[0]->shape(1); + hidden_dim_ = bottom[0]->shape(2); + CHECK_EQ(num_instances, bottom[1]->shape(1)); + CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); + CHECK_EQ(1, bottom[2]->shape(1)); + CHECK_EQ(num_instances, bottom[2]->shape(2)); + top[0]->ReshapeLike(*bottom[0]); + top[1]->ReshapeLike(*bottom[0]); + X_acts_.ReshapeLike(*bottom[1]); +} + +template +void LSTMUnitLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + Dtype* C = top[0]->mutable_cpu_data(); + Dtype* H = top[1]->mutable_cpu_data(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 
0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = f * c_prev + i * g; + C[d] = c; + const Dtype tanh_c = tanh(c); + H[d] = o * tanh_c; + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + ++flush; + } +} + +template +void LSTMUnitLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int num = bottom[0]->shape(1); + const int x_dim = hidden_dim_ * 4; + const Dtype* C_prev = bottom[0]->cpu_data(); + const Dtype* X = bottom[1]->cpu_data(); + const Dtype* flush = bottom[2]->cpu_data(); + const Dtype* C = top[0]->cpu_data(); + const Dtype* H = top[1]->cpu_data(); + const Dtype* C_diff = top[0]->cpu_diff(); + const Dtype* H_diff = top[1]->cpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff(); + Dtype* X_diff = bottom[1]->mutable_cpu_diff(); + for (int n = 0; n < num; ++n) { + for (int d = 0; d < hidden_dim_; ++d) { + const Dtype i = sigmoid(X[d]); + const Dtype f = (*flush == 0) ? 
0 : + (*flush * sigmoid(X[1 * hidden_dim_ + d])); + const Dtype o = sigmoid(X[2 * hidden_dim_ + d]); + const Dtype g = tanh(X[3 * hidden_dim_ + d]); + const Dtype c_prev = C_prev[d]; + const Dtype c = C[d]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + d; + Dtype* i_diff = X_diff + d; + Dtype* f_diff = X_diff + 1 * hidden_dim_ + d; + Dtype* o_diff = X_diff + 2 * hidden_dim_ + d; + Dtype* g_diff = X_diff + 3 * hidden_dim_ + d; + const Dtype c_term_diff = + C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c); + *c_prev_diff = c_term_diff * f; + *i_diff = c_term_diff * g * i * (1 - i); + *f_diff = c_term_diff * c_prev * f * (1 - f); + *o_diff = H_diff[d] * tanh_c * o * (1 - o); + *g_diff = c_term_diff * i * (1 - g * g); + } + C_prev += hidden_dim_; + X += x_dim; + C += hidden_dim_; + H += hidden_dim_; + C_diff += hidden_dim_; + H_diff += hidden_dim_; + X_diff += x_dim; + C_prev_diff += hidden_dim_; + ++flush; + } +} + +#ifdef CPU_ONLY +STUB_GPU(LSTMUnitLayer); +#endif + +INSTANTIATE_CLASS(LSTMUnitLayer); +REGISTER_LAYER_CLASS(LSTMUnit); + +} // namespace caffe diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu new file mode 100644 index 00000000000..d6bf85071f5 --- /dev/null +++ b/src/caffe/layers/lstm_unit_layer.cu @@ -0,0 +1,154 @@ +#include +#include +#include + +#include "caffe/layer.hpp" +#include "caffe/sequence_layers.hpp" + +namespace caffe { + +template +__device__ Dtype sigmoid(const Dtype x) { + return Dtype(1) / (Dtype(1) + exp(-x)); +} + +template +__device__ Dtype tanh(const Dtype x) { + return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1); +} + +template +__global__ void LSTMActsForward(const int nthreads, const int dim, + const Dtype* X, Dtype* X_acts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + if (d < 3 * dim) { + X_acts[index] = sigmoid(X[index]); + } else { + X_acts[index] = tanh(X[index]); + } + } +} + +template +__global__ void 
LSTMUnitForward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* flush, + Dtype* C, Dtype* H) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = flush[n] * f * c_prev + i * g; + C[index] = c; + const Dtype tanh_c = tanh(c); + H[index] = o * tanh_c; + } +} + +template +void LSTMUnitLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X = bottom[1]->gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + Dtype* X_acts = X_acts_.mutable_gpu_data(); + Dtype* C = top[0]->mutable_gpu_data(); + Dtype* H = top[1]->mutable_gpu_data(); + const int X_count = bottom[1]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMActsForward<<>>( + X_count, hidden_dim_, X, X_acts); + CUDA_POST_KERNEL_CHECK; + // NOLINT_NEXT_LINE(whitespace/operators) + LSTMUnitForward<<>>( + count, hidden_dim_, C_prev, X_acts, flush, C, H); + CUDA_POST_KERNEL_CHECK; +} + +template +__global__ void LSTMUnitBackward(const int nthreads, const int dim, + const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H, + const Dtype* flush, const Dtype* C_diff, const Dtype* H_diff, + Dtype* C_prev_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / dim; + const int d = index % dim; + const Dtype* X_offset = X + 4 * dim * n; + const Dtype i = X_offset[d]; + const Dtype f = X_offset[1 * dim + d]; + const Dtype o = X_offset[2 * dim + d]; + const Dtype g = X_offset[3 * dim + d]; + const Dtype c_prev = C_prev[index]; + const Dtype c = C[index]; + const Dtype tanh_c = tanh(c); + Dtype* c_prev_diff = C_prev_diff + index; + 
Dtype* X_diff_offset = X_diff + 4 * dim * n; + Dtype* i_diff = X_diff_offset + d; + Dtype* f_diff = X_diff_offset + 1 * dim + d; + Dtype* o_diff = X_diff_offset + 2 * dim + d; + Dtype* g_diff = X_diff_offset + 3 * dim + d; + const Dtype c_term_diff = + C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c); + const Dtype flush_n = flush[n]; + *c_prev_diff = flush_n * c_term_diff * f; + *i_diff = c_term_diff * g; + *f_diff = flush_n * c_term_diff * c_prev; + *o_diff = H_diff[index] * tanh_c; + *g_diff = c_term_diff * i; + } +} + +template +__global__ void LSTMActsBackward(const int nthreads, const int dim, + const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int x_dim = 4 * dim; + const int d = index % x_dim; + const Dtype X_act = X_acts[index]; + if (d < 3 * dim) { + X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act); + } else { + X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act); + } + } +} + +template +void LSTMUnitLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators."; + if (!propagate_down[0] && !propagate_down[1]) { return; } + + const int count = top[1]->count(); + const Dtype* C_prev = bottom[0]->gpu_data(); + const Dtype* X_acts = X_acts_.gpu_data(); + const Dtype* flush = bottom[2]->gpu_data(); + const Dtype* C = top[0]->gpu_data(); + const Dtype* H = top[1]->gpu_data(); + const Dtype* C_diff = top[0]->gpu_diff(); + const Dtype* H_diff = top[1]->gpu_diff(); + Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff(); + Dtype* X_acts_diff = X_acts_.mutable_gpu_diff(); + LSTMUnitBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(count, hidden_dim_, + C_prev, X_acts, C, H, flush, C_diff, H_diff, C_prev_diff, X_acts_diff); + CUDA_POST_KERNEL_CHECK; + const int X_count = bottom[1]->count(); + Dtype* X_diff = bottom[1]->mutable_gpu_diff(); + 
LSTMActsBackward // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + X_count, hidden_dim_, X_acts, X_acts_diff, X_diff); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer); + +} // namespace caffe diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp new file mode 100644 index 00000000000..a0ce45f6383 --- /dev/null +++ b/src/caffe/test/test_lstm_layer.cpp @@ -0,0 +1,265 @@ +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/sequence_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class LSTMLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + LSTMLayerTest() : num_output_(7) { + blob_bottom_vec_.push_back(&blob_bottom_); + blob_bottom_vec_.push_back(&blob_bottom_flush_); + blob_top_vec_.push_back(&blob_top_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_); + unit_blob_bottom_vec_.push_back(&unit_blob_bottom_flush_); + unit_blob_top_vec_.push_back(&unit_blob_top_c_); + unit_blob_top_vec_.push_back(&unit_blob_top_h_); + + ReshapeBlobs(1, 3); + + layer_param_.mutable_recurrent_param()->set_num_output(num_output_); + FillerParameter* weight_filler = + layer_param_.mutable_recurrent_param()->mutable_weight_filler(); + weight_filler->set_type("gaussian"); + weight_filler->set_std(0.2); + FillerParameter* bias_filler = + layer_param_.mutable_recurrent_param()->mutable_bias_filler(); + bias_filler->set_type("gaussian"); + bias_filler->set_std(0.1); + + layer_param_.set_phase(TEST); + } + + void ReshapeBlobs(int num_timesteps, int num_instances) { + blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2); + vector shape(2); + shape[0] = num_timesteps; + shape[1] = num_instances; + blob_bottom_flush_.Reshape(shape); 
+ shape.push_back(num_output_); + + shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_; + unit_blob_bottom_x_.Reshape(shape); + shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_; + unit_blob_bottom_c_prev_.Reshape(shape); + shape[0] = 1; shape[1] = 1; shape[2] = num_instances; + unit_blob_bottom_flush_.Reshape(shape); + + FillerParameter filler_param; + filler_param.set_min(-1); + filler_param.set_max(1); + UniformFiller filler(filler_param); + filler.Fill(&blob_bottom_); + filler.Fill(&unit_blob_bottom_c_prev_); + filler.Fill(&unit_blob_bottom_x_); + } + + int num_output_; + LayerParameter layer_param_; + Blob blob_bottom_; + Blob blob_bottom_flush_; + Blob blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; + + Blob unit_blob_bottom_flush_; + Blob unit_blob_bottom_c_prev_; + Blob unit_blob_bottom_x_; + Blob unit_blob_top_c_; + Blob unit_blob_top_h_; + vector*> unit_blob_bottom_vec_; + vector*> unit_blob_top_vec_; +}; + +TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices); + +TYPED_TEST(LSTMLayerTest, TestSetUp) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + vector expected_top_shape = this->blob_bottom_.shape(); + expected_top_shape.resize(3); + expected_top_shape[2] = this->num_output_; + EXPECT_TRUE(this->blob_top_.shape() == expected_top_shape); +} + +TYPED_TEST(LSTMLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; + const int kNumTimesteps = 3; + const int num = this->blob_bottom_.shape(1); + this->ReshapeBlobs(kNumTimesteps, num); + + // Fill the flush blob with <0, 1, 1, ..., 1>, + // indicating a sequence that begins at the first timestep + // then continues for the rest of the sequence. + for (int t = 0; t < kNumTimesteps; ++t) { + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0; + } + } + + // Process the full sequence in a single batch. 
+ FillerParameter filler_param; + filler_param.set_mean(0); + filler_param.set_std(1); + GaussianFiller sequence_filler(filler_param); + sequence_filler.Fill(&this->blob_bottom_); + shared_ptr > layer(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + LOG(INFO) << "Calling forward for full sequence LSTM"; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + // Copy the inputs and outputs to reuse/check them later. + Blob bottom_copy(this->blob_bottom_.shape()); + bottom_copy.CopyFrom(this->blob_bottom_); + Blob top_copy(this->blob_top_.shape()); + top_copy.CopyFrom(this->blob_top_); + + // Process the batch one timestep at a time; + // check that we get the same result. + this->ReshapeBlobs(1, num); + layer.reset(new LSTMLayer(this->layer_param_)); + Caffe::set_random_seed(1701); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int bottom_count = this->blob_bottom_.count(); + const int top_count = this->blob_top_.count(); + const Dtype kEpsilon = 1e-5; + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + ASSERT_LT(t * top_count + i, top_copy.count()); + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } + } + + // Process the batch one timestep at a time with all flush blobs set to 0. + // Check that we get a different result, except in the first timestep. 
+ Caffe::set_random_seed(1701); + layer.reset(new LSTMLayer(this->layer_param_)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + for (int t = 0; t < kNumTimesteps; ++t) { + caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count, + this->blob_bottom_.mutable_cpu_data()); + for (int n = 0; n < num; ++n) { + this->blob_bottom_flush_.mutable_cpu_data()[n] = 0; + } + LOG(INFO) << "Calling forward for LSTM timestep " << t; + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int i = 0; i < top_count; ++i) { + if (t == 0) { + EXPECT_NEAR(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i], kEpsilon) + << "t = " << t << "; i = " << i; + } else { + EXPECT_NE(this->blob_top_.cpu_data()[i], + top_copy.cpu_data()[t * top_count + i]) + << "t = " << t << "; i = " << i; + } + } + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_); + const int num_axes = this->unit_blob_bottom_c_prev_.num_axes(); + ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes()); + ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes()); + for (int i = 0; i < num_axes; ++i) { + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_c_.shape(i)); + EXPECT_EQ(this->unit_blob_bottom_c_prev_.shape(i), + this->unit_blob_top_h_.shape(i)); + } +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 0; + flush_data[1] = 0; + flush_data[2] = 0; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + 
this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + LSTMUnitLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + Dtype* flush_data = this->blob_bottom_flush_.mutable_cpu_data(); + flush_data[0] = 1; + flush_data[1] = 0; + flush_data[2] = 1; + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_, + this->unit_blob_top_vec_, 1); +} + +TYPED_TEST(LSTMLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlush) { + typedef typename TypeParam::Dtype Dtype; + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlushBufferSize2) { + typedef typename TypeParam::Dtype Dtype; + this->ReshapeBlobs(2, 2); + // fill the values + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(&this->blob_bottom_); + LSTMLayer layer(this->layer_param_); + GradientChecker checker(1e-2, 1e-3); + for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) { + this->blob_bottom_flush_.mutable_cpu_data()[i] = i > 2; + } + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); +} + +} // namespace caffe From 9be6d73399b0b3dd938d11076d9b1b0a92005a38 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 14:29:46 -0800 Subject: [PATCH 22/26] Add scripts 
for downloading COCO2014 tools & data --- data/coco/README.md | 24 ++++++++++++++++++++++++ data/coco/download_tools.sh | 5 +++++ data/coco/get_coco2014_aux.sh | 13 +++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 data/coco/README.md create mode 100755 data/coco/download_tools.sh create mode 100755 data/coco/get_coco2014_aux.sh diff --git a/data/coco/README.md b/data/coco/README.md new file mode 100644 index 00000000000..34850631f4d --- /dev/null +++ b/data/coco/README.md @@ -0,0 +1,24 @@ +For details about the Microsoft COCO ("Common Objects in Context") dataset [1], +visit mscoco.org. This README provides instructions for downloading and +installing the tools and dataset. + +1) Download and extract the COCO Python tools by running: + + ./download_tools.sh + +2) Install the tools, and optionally download the data by running: + + cd tools + python setup.py install # follow prompts to download or skip data + +3) Download train/val/test splits using: + + ./get_coco2014_aux.sh + +(or see the COCO README (tools/README) for more information). + + +[1] Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, + Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. + "Microsoft COCO: Common Objects in Context." + arXiv preprint arXiv:1405.0312 (2014). diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh new file mode 100755 index 00000000000..c90bc1a2624 --- /dev/null +++ b/data/coco/download_tools.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +wget http://msvocds.blob.core.windows.net/annotations-0-9/tools.zip +unzip tools.zip +rm tools.zip diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh new file mode 100755 index 00000000000..91cf3e1ef0d --- /dev/null +++ b/data/coco/get_coco2014_aux.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env sh +# +# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files. + +echo "Downloading..." 
+ +wget http://dl.caffe.berkeleyvision.org/coco2014_aux.tar.gz + +echo "Unzipping..." + +tar -xf coco2014_aux.tar.gz && rm -f coco2014_aux.tar.gz + +echo "Done." From bf1c90cb81ca750e191c36d3bd2c632317a10632 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 15:36:09 -0800 Subject: [PATCH 23/26] Add scripts to create HDF5 datasets from COCO captions --- examples/coco_caption/.gitignore | 1 + examples/coco_caption/coco_to_hdf5_data.py | 267 ++++++++++++++++++ .../coco_caption/hdf5_sequence_generator.py | 132 +++++++++ 3 files changed, 400 insertions(+) create mode 100644 examples/coco_caption/.gitignore create mode 100755 examples/coco_caption/coco_to_hdf5_data.py create mode 100644 examples/coco_caption/hdf5_sequence_generator.py diff --git a/examples/coco_caption/.gitignore b/examples/coco_caption/.gitignore new file mode 100644 index 00000000000..e040331b7f2 --- /dev/null +++ b/examples/coco_caption/.gitignore @@ -0,0 +1 @@ +h5_data/ diff --git a/examples/coco_caption/coco_to_hdf5_data.py b/examples/coco_caption/coco_to_hdf5_data.py new file mode 100755 index 00000000000..233ee829078 --- /dev/null +++ b/examples/coco_caption/coco_to_hdf5_data.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python + +from hashlib import sha1 +import os +import random +random.seed(3) +import re +import sys + +sys.path.append('./examples/coco_caption/') + +COCO_PATH = './data/coco/tools' +COCO_TOOL_PATH = '%s/pycocotools' % COCO_PATH + +MAX_HASH = 100000 + +sys.path.append(COCO_TOOL_PATH) +from coco import COCO + +from hdf5_sequence_generator import SequenceGenerator, HDF5SequenceWriter + +# UNK_IDENTIFIER is the word used to identify unknown words +UNK_IDENTIFIER = '' + +SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') +def split_sentence(sentence): + # break sentence into a list of words and punctuation + sentence = [s.lower() for s in SENTENCE_SPLIT_REGEX.split(sentence.strip()) if len(s.strip()) > 0] + # remove the '.' 
from the end of the sentence + if sentence[-1] != '.': + # print "Warning: sentence doesn't end with '.'; ends with: %s" % sentence[-1] + return sentence + return sentence[:-1] + +MAX_WORDS = 20 + +class CocoSequenceGenerator(SequenceGenerator): + def __init__(self, coco, batch_num_streams, vocab=None, + max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True, + pad=True, truncate=True, split_ids=None): + self.max_words = max_words + num_empty_lines = 0 + self.images = [] + num_total = 0 + num_missing = 0 + num_captions = 0 + known_images = {} + image_root = '%s/%s' % (COCO_PATH, coco.image_folder) + if split_ids is None: + split_ids = coco.images.keys() + for image_id in split_ids: + image_info = coco.images[image_id] + image_path = '%s/%s/%s' % \ + (image_root, image_info['file_path'], image_info['file_name']) + if os.path.isfile(image_path): + assert image_id not in known_images # no duplicates allowed + known_images[image_id] = {} + known_images[image_id]['path'] = image_path + if gt_captions: + known_images[image_id]['sentences'] = [split_sentence(anno['sentence']) + for anno in coco.image_to_annotations[image_id]] + num_captions += len(known_images[image_id]['sentences']) + else: + known_images[image_id]['sentences'] = [] + else: + num_missing += 1 + print 'Warning (#%d): image not found: %s' % (num_missing, image_path) + num_total += 1 + print '%d/%d images missing' % (num_missing, num_total) + if vocab is None: + self.init_vocabulary(known_images) + else: + self.vocabulary_inverted = vocab + self.vocabulary = {} + for index, word in enumerate(self.vocabulary_inverted): + self.vocabulary[word] = index + self.image_sentence_pairs = [] + num_no_sentences = 0 + for image_filename, metadata in known_images.iteritems(): + if not metadata['sentences']: + num_no_sentences += 1 + print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_filename) + for sentence in metadata['sentences']: + 
self.image_sentence_pairs.append((metadata['path'], sentence)) + self.index = 0 + self.num_resets = 0 + self.num_truncates = 0 + self.num_pads = 0 + self.num_outs = 0 + self.image_list = [] + SequenceGenerator.__init__(self) + self.batch_num_streams = batch_num_streams + # make the number of image/sentence pairs a multiple of the buffer size + # so each timestep of each batch is useful and we can align the images + if align: + num_pairs = len(self.image_sentence_pairs) + remainder = num_pairs % batch_num_streams + if remainder > 0: + num_needed = batch_num_streams - remainder + for i in range(num_needed): + choice = random.randint(0, num_pairs - 1) + self.image_sentence_pairs.append(self.image_sentence_pairs[choice]) + assert len(self.image_sentence_pairs) % batch_num_streams == 0 + if shuffle: + random.shuffle(self.image_sentence_pairs) + self.pad = pad + self.truncate = truncate + self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence')) + + def streams_exhausted(self): + return self.num_resets > 0 + + def init_vocabulary(self, image_annotations, min_count=5): + words_to_count = {} + for image_id, annotations in image_annotations.iteritems(): + for annotation in annotations['sentences']: + for word in annotation: + word = word.strip() + if word not in words_to_count: + words_to_count[word] = 0 + words_to_count[word] += 1 + # Sort words by count, then alphabetically + words_by_count = sorted(words_to_count.keys(), key=lambda w: (-words_to_count[w], w)) + print 'Initialized vocabulary with %d words; top 10 words:' % len(words_by_count) + for word in words_by_count[:10]: + print '\t%s (%d)' % (word, words_to_count[word]) + # Add words to vocabulary + self.vocabulary = {UNK_IDENTIFIER: 0} + self.vocabulary_inverted = [UNK_IDENTIFIER] + for index, word in enumerate(words_by_count): + word = word.strip() + if words_to_count[word] < min_count: + break + self.vocabulary_inverted.append(word) + self.vocabulary[word] = index + 1 + print 'Final 
vocabulary (restricted to words with counts of %d+) has %d words' % \ + (min_count, len(self.vocabulary)) + + def dump_vocabulary(self, vocab_filename): + print 'Dumping vocabulary to file: %s' % vocab_filename + with open(vocab_filename, 'wb') as vocab_file: + for word in self.vocabulary_inverted: + vocab_file.write('%s\n' % word) + print 'Done.' + + def dump_image_file(self, image_filename, dummy_image_filename=None): + print 'Dumping image list to file: %s' % image_filename + with open(image_filename, 'wb') as image_file: + for image_path, _ in self.image_list: + image_file.write('%s\n' % image_path) + if dummy_image_filename is not None: + print 'Dumping image list with dummy labels to file: %s' % dummy_image_filename + with open(dummy_image_filename, 'wb') as image_file: + for path_and_hash in self.image_list: + image_file.write('%s %d\n' % path_and_hash) + print 'Done.' + + def next_line(self): + num_lines = float(len(self.image_sentence_pairs)) + self.index += 1 + if self.index == 1 or self.index == num_lines or self.index % 10000 == 0: + print 'Processed %d/%d (%f%%) lines' % (self.index, num_lines, + 100 * self.index / num_lines) + if self.index == num_lines: + self.index = 0 + self.num_resets += 1 + + def line_to_stream(self, sentence): + stream = [] + for word in sentence: + word = word.strip() + if word in self.vocabulary: + stream.append(self.vocabulary[word]) + else: # unknown word; append UNK + stream.append(self.vocabulary[UNK_IDENTIFIER]) + # increment the stream -- 0 will be the EOS character + stream = [s + 1 for s in stream] + return stream + + def get_pad_value(self, stream_name): + return -1 if stream_name in self.negative_one_padded_streams else 0 + + def get_streams(self): + image_filename, line = self.image_sentence_pairs[self.index] + stream = self.line_to_stream(line) + pad = self.max_words - (len(stream) + 1) if self.pad else 0 + if pad > 0: self.num_pads += 1 + self.num_outs += 1 + out = {} + out['stage_indicators'] = [1] * (len(stream) 
+ 1) + [0] * pad + out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad + out['input_sentence'] = [0] + stream + [-1] * pad + out['target_sentence'] = stream + [0] + [-1] * pad + truncated = False + if self.truncate: + for key, val in out.iteritems(): + if len(val) > self.max_words: + out[key] = val[:self.max_words] + truncated = True + self.num_truncates += truncated + image_hash = self.image_hash(image_filename) + out['hashed_image_path'] = [image_hash] * len(out['input_sentence']) + self.image_list.append((image_filename, image_hash)) + self.next_line() + return out + + def image_hash(self, filename): + image_hash = int(sha1(filename).hexdigest(), 16) % MAX_HASH + assert image_hash == float(image_hash) + return image_hash + +COCO_ANNO_PATH = '%s/annotations/sentences_%%s2014.json' % COCO_PATH +COCO_IMAGE_PATTERN = '%s/images/%%s2014' % COCO_PATH +COCO_IMAGE_ID_PATTERN = 'COCO_%s2014_%%012d.jpg' + +BUFFER_SIZE = 100 +OUTPUT_DIR = './examples/coco_caption/h5_data/buffer_%d' % BUFFER_SIZE +SPLITS_PATTERN = './data/coco/coco2014_cocoid.%s.txt' +OUTPUT_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_DIR + +def preprocess_dataset(split_name, coco_split_name, batch_stream_length, + vocab=None, aligned=True): + with open(SPLITS_PATTERN % split_name, 'r') as split_file: + split_image_ids = [int(line) for line in split_file.readlines()] + output_dataset_name = split_name + if aligned: + output_dataset_name += '_aligned_%d' % MAX_WORDS + else: + output_dataset_name += '_unaligned' + output_path = OUTPUT_DIR_PATTERN % output_dataset_name + coco = COCO(COCO_ANNO_PATH % coco_split_name) + sg = CocoSequenceGenerator(coco, BUFFER_SIZE, split_ids=split_image_ids, + vocab=vocab, align=aligned, pad=aligned, truncate=aligned) + sg.batch_stream_length = batch_stream_length + writer = HDF5SequenceWriter(sg, output_dir=output_path) + writer.write_to_exhaustion() + writer.write_filelists() + if vocab is None: + vocab_out_path = '%s/vocabulary.txt' % OUTPUT_DIR + 
sg.dump_vocabulary(vocab_out_path) + image_out_path = '%s/image_list.txt' % output_path + image_dummy_labels_out_path = '%s/image_list.with_dummy_labels.txt' % output_path + sg.dump_image_file(image_out_path, image_dummy_labels_out_path) + num_outs = sg.num_outs + num_pads = sg.num_pads + num_truncates = sg.num_truncates + print 'Padded %d/%d sequences; truncated %d/%d sequences' % \ + (num_pads, num_outs, num_truncates, num_outs) + return sg.vocabulary_inverted + +def preprocess_coco(): + vocab = None + DATASETS = [ + ('train', 'train', 100000, True), + ('val', 'val', 100000, True), + ('test', 'val', 100000, True), + # Write unaligned datasets as well: + ('train', 'train', 100000, False), + ('val', 'val', 100000, False), + ('test', 'val', 100000, False), + ] + for split_name, coco_split_name, batch_stream_length, aligned in DATASETS: + vocab = preprocess_dataset(split_name, coco_split_name, batch_stream_length, + vocab=vocab, aligned=aligned) + +if __name__ == "__main__": + preprocess_coco() diff --git a/examples/coco_caption/hdf5_sequence_generator.py b/examples/coco_caption/hdf5_sequence_generator.py new file mode 100644 index 00000000000..98d4657b6bf --- /dev/null +++ b/examples/coco_caption/hdf5_sequence_generator.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +import h5py +import numpy as np +import os +import random +import sys + +class SequenceGenerator(): + def __init__(self): + self.dimension = 10 + self.batch_stream_length = 2000 + self.batch_num_streams = 8 + self.min_stream_length = 13 + self.max_stream_length = 17 + self.substream_names = None + self.streams_initialized = False + + def streams_exhausted(self): + return False + + def init_streams(self): + self.streams = [None] * self.batch_num_streams + self.stream_indices = [0] * self.batch_num_streams + self.reset_stream(0) + self.streams_initialized = True + + def reset_stream(self, stream_index): + streams = self.get_streams() + stream_names = sorted(streams.keys()) + if self.substream_names is 
None: + assert len(stream_names) > 0 + self.substream_names = stream_names + assert self.substream_names == stream_names + if self.streams[stream_index] is None: + self.streams[stream_index] = {} + stream_length = len(streams[stream_names[0]]) + for k, v in streams.iteritems(): + assert stream_length == len(v) + self.streams[stream_index][k] = v + self.stream_indices[stream_index] = 0 + + # Pad with zeroes by default -- override this to pad with something else + # for a particular stream + def get_pad_value(self, stream_name): + return 0 + + def get_next_batch(self, truncate_at_exhaustion=True): + if not self.streams_initialized: + self.init_streams() + batch_size = self.batch_num_streams * self.batch_stream_length + batch = {} + batch_indicators = np.zeros((self.batch_stream_length, self.batch_num_streams)) + for name in self.substream_names: + batch[name] = self.get_pad_value(name) * np.ones_like(batch_indicators) + exhausted = [False] * self.batch_num_streams + all_exhausted = False + reached_exhaustion = False + num_completed_streams = 0 + for t in range(self.batch_stream_length): + all_exhausted = True + for i in range(self.batch_num_streams): + if not exhausted[i]: + if self.streams[i] is None or \ + self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + self.stream_indices[i] = 0 + reached_exhaustion = reached_exhaustion or self.streams_exhausted() + if reached_exhaustion: exhausted[i] = True + if not reached_exhaustion or not truncate_at_exhaustion: + self.reset_stream(i) + else: + continue + for name in self.substream_names: + batch[name][t, i] = self.streams[i][name][self.stream_indices[i]] + batch_indicators[t, i] = 0 if self.stream_indices[i] == 0 else 1 + self.stream_indices[i] += 1 + if self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]): + num_completed_streams += 1 + if not exhausted[i]: all_exhausted = False + if all_exhausted and truncate_at_exhaustion: + print ('Exhausted all data; cutting off batch at 
timestep %d ' + + 'with %d streams completed') % (t, num_completed_streams) + for name in self.substream_names: + batch[name] = batch[name][:t, :] + batch_indicators = batch_indicators[:t, :] + break + return batch, batch_indicators + + def get_streams(self): + raise Exception('get_streams should be overridden to return a dict ' + + 'of equal-length iterables.') + +class HDF5SequenceWriter(): + def __init__(self, sequence_generator, output_dir=None, verbose=False): + self.generator = sequence_generator + assert output_dir is not None # required + self.output_dir = output_dir + if os.path.exists(output_dir): + raise Exception('Output directory already exists: ' + output_dir) + os.makedirs(output_dir) + self.verbose = verbose + self.filenames = [] + + def write_batch(self, stop_at_exhaustion=False): + batch_comps, cont_indicators = self.generator.get_next_batch() + batch_index = len(self.filenames) + filename = '%s/batch_%d.h5' % (self.output_dir, batch_index) + self.filenames.append(filename) + h5file = h5py.File(filename, 'w') + dataset = h5file.create_dataset('cont', shape=cont_indicators.shape, dtype=cont_indicators.dtype) + dataset[:] = cont_indicators + dataset = h5file.create_dataset('buffer_size', shape=(1,), dtype=np.int) + dataset[:] = self.generator.batch_num_streams + for key, batch in batch_comps.iteritems(): + if self.verbose: + for s in range(self.generator.batch_num_streams): + stream = np.array(self.generator.streams[s][key]) + print 'batch %d, stream %s, index %d: ' % (batch_index, key, s), stream + h5dataset = h5file.create_dataset(key, shape=batch.shape, dtype=batch.dtype) + h5dataset[:] = batch + h5file.close() + + def write_to_exhaustion(self): + while not self.generator.streams_exhausted(): + self.write_batch(stop_at_exhaustion=True) + + def write_filelists(self): + assert self.filenames is not None + filelist_filename = '%s/hdf5_chunk_list.txt' % self.output_dir + with open(filelist_filename, 'w') as listfile: + for filename in self.filenames: 
+ listfile.write('%s\n' % filename) From 87d9038bc3ca36942cc427badfdeedd66ebe86aa Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 15:55:31 -0800 Subject: [PATCH 24/26] Prototxts + script for training COCO caption language model --- .../coco_caption/lstm_language_model.prototxt | 150 ++++++++++++++++++ examples/coco_caption/lstm_lm_solver.prototxt | 21 +++ examples/coco_caption/train_language_model.sh | 14 ++ 3 files changed, 185 insertions(+) create mode 100644 examples/coco_caption/lstm_language_model.prototxt create mode 100644 examples/coco_caption/lstm_lm_solver.prototxt create mode 100755 examples/coco_caption/train_language_model.sh diff --git a/examples/coco_caption/lstm_language_model.prototxt b/examples/coco_caption/lstm_language_model.prototxt new file mode 100644 index 00000000000..68fda5464fe --- /dev/null +++ b/examples/coco_caption/lstm_language_model.prototxt @@ -0,0 +1,150 @@ +name: "lstm_language_model" +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_unaligned_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + 
embed_param { + bias_term: false + input_dim: 8801 # = vocab_size + 1 (for EOS) + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "embed-drop" + type: "Dropout" + bottom: "embedded_input_sentence" + top: "embedded_input_sentence" + dropout_param { dropout_ratio: 0.5 } + include { stage: "embed-drop" } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm-drop" + type: "Dropout" + bottom: "lstm1" + top: "lstm1" + dropout_param { dropout_ratio: 0.5 } + include { stage: "lstm-drop" } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + inner_product_param { + num_output: 8801 # = vocab_size + 1 (+1 for EOS) + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lstm_lm_solver.prototxt b/examples/coco_caption/lstm_lm_solver.prototxt new file mode 100644 index 00000000000..fb36ad15a5b --- /dev/null +++ b/examples/coco_caption/lstm_lm_solver.prototxt @@ -0,0 +1,21 @@ +net: "./examples/coco_caption/lstm_language_model.prototxt" +train_state: { stage: 'embed-drop' stage: 'lstm-drop' } +test_iter: 25 +test_state: { stage: 
'test-on-train' } +test_iter: 25 +test_state: { stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.1 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lstm_lm" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/train_language_model.sh b/examples/coco_caption/train_language_model.sh new file mode 100755 index 00000000000..6e8a8c47b37 --- /dev/null +++ b/examples/coco_caption/train_language_model.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_ID=0 +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lstm_lm_solver.prototxt \ + -gpu $GPU_ID From 80e9c41de4a802bd88ddafc9166d7d0244feca3f Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 16 Feb 2015 16:08:28 -0800 Subject: [PATCH 25/26] Prototxts + script for training LRCN COCO image captioning model --- examples/coco_caption/lrcn.prototxt | 768 +++++++++++++++++++++ examples/coco_caption/lrcn_solver.prototxt | 30 + examples/coco_caption/train_lrcn.sh | 17 + 3 files changed, 815 insertions(+) create mode 100644 examples/coco_caption/lrcn.prototxt create mode 100644 examples/coco_caption/lrcn_solver.prototxt create mode 100755 examples/coco_caption/train_lrcn.sh diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt new file mode 100644 index 00000000000..f0888df23bf --- /dev/null +++ b/examples/coco_caption/lrcn.prototxt @@ -0,0 +1,768 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. 
+# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). + +name: "lrcn_caffenet_to_lstm" +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: 
"./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + 
convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + 
type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + 
exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + # exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + 
type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { + axis: 2 + ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt new file mode 100644 index 00000000000..65ca272b30c --- /dev/null +++ b/examples/coco_caption/lrcn_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). 
+# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh new file mode 100755 index 00000000000..5099e762ccd --- /dev/null +++ b/examples/coco_caption/train_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! 
-d $DATA_DIR ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.prototxt \ + -weights $WEIGHTS \ + -gpu $GPU_ID From d3ebf3e5b1c721770df51ae99e9bb4de8ef9b2b1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 26 Mar 2015 00:56:31 -0700 Subject: [PATCH 26/26] RecurrentLayer bugfix: params still need backprop --- src/caffe/layers/recurrent_layer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp index 7dc38fec901..13f66f9893e 100644 --- a/src/caffe/layers/recurrent_layer.cpp +++ b/src/caffe/layers/recurrent_layer.cpp @@ -208,8 +208,12 @@ template <typename Dtype> void RecurrentLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) { CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators."; - if (!propagate_down[0] && !propagate_down[2]) { return; } + // TODO: skip backpropagation to inputs and parameters inside the unrolled + // net according to propagate_down[0] and propagate_down[2]. For now just + // backprop to inputs and parameters unconditionally, as either the inputs or + // the parameters do need backward (or Net would have set + // layer_needs_backward_[i] == false for this layer). unrolled_net_->Backward(); }