From ed3fe9f0762158f44810d370eb43d0f5c8e243be Mon Sep 17 00:00:00 2001
From: Wei Liu
Date: Wed, 6 Jul 2016 23:00:12 -0400
Subject: [PATCH] add cpp demo code to evaluate a model

---
 README.md                   |   2 +-
 examples/ssd/ssd_detect.cpp | 302 ++++++++++++++++++++++++++++++++++++
 2 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 examples/ssd/ssd_detect.cpp

diff --git a/README.md b/README.md
index 77f94dfe..00ca36de 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,7 @@ Please cite SSD in your publications if it helps your research:
   ```
   [Here](https://drive.google.com/file/d/0BzKzrI_SkD1_R09NcjM1eElLcWc/view) is a demo video of running a SSD500 model trained on [MSCOCO](http://mscoco.org) dataset.

-4. Check out `examples/ssd_detect.ipynb` on how to detect objects using a SSD model.
+4. Check out `examples/ssd_detect.ipynb` or `examples/ssd/ssd_detect.cpp` on how to detect objects using a SSD model.

 5. To train on other dataset, please refer to data/OTHERDATASET for more details.
    We currently add support for MSCOCO and ILSVRC2016.

diff --git a/examples/ssd/ssd_detect.cpp b/examples/ssd/ssd_detect.cpp
new file mode 100644
index 00000000..2743ea89
--- /dev/null
+++ b/examples/ssd/ssd_detect.cpp
@@ -0,0 +1,302 @@
// This is a demo code for using a SSD model to do detection.
// The code is modified from examples/cpp_classification/classification.cpp.
// Usage:
//    ssd_detect [FLAGS] model_file weights_file list_file
//
// where model_file is the .prototxt file defining the network architecture,
// weights_file is the .caffemodel file containing the network parameters, and
// list_file contains a list of image files with the following format:
//    folder/img1.JPEG
//    folder/img2.JPEG
//
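// Illustrative invocation (paths are placeholders, not files shipped with
// this patch). With the default Makefile build the compiled demo typically
// lands at build/examples/ssd/ssd_detect.bin, so a run could look like:
//
//    ./build/examples/ssd/ssd_detect.bin deploy.prototxt model.caffemodel list.txt
//
// optionally followed by the flags defined later in this file, e.g.
// --mean_value 104,117,123 --confidence_threshold 0.5 --out_file output.txt
//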
#include <caffe/caffe.hpp>
#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif  // USE_OPENCV
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using namespace caffe;  // NOLINT(build/namespaces)

class Detector {
 public:
  Detector(const string& model_file,
           const string& weights_file,
           const string& mean_file,
           const string& mean_value);

  std::vector<vector<float> > Detect(const cv::Mat& img);

 private:
  void SetMean(const string& mean_file, const string& mean_value);

  void WrapInputLayer(std::vector<cv::Mat>* input_channels);

  void Preprocess(const cv::Mat& img,
                  std::vector<cv::Mat>* input_channels);

 private:
  shared_ptr<Net<float> > net_;
  cv::Size input_geometry_;
  int num_channels_;
  cv::Mat mean_;
};

Detector::Detector(const string& model_file,
                   const string& weights_file,
                   const string& mean_file,
                   const string& mean_value) {
#ifdef CPU_ONLY
  Caffe::set_mode(Caffe::CPU);
#else
  Caffe::set_mode(Caffe::GPU);
#endif

  /* Load the network. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(weights_file);

  CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
  CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  CHECK(num_channels_ == 3 || num_channels_ == 1)
    << "Input layer should have 1 or 3 channels.";
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());

  /* Load the binaryproto mean file. */
  SetMean(mean_file, mean_value);
}

std::vector<vector<float> > Detector::Detect(const cv::Mat& img) {
  Blob<float>* input_layer = net_->input_blobs()[0];
  input_layer->Reshape(1, num_channels_,
                       input_geometry_.height, input_geometry_.width);
  /* Forward dimension change to all layers. */
  net_->Reshape();

  std::vector<cv::Mat> input_channels;
  WrapInputLayer(&input_channels);

  Preprocess(img, &input_channels);

  net_->Forward();

  /* Copy the output layer to a std::vector */
  Blob<float>* result_blob = net_->output_blobs()[0];
  const float* result = result_blob->cpu_data();
  const int num_det = result_blob->height();
  vector<vector<float> > detections;
  for (int k = 0; k < num_det; ++k) {
    vector<float> detection(result, result + 7);
    detections.push_back(detection);
    result += 7;
  }
  return detections;
}
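// Note on the value returned by Detect(): each inner vector holds the seven
// floats [image_id, label, score, xmin, ymin, xmax, ymax], with the box
// coordinates expressed as fractions of the image size (main() below scales
// them by the image width and height). A purely illustrative row such as
//    [0, 12, 0.87, 0.10, 0.22, 0.46, 0.81]
// would mean a detection of class label 12 with confidence 0.87 whose box
// spans 10%-46% of the image width and 22%-81% of its height.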
/* Load the mean file in binaryproto format. */
void Detector::SetMean(const string& mean_file, const string& mean_value) {
  cv::Scalar channel_mean;
  if (!mean_file.empty()) {
    CHECK(mean_value.empty()) <<
      "Cannot specify mean_file and mean_value at the same time";
    BlobProto blob_proto;
    ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);

    /* Convert from BlobProto to Blob<float> */
    Blob<float> mean_blob;
    mean_blob.FromProto(blob_proto);
    CHECK_EQ(mean_blob.channels(), num_channels_)
      << "Number of channels of mean file doesn't match input layer.";

    /* The format of the mean file is planar 32-bit float BGR or grayscale. */
    std::vector<cv::Mat> channels;
    float* data = mean_blob.mutable_cpu_data();
    for (int i = 0; i < num_channels_; ++i) {
      /* Extract an individual channel. */
      cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
      channels.push_back(channel);
      data += mean_blob.height() * mean_blob.width();
    }

    /* Merge the separate channels into a single image. */
    cv::Mat mean;
    cv::merge(channels, mean);

    /* Compute the global mean pixel value and create a mean image
     * filled with this value. */
    channel_mean = cv::mean(mean);
    mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean);
  }
  if (!mean_value.empty()) {
    CHECK(mean_file.empty()) <<
      "Cannot specify mean_file and mean_value at the same time";
    stringstream ss(mean_value);
    vector<float> values;
    string item;
    while (getline(ss, item, ',')) {
      float value = std::atof(item.c_str());
      values.push_back(value);
    }
    CHECK(values.size() == 1 || values.size() == num_channels_) <<
      "Specify either 1 mean_value or as many as channels: " << num_channels_;

    std::vector<cv::Mat> channels;
    for (int i = 0; i < num_channels_; ++i) {
      /* Extract an individual channel. */
      cv::Mat channel(input_geometry_.height, input_geometry_.width, CV_32FC1,
          cv::Scalar(values[i]));
      channels.push_back(channel);
    }
    cv::merge(channels, mean_);
  }
}

/* Wrap the input layer of the network in separate cv::Mat objects
 * (one per channel). This way we save one memcpy operation and we
 * don't need to rely on cudaMemcpy2D. The last preprocessing
 * operation will write the separate channels directly to the input
 * layer. */
void Detector::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
  Blob<float>* input_layer = net_->input_blobs()[0];

  int width = input_layer->width();
  int height = input_layer->height();
  float* input_data = input_layer->mutable_cpu_data();
  for (int i = 0; i < input_layer->channels(); ++i) {
    cv::Mat channel(height, width, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += width * height;
  }
}

void Detector::Preprocess(const cv::Mat& img,
                          std::vector<cv::Mat>* input_channels) {
  /* Convert the input image to the input image format of the network. */
  cv::Mat sample;
  if (img.channels() == 3 && num_channels_ == 1)
    cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
  else if (img.channels() == 4 && num_channels_ == 1)
    cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
  else if (img.channels() == 4 && num_channels_ == 3)
    cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
  else if (img.channels() == 1 && num_channels_ == 3)
    cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
  else
    sample = img;

  cv::Mat sample_resized;
  if (sample.size() != input_geometry_)
    cv::resize(sample, sample_resized, input_geometry_);
  else
    sample_resized = sample;

  cv::Mat sample_float;
  if (num_channels_ == 3)
    sample_resized.convertTo(sample_float, CV_32FC3);
  else
    sample_resized.convertTo(sample_float, CV_32FC1);

  cv::Mat sample_normalized;
  cv::subtract(sample_float, mean_, sample_normalized);

  /* This operation will write the separate BGR planes directly to the
   * input layer of the network because it is wrapped by the cv::Mat
   * objects in input_channels. */
  cv::split(sample_normalized, *input_channels);

  CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
        == net_->input_blobs()[0]->cpu_data())
    << "Input channels are not wrapping the input layer of the network.";
}

DEFINE_string(mean_file, "",
    "The mean file used to subtract from the input image.");
DEFINE_string(mean_value, "104,117,123",
    "If specified, can be one value or can be same as image channels"
    " - would subtract from the corresponding channel. Separated by ','."
    " Either mean_file or mean_value should be provided, not both.");
DEFINE_string(out_file, "",
    "If provided, store the detection results in the out_file.");
DEFINE_double(confidence_threshold, 0.6,
    "Only store detections with score higher than the threshold.");
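// For reference, with illustrative values only: if list.txt contains a line
// images/cat.jpg, the loop in main() below could emit a line such as
//    images/cat.jpg 8 0.92 86 41 420 283
// i.e. the image path, predicted label, score, and the box corners
// xmin ymin xmax ymax converted to pixel coordinates.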
int main(int argc, char** argv) {
#ifdef USE_OPENCV
  ::google::InitGoogleLogging(argv[0]);
  // Print output to stderr (while still logging)
  FLAGS_alsologtostderr = 1;

#ifndef GFLAGS_GFLAGS_H_
  namespace gflags = google;
#endif

  gflags::SetUsageMessage("Do detection using SSD model.\n"
        "Usage:\n"
        "    ssd_detect [FLAGS] model_file weights_file list_file\n");
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  if (argc < 4) {
    gflags::ShowUsageWithFlagsRestrict(argv[0], "examples/ssd/ssd_detect");
    return 1;
  }

  const string& model_file = argv[1];
  const string& weights_file = argv[2];
  const string& mean_file = FLAGS_mean_file;
  const string& mean_value = FLAGS_mean_value;
  const string& out_file = FLAGS_out_file;
  const float confidence_threshold = FLAGS_confidence_threshold;

  // Initialize the network.
  Detector detector(model_file, weights_file, mean_file, mean_value);

  // Set the output mode.
  std::streambuf* buf = std::cout.rdbuf();
  std::ofstream outfile;
  if (!out_file.empty()) {
    outfile.open(out_file.c_str());
    if (outfile.good()) {
      buf = outfile.rdbuf();
    }
  }
  std::ostream out(buf);

  // Process the images one by one.
  std::ifstream infile(argv[3]);
  std::string imgfile;
  while (infile >> imgfile) {
    cv::Mat img = cv::imread(imgfile, -1);
    CHECK(!img.empty()) << "Unable to decode image " << imgfile;
    std::vector<vector<float> > detections = detector.Detect(img);

    /* Print the detection results. */
    for (int i = 0; i < detections.size(); ++i) {
      const vector<float>& d = detections[i];
      // Detection format: [image_id, label, score, xmin, ymin, xmax, ymax].
      CHECK_EQ(d.size(), 7);
      const float score = d[2];
      if (score >= confidence_threshold) {
        out << imgfile << " ";
        out << static_cast<int>(d[1]) << " ";
        out << score << " ";
        out << static_cast<int>(d[3] * img.cols) << " ";
        out << static_cast<int>(d[4] * img.rows) << " ";
        out << static_cast<int>(d[5] * img.cols) << " ";
        out << static_cast<int>(d[6] * img.rows) << std::endl;
      }
    }
  }
#else
  LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
#endif  // USE_OPENCV
  return 0;
}
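
Beyond the command-line flow above, the `Detector` class added by this patch can be driven directly from other C++ code. The sketch below only illustrates that API and is not part of the patch: it assumes the class declaration is visible to the caller (for example by moving it into a shared header), and every file name in it is a placeholder.

```
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <vector>

// Minimal usage sketch for the Detector class defined in ssd_detect.cpp.
// All file names are placeholders; "" and "104,117,123" mirror the demo's
// default mean_file/mean_value flags.
int RunDetectorOnSingleImage() {
  Detector detector("deploy.prototxt", "model.caffemodel", "", "104,117,123");
  cv::Mat img = cv::imread("image.jpg", -1);
  if (img.empty()) return 1;

  // Each detection is [image_id, label, score, xmin, ymin, xmax, ymax],
  // with box coordinates given as fractions of the image size.
  std::vector<std::vector<float> > detections = detector.Detect(img);
  for (size_t i = 0; i < detections.size(); ++i) {
    const std::vector<float>& d = detections[i];
    if (d[2] < 0.6f) continue;  // same default threshold as the demo
    cv::Point tl(static_cast<int>(d[3] * img.cols),
                 static_cast<int>(d[4] * img.rows));
    cv::Point br(static_cast<int>(d[5] * img.cols),
                 static_cast<int>(d[6] * img.rows));
    cv::rectangle(img, tl, br, cv::Scalar(0, 255, 0), 2);
  }
  cv::imwrite("image_with_boxes.jpg", img);
  return 0;
}
```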