Skip to content

Commit

Permalink
cherry-pick master 代码到SDK2.2.2 分支上 (#559)
Browse files Browse the repository at this point in the history
* Add Custom Op for Yolov3 Post Process (#512)

* add custom op for yolov3

* reset submodule onnx

* reset tensorrt

* delete build

* merge odla_ops_nn

* modify for passing link-check

Co-authored-by: gcuser <[email protected]>
(cherry picked from commit 5847cd3)

* ODLA popART pipeline function (#522)

* First runnable with single thread & test context

* mnist runnable demo to test the pipeline

* multi thread put the data to the session run

* simple bash to compile and run test

* An example of how to use the callback in pipeline

* multi threads using local Ctx

* Can run with pipeline setting in onnx file

* Refactored and add no pipeline multi thread

* Move codes to the odla_pipeline.h .cc

* Make single empty/zero data, and delete context for empty data after get result

* Add mutex to serialize the compute requests

* Merge the changes for attention mask & previous changes

* test codes for time

* Change the CMakeLists to make the pipeline.cc and the new custom op compile

* Successfully run on 24L with attention mask custom OP

* custom op attention_mask test code

* Add name scope to each node in the model

* Try throughput test with MLPerf model

* only set AMP on feed forward matmul

* Run the online pipeling with config hard coded to the config read class

* Compile with SDK 2.2 with pipeline online setting

* Add config file for pipeline stage setting

* Run pipeline with similar performance of popart

* change some names & make AMP all 0.445

* Add amp parameter in config file

* Detach device and clear session when DestroyComputation

* Make the batch_per_step take effect on execution mode SEQUENCE to pass enough size of data

* Add the new lock free queue and logging

* Fix bug on empty data visit counter

* delete the empty context

* add some pipeline sync

* Make thread sleep for 5 ms when no task in the queue

* change the size() of LockFreeQueue to tail-wait

* [CI] make the call by main can work with npz files

* Move the computation init to create context

* Add common functions to common.h and common.cc

* move the computation init out

* Move common functions to the test folder

* Test the config of ODLA popART and make no configuration act as before

* Add tests for call the model.cc

* Add FP32 to save as result

* Some changes on LockFreeQueue and tests

* Fix the rsqrt wrong problem, and remove std cout&cerr to avoid crash

* fix the accuracy problem of large bps

* Add thread check for context & computation holding to avoid conflicts

* Add the batch tools to help on the test to generate model, build and run

* Decreasing the empty data put

* temporary commit to migrate crashed system

* set pipeline information on fly
change the mixed style of class member
add debug setting and default to false to make the opts set by api
remove the old pipeline set api

* Fixed the mixed code style and removed redundant codes

* Remove the function test codes of the odla_popart

* remove some redundant codes and files

* Changed the CACHE STRING to CACHE PATH

* move ENGINE_CACHE_PATH to odla_popart.cc

* format the codes with clang-format-9 -i command

* Move json.hpp to third party

* Set virtualgraph for model not using pipeline in set_session_opts

* Add virtual graph attribute when _odla_computation constructed

* Check the shape before extends it with batches_per_step

Co-authored-by: gcuser <[email protected]>
(cherry picked from commit 6095bdf)

* fix on default configuration & computation destruction

(cherry picked from commit 40b9fc8)

* definitions for static variables

(cherry picked from commit 18e0e83)

* disable test case test_constant_popart.cc

Co-authored-by: Zars19 <[email protected]>
Co-authored-by: jackzipu <[email protected]>
Co-authored-by: gcuser <[email protected]>
  • Loading branch information
4 people authored Sep 14, 2021
1 parent 0810ace commit 7f358b9
Show file tree
Hide file tree
Showing 33 changed files with 28,498 additions and 474 deletions.
17 changes: 17 additions & 0 deletions ODLA/include/ODLA/ops/odla_ops_nn.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,23 @@ extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_TopK(
odla_uint32 axis, odla_value_type output_value_type,
odla_value_type output_value_index_type, const odla_value_ids value_ids);

//! \brief Yolov3 Post Process
/*!
  Post-processes the three YOLOv3 feature-map scales and returns the
  selected detection info (cx, cy, w, h, pred_cls) for each class.
  \param orig_img_w the width of the original input image
  \param orig_img_h the height of the original input image
  \param bb13 bounding-box predictions from the 13 x 13 feature map
  \param bb26 bounding-box predictions from the 26 x 26 feature map
  \param bb52 bounding-box predictions from the 52 x 52 feature map
  \param value_id a unique value id (can be NULL)
  \return odla_values holding the selected per-class detections
*/
extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_PostProcess(
odla_value orig_img_w, odla_value orig_img_h, odla_value bb13,
odla_value bb26, odla_value bb52, const odla_value_id value_id);

#ifdef __cplusplus
} // C extern
#endif
Expand Down
10 changes: 8 additions & 2 deletions ODLA/platforms/odla_popart/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ option(ODLA_BUILD_POPART_CUSTOM_OPS "Link with Popart custom ops" ON)
add_odla_library(odla_popart SHARED common.cc odla_compute.cc
odla_ops_math.cc odla_ops_nn.cc
odla_ops_process.cc odla_ops.cc
odla_ops_custom.cc
)
odla_ops_custom.cc odla_pipeline.cc
odla_popart.cc popart_config.cc
)

if (NOT POPLAR_ROOT)
set(POPLAR_ROOT "/opt/poplar_sdk/poplar" CACHE PATH "Path of poplar root")
Expand Down Expand Up @@ -51,3 +52,8 @@ if (NOT ODLA_BUILD_POPART_USE_CXX11ABI)
endif()

target_link_libraries(odla_popart PUBLIC ODLA custom_ops popart-only)

target_include_directories(odla_popart PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/onnx/
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/include/
)
21 changes: 21 additions & 0 deletions ODLA/platforms/odla_popart/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"version":"1.0.0",
"amp":0.445,
"batch_per_step":10,
"execution_mode":"pipeline",
"ipu_num":2,
"load_onnx":false,
"load_onnx_path":"path",
"pipeline":{
"^embedding_" : [0, 0],
"^layer[0-9]_" : [0, 0],
"^layer1[0-1]_" : [0, 0],
"^layer1[2-9]_" : [1, 1],
"^layer2[0-3]_" : [1, 1],
"^squad_" : [1, 1]
},
"queue_type":"LockFreeQueue",
"queue_capacity":1048576,
"save_model" : true,
"save_model_path":"pipeline_test.onnx"
}
4 changes: 3 additions & 1 deletion ODLA/platforms/odla_popart/custom_ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
add_odla_library(custom_ops SHARED
erf.cc
rsqrt.cc
postprocess.cc
attention_mask.cc
)

set_property(TARGET custom_ops PROPERTY CXX_STANDARD 14)
Expand All @@ -31,4 +33,4 @@ target_link_libraries(custom_ops PRIVATE popart-only)
target_include_directories(custom_ops PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/third_party/onnx/
${CMAKE_CURRENT_SOURCE_DIR}/third_party/include/
)
)
22 changes: 13 additions & 9 deletions ODLA/platforms/odla_popart/custom_ops/Makefile
Original file line number Diff line number Diff line change
@@ -1,27 +1,31 @@
CXX ?= g++
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx -D_GLIBCXX_USE_CXX11_ABI=0
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx
LDLIBS = -shared -lpopart -lpoplar -lpopops -lpoputil
INCLUDES = -Iinclude
INCLUDES = -Iinclude -Ithird_party/onnx/ -Ithird_party/include

BUILD_DIR = build
SOURCES = rsqrt.cc erf.cc
SOURCES = rsqrt.cc erf.cc postprocess.cc attention_mask.cc
TARGET = $(BUILD_DIR)/libcustom_ops.so

all: create_build_dir rsqrt_custom_op rsqrt_test erf_test
all: create_build_dir rsqrt_custom_op rsqrt_test attention_mask_test

.PHONY: create_build_dir
create_build_dir:
mkdir -p $(BUILD_DIR)

rsqrt_custom_op: rsqrt.cc erf.cc
rsqrt_custom_op: ${SOURCES}
$(CXX) $(SOURCES) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o $(TARGET)

rsqrt_test: rsqrt_test.cc rsqrt_custom_op
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test -D_GLIBCXX_USE_CXX11_ABI=0
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test

erf_test: erf_test.cc rsqrt_custom_op
$(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test -D_GLIBCXX_USE_CXX11_ABI=0
#erf_test: erf_test.cc rsqrt_custom_op
# $(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test

attention_mask_test: attention_mask_test.cc rsqrt_custom_op
# $(CXX) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o attention_mask_test
$(CXX) -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx attention_mask_test.cc -lpopart -lpoplar -lpopops -ldl -o attention_mask_test

.PHONY: clean
clean:
rm -r $(BUILD_DIR) rsqrt_test erf_test
rm -r $(BUILD_DIR) rsqrt_test attention_mask_test
144 changes: 144 additions & 0 deletions ODLA/platforms/odla_popart/custom_ops/attention_mask.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (c) 2019 Graphcore Ltd. All rights reserved.

#include <iostream>
#include <popart/names.hpp>
#include <popart/op.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <popart/region.hpp>
#include <popart/shapeinference.hpp>
#include <popops/Cast.hpp>
#include <popops/ElementWise.hpp>
#include <popops/Rearrange.hpp>
#include <poputil/TileMapping.hpp>
#include <random>

using namespace popart;
using namespace popart::popx;
using namespace popops::expr;

namespace CustomOperators {
// (domain, name, version) identifier under which the custom op is
// registered with popart.
const popart::OperatorIdentifier AttentionMask = {"ai.graphcore",
                                                  "AttentionMask", 1};
} // namespace CustomOperators

// Custom op that builds an additive attention mask from a [B, S] input.
// It defines no grad ops, so gradients do not flow through it.
// Custom popart Op producing an additive attention mask.
// Input 0 has shape [B, S]; output 0 has shape [B, 1, S, S] and is
// FLOAT16 when built for HALF, FLOAT otherwise.
class AttentionMaskOp : public popart::Op {
 public:
  // Element type the mask is computed in (poplar::HALF or poplar::FLOAT).
  poplar::Type dataType;

  // Fix: take the type by const-ref (was a non-const reference) — the
  // constructor only copies it. Backward compatible for all callers.
  AttentionMaskOp(const popart::OperatorIdentifier& _opid,
                  const Op::Settings& settings_,
                  const poplar::Type& dataTypeIn)
      : Op(_opid, settings_), dataType(dataTypeIn) {}

  void setup() final {
    // Input 0 shape is [B, S]. Input 1 (the attention matrix) is only
    // consulted at the opx level for layout detection, so its shape is
    // not needed here (the unused refShape local was removed).
    Shape inShape = inInfo(0).shape();

    // Output shape [B, 1, S, S].
    Shape outShape = {inShape.at(0), 1, inShape.at(1), inShape.at(1)};

    if (dataType == poplar::HALF)
      outInfo(0) = {"FLOAT16", outShape};
    else
      outInfo(0) = {"FLOAT", outShape};
  }

  std::unique_ptr<Op> clone() const final {
    return std::make_unique<AttentionMaskOp>(*this);
  }

  // Low subgraph value: not worth outlining into its own subgraph.
  float getSubgraphValue() const final { return getLowSubgraphValue(); }
};

// The op takes no attributes at the schema level; the "dataType"
// attribute is read dynamically in the factory below.
static popart::OpDefinition attentionMaskOpDef({});

// Factory registered with popart: reads the required "dataType" string
// attribute ("FLOAT" selects float; any other value selects half) and
// constructs the op.
static popart::OpCreator<AttentionMaskOp> attentionMaskOpCreator(
    popart::OpDefinitions({{CustomOperators::AttentionMask,
                            attentionMaskOpDef}}),
    [](const popart::OpCreatorInfo& oci) -> std::unique_ptr<popart::Op> {
      std::string type =
          oci.attributes.getAttribute<Attributes::String>("dataType");
      poplar::Type dataType = (type == "FLOAT") ? poplar::FLOAT : poplar::HALF;

      // make_unique over raw new: exception-safe and idiomatic.
      return std::make_unique<AttentionMaskOp>(oci.opid, oci.settings,
                                               dataType);
    },
    true);

// Poplar-side implementation (Opx) of AttentionMaskOp.
class AttentionMaskOpX : public popart::popx::Opx {
 public:
  AttentionMaskOpX(popart::Op* op, popart::popx::Devicex* devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<AttentionMaskOp>(op, CustomOperators::AttentionMask);
  }

  // Inputs need no bespoke tile layout: this opx can "unwind", i.e. pass
  // a consumer-chosen layout through unchanged.
  popart::popx::InputCreatorType getInputCreatorType(popart::InIndex) const {
    return popart::popx::InputCreatorType::CanUnwind;
  }

  // Unwinding is the identity: the tensor keeps its layout.
  poplar::Tensor unwindTensorLayout(poplar::Tensor tensor, popart::InIndex,
                                    popart::OutIndex) const {
    return tensor;
  }

  // Region mapping for unwinding is likewise the identity.
  popart::view::RegMap unwindRegion(popart::InIndex, popart::OutIndex) const {
    return [this](const popart::view::Region& r) {
      return popart::view::Regions(1, r);
    };
  }

  // Emit the poplar program computing the mask.
  //
  // Input 0: [B, S] per-position values (cast to dataType below).
  // Input 1: attention matrix; used only to pick an operand ordering.
  // Output 0: [B, 1, S, S] additive log-mask. Entry (i, j) is
  // -10000 * tanh(|x[b][i] - x[b][j]|): 0 where the two positions'
  // values are equal, saturating near -10000 where they differ.
  void grow(poplar::program::Sequence& prog) const final {
    AttentionMaskOp& myOp = getOp<AttentionMaskOp>();

    poplar::Type dataType = myOp.dataType;
    poplar::Graph& graph = Opx::graph();
    // input tensor shape [B, S]
    poplar::Tensor seqIndex = getInTensor(0);
    std::size_t batchSize = seqIndex.dim(0);
    std::size_t seqLength = seqIndex.dim(1);
    // Reshape to [B, S, 1] so subtracting a dimShuffled copy yields all
    // pairwise differences.
    seqIndex = seqIndex.reshape({batchSize, seqLength, 1});
    seqIndex = popops::cast(graph, seqIndex, dataType, prog, "input_mask_f");
    poplar::Tensor attentionMatrix = getInTensor(1);

    // Match the subtraction's operand order to the attention matrix's
    // detected innermost dim grouping, then shuffle back — presumably to
    // get a favourable element layout (TODO confirm intent).
    const auto dimOrdering =
        poputil::detectDimGroupings(graph, attentionMatrix);
    bool swapOrder = !dimOrdering.empty() && dimOrdering.front().first == 2;
    auto seqMask =
        swapOrder ? popops::sub(graph, seqIndex.dimShuffle({0, 2, 1}), seqIndex,
                                prog, "maskVal")
                        .dimShuffle({0, 2, 1})
                  : popops::sub(graph, seqIndex, seqIndex.dimShuffle({0, 2, 1}),
                                prog, "maskVal");
    // |diff| then tanh: 0 where equal, approaching 1 as |diff| grows.
    popops::absInPlace(graph, seqMask, prog);
    popops::tanhInPlace(graph, seqMask, prog);

    // Create constant tensor; mapped to a pseudo-randomly chosen tile.
    // NOTE(review): the default-seeded mt19937 makes the tile choice
    // deterministic across runs (same tile every compile).
    std::mt19937 randomEngine;
    unsigned totalTile = graph.getTarget().getTilesPerIPU();
    std::uniform_int_distribution<> distrib(0, totalTile - 1);
    int tileForConst = distrib(randomEngine);
    poplar::Tensor minValue = graph.addConstant(dataType, {}, -10000.0);
    graph.setTileMapping(minValue, tileForConst);

    // Create log mask: scale by -10000 and expose as [B, 1, S, S].
    popops::mulInPlace(graph, seqMask, minValue, prog);
    seqMask = seqMask.reshape({batchSize, 1, seqLength, seqLength});
    setOutTensor(0, seqMask);
  }
};

// Register the poplar implementation for the AttentionMask op.
static popart::popx::OpxCreator<AttentionMaskOpX> attentionMaskOpxCreator(
    CustomOperators::AttentionMask);

// Shape inference: the output shares input 1's dtype and has shape
// [B, 1, S, S], with B and S read from input 1 (assumed rank-4).
static popart::RegisterShapeInferenceFunction AttentionMaskShapeInfer(
    CustomOperators::AttentionMask, [](ShapeInferenceContext& ctx) {
      const auto& ref = ctx.inInfo(1);
      const auto batch = ref.shape().at(0);
      const auto seqLen = ref.shape().at(3);
      ctx.outInfo(0) = {ref.data_type(), Shape({batch, 1, seqLen, seqLen})};
    });
Loading

0 comments on commit 7f358b9

Please sign in to comment.