Skip to content

Commit

Permalink
cherry-pick master 代码到SDK2.2.2 分支上 (#559)
Browse files Browse the repository at this point in the history
* Add Custom Op for Yolov3 Post Process (#512)

* add custom op for yolov3

* reset submodule onnx

* reset tensorrt

* delete build

* merge odla_ops_nn

* modify for passing link-check

Co-authored-by: gcuser <[email protected]>
(cherry picked from commit 5847cd3)

* ODLA popART pipeline function (#522)

* First runnable with single thread & test context

* mnist runnable demo to test the pipeline

* multi thread put the data to the session run

* simple bash to compile and run test

* An example of how to use the callback in pipeline

* multi threads using local Ctx

* Can run with pipeline setting in onnx file

* Refactored and add no pipeline multi thread

* Move codes to the odla_pipeline.h .cc

* Make single empty/zero data, and delete context for empty data after get result

* Add mutex to serialize the compute requests

* Merge the changes for attention mask & previous changes

* test codes for time

* Change the CMakeLists to make the pipeline.cc and the new custom op compile

* Successfully run on 24L with attention mask custom OP

* custom op attention_mask test code

* Add name scope to each node in the model

* Try throughput test with MLPerf model

* only set AMP on feed forward matmul

* Run the online pipeling with config hard coded to the config read class

* Compile with SDK 2.2 with pipeline online setting

* Add config file for pipeline stage setting

* Run pipeline with similar performance of popart

* change some names & make AMP all 0.445

* Add amp parameter in config file

* Detach device and clear session when DestroyComputation

* Make the batch_per_step take effect on execution mode SEQUENCE to pass enough size of data

* Add the new lock free queue and logging

* Fix bug on empty data visit counter

* delete the empty context

* add some pipeline sync

* Make thread sleep for 5 ms when no task in the queue

* change the size() of LockFreeQueue to tail-wait

* [CI] make the call by main can work with npz files

* Move the computation init to create context

* Add common functions to common.h and common.cc

* move the computation init out

* Move common functions to the test folder

* Test the config of ODLA popART and make no configuration act as before

* Add tests for call the model.cc

* Add FP32 to save as result

* Some changes on LockFreeQueue and tests

* Fix the rsqrt wrong problem, and remove std cout&cerr to avoid crash

* fix the accuracy problem of large bps

* Add thread check for context & computation holding to avoid conflicts

* Add the batch tools to help on the test to generate model, build and run

* Decreasing the empty data put

* temporary commit to migrate crashed system

* set pipeline information on fly
change the mixed style of class member
add debug setting and default to false to make the opts set by api
remove the old pipeline set api

* Fixed the mixed code style and removed redundant codes

* Remove the function test codes of the odla_popart

* remove some redundant codes and files

* Changed the CACHE STRING to CACHE PATH

* move ENGINE_CACHE_PATH to odla_popart.cc

* format the codes with clang-format-9 -i command

* Move json.hpp to third party

* Set virtualgraph for model not using pipeline in set_session_opts

* Add virtual graph attribute when _odla_computation constructed

* Check the shape before extends it with batches_per_step

Co-authored-by: gcuser <[email protected]>
(cherry picked from commit 6095bdf)

* fix on default configuration & computation destruction

(cherry picked from commit 40b9fc8)

* definitions for static variables

(cherry picked from commit 18e0e83)

* disable test case test_constant_popart.cc

Co-authored-by: Zars19 <[email protected]>
Co-authored-by: jackzipu <[email protected]>
Co-authored-by: gcuser <[email protected]>
  • Loading branch information
4 people authored Sep 14, 2021
1 parent 0810ace commit 7f358b9
Show file tree
Hide file tree
Showing 33 changed files with 28,498 additions and 474 deletions.
17 changes: 17 additions & 0 deletions ODLA/include/ODLA/ops/odla_ops_nn.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,23 @@ extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_TopK(
odla_uint32 axis, odla_value_type output_value_type,
odla_value_type output_value_index_type, const odla_value_ids value_ids);

//! \brief Yolov3 Post Process
/*!
  Post-processes the three YOLOv3 feature-map scales and returns the
  selected detection info (cx, cy, w, h, pred_cls) for each class.
  \param orig_img_w the width of the original input image
  \param orig_img_h the height of the original input image
  \param bb13 bounding-box predictions from the 13 x 13 feature map
  \param bb26 bounding-box predictions from the 26 x 26 feature map
  \param bb52 bounding-box predictions from the 52 x 52 feature map
  \param value_id a unique value id (can be NULL)
  \return odla_values holding the selected per-class detections
*/
extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_PostProcess(
odla_value orig_img_w, odla_value orig_img_h, odla_value bb13,
odla_value bb26, odla_value bb52, const odla_value_id value_id);

#ifdef __cplusplus
} // C extern
#endif
Expand Down
10 changes: 8 additions & 2 deletions ODLA/platforms/odla_popart/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ option(ODLA_BUILD_POPART_CUSTOM_OPS "Link with Popart custom ops" ON)
add_odla_library(odla_popart SHARED common.cc odla_compute.cc
odla_ops_math.cc odla_ops_nn.cc
odla_ops_process.cc odla_ops.cc
odla_ops_custom.cc
)
odla_ops_custom.cc odla_pipeline.cc
odla_popart.cc popart_config.cc
)

if (NOT POPLAR_ROOT)
set(POPLAR_ROOT "/opt/poplar_sdk/poplar" CACHE PATH "Path of poplar root")
Expand Down Expand Up @@ -51,3 +52,8 @@ if (NOT ODLA_BUILD_POPART_USE_CXX11ABI)
endif()

target_link_libraries(odla_popart PUBLIC ODLA custom_ops popart-only)

target_include_directories(odla_popart PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/onnx/
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/include/
)
21 changes: 21 additions & 0 deletions ODLA/platforms/odla_popart/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"version":"1.0.0",
"amp":0.445,
"batch_per_step":10,
"execution_mode":"pipeline",
"ipu_num":2,
"load_onnx":false,
"load_onnx_path":"path",
"pipeline":{
"^embedding_" : [0, 0],
"^layer[0-9]_" : [0, 0],
"^layer1[0-1]_" : [0, 0],
"^layer1[2-9]_" : [1, 1],
"^layer2[0-3]_" : [1, 1],
"^squad_" : [1, 1]
},
"queue_type":"LockFreeQueue",
"queue_capacity":1048576,
"save_model" : true,
"save_model_path":"pipeline_test.onnx"
}
4 changes: 3 additions & 1 deletion ODLA/platforms/odla_popart/custom_ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
add_odla_library(custom_ops SHARED
erf.cc
rsqrt.cc
postprocess.cc
attention_mask.cc
)

set_property(TARGET custom_ops PROPERTY CXX_STANDARD 14)
Expand All @@ -31,4 +33,4 @@ target_link_libraries(custom_ops PRIVATE popart-only)
target_include_directories(custom_ops PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/third_party/onnx/
${CMAKE_CURRENT_SOURCE_DIR}/third_party/include/
)
)
22 changes: 13 additions & 9 deletions ODLA/platforms/odla_popart/custom_ops/Makefile
Original file line number Diff line number Diff line change
@@ -1,27 +1,31 @@
CXX ?= g++
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx -D_GLIBCXX_USE_CXX11_ABI=0
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx
LDLIBS = -shared -lpopart -lpoplar -lpopops -lpoputil
INCLUDES = -Iinclude
INCLUDES = -Iinclude -Ithird_party/onnx/ -Ithird_party/include

BUILD_DIR = build
SOURCES = rsqrt.cc erf.cc
SOURCES = rsqrt.cc erf.cc postprocess.cc attention_mask.cc
TARGET = $(BUILD_DIR)/libcustom_ops.so

all: create_build_dir rsqrt_custom_op rsqrt_test erf_test
all: create_build_dir rsqrt_custom_op rsqrt_test attention_mask_test

.PHONY: create_build_dir
create_build_dir:
mkdir -p $(BUILD_DIR)

rsqrt_custom_op: rsqrt.cc erf.cc
rsqrt_custom_op: ${SOURCES}
$(CXX) $(SOURCES) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o $(TARGET)

rsqrt_test: rsqrt_test.cc rsqrt_custom_op
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test -D_GLIBCXX_USE_CXX11_ABI=0
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test

erf_test: erf_test.cc rsqrt_custom_op
$(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test -D_GLIBCXX_USE_CXX11_ABI=0
#erf_test: erf_test.cc rsqrt_custom_op
# $(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test

attention_mask_test: attention_mask_test.cc rsqrt_custom_op
# $(CXX) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o attention_mask_test
$(CXX) -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx attention_mask_test.cc -lpopart -lpoplar -lpopops -ldl -o attention_mask_test

.PHONY: clean
clean:
rm -r $(BUILD_DIR) rsqrt_test erf_test
rm -r $(BUILD_DIR) rsqrt_test attention_mask_test
144 changes: 144 additions & 0 deletions ODLA/platforms/odla_popart/custom_ops/attention_mask.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Copyright (c) 2019 Graphcore Ltd. All rights reserved.

#include <iostream>
#include <popart/names.hpp>
#include <popart/op.hpp>
#include <popart/opmanager.hpp>
#include <popart/popx/devicex.hpp>
#include <popart/popx/opx.hpp>
#include <popart/popx/opxmanager.hpp>
#include <popart/region.hpp>
#include <popart/shapeinference.hpp>
#include <popops/Cast.hpp>
#include <popops/ElementWise.hpp>
#include <popops/Rearrange.hpp>
#include <poputil/TileMapping.hpp>
#include <random>

using namespace popart;
using namespace popart::popx;
using namespace popops::expr;

namespace CustomOperators {
// (domain, name, version) identifier under which the custom op is
// registered with popart.
const popart::OperatorIdentifier AttentionMask = {"ai.graphcore",
                                                  "AttentionMask", 1};
} // namespace CustomOperators

// Custom op that builds an additive attention mask from a [B, S] input.
// It defines no grad ops, so gradients do not flow through it.
// Custom popart Op producing an additive attention mask.
// Input 0 has shape [B, S]; output 0 has shape [B, 1, S, S] and is
// FLOAT16 when built for HALF, FLOAT otherwise.
class AttentionMaskOp : public popart::Op {
 public:
  // Element type the mask is computed in (poplar::HALF or poplar::FLOAT).
  poplar::Type dataType;

  // Fix: take the type by const-ref (was a non-const reference) — the
  // constructor only copies it. Backward compatible for all callers.
  AttentionMaskOp(const popart::OperatorIdentifier& _opid,
                  const Op::Settings& settings_,
                  const poplar::Type& dataTypeIn)
      : Op(_opid, settings_), dataType(dataTypeIn) {}

  void setup() final {
    // Input 0 shape is [B, S]. Input 1 (the attention matrix) is only
    // consulted at the opx level for layout detection, so its shape is
    // not needed here (the unused refShape local was removed).
    Shape inShape = inInfo(0).shape();

    // Output shape [B, 1, S, S].
    Shape outShape = {inShape.at(0), 1, inShape.at(1), inShape.at(1)};

    if (dataType == poplar::HALF)
      outInfo(0) = {"FLOAT16", outShape};
    else
      outInfo(0) = {"FLOAT", outShape};
  }

  std::unique_ptr<Op> clone() const final {
    return std::make_unique<AttentionMaskOp>(*this);
  }

  // Low subgraph value: not worth outlining into its own subgraph.
  float getSubgraphValue() const final { return getLowSubgraphValue(); }
};

// The op takes no attributes at the schema level; the "dataType"
// attribute is read dynamically in the factory below.
static popart::OpDefinition attentionMaskOpDef({});

// Factory registered with popart: reads the required "dataType" string
// attribute ("FLOAT" selects float; any other value selects half) and
// constructs the op.
static popart::OpCreator<AttentionMaskOp> attentionMaskOpCreator(
    popart::OpDefinitions({{CustomOperators::AttentionMask,
                            attentionMaskOpDef}}),
    [](const popart::OpCreatorInfo& oci) -> std::unique_ptr<popart::Op> {
      std::string type =
          oci.attributes.getAttribute<Attributes::String>("dataType");
      poplar::Type dataType = (type == "FLOAT") ? poplar::FLOAT : poplar::HALF;

      // make_unique over raw new: exception-safe and idiomatic.
      return std::make_unique<AttentionMaskOp>(oci.opid, oci.settings,
                                               dataType);
    },
    true);

// Poplar-side implementation (Opx) of AttentionMaskOp.
class AttentionMaskOpX : public popart::popx::Opx {
 public:
  AttentionMaskOpX(popart::Op* op, popart::popx::Devicex* devicex)
      : popart::popx::Opx(op, devicex) {
    verifyOp<AttentionMaskOp>(op, CustomOperators::AttentionMask);
  }

  // Inputs need no bespoke tile layout: this opx can "unwind", i.e. pass
  // a consumer-chosen layout through unchanged.
  popart::popx::InputCreatorType getInputCreatorType(popart::InIndex) const {
    return popart::popx::InputCreatorType::CanUnwind;
  }

  // Unwinding is the identity: the tensor keeps its layout.
  poplar::Tensor unwindTensorLayout(poplar::Tensor tensor, popart::InIndex,
                                    popart::OutIndex) const {
    return tensor;
  }

  // Region mapping for unwinding is likewise the identity.
  popart::view::RegMap unwindRegion(popart::InIndex, popart::OutIndex) const {
    return [this](const popart::view::Region& r) {
      return popart::view::Regions(1, r);
    };
  }

  // Emit the poplar program computing the mask.
  //
  // Input 0: [B, S] per-position values (cast to dataType below).
  // Input 1: attention matrix; used only to pick an operand ordering.
  // Output 0: [B, 1, S, S] additive log-mask. Entry (i, j) is
  // -10000 * tanh(|x[b][i] - x[b][j]|): 0 where the two positions'
  // values are equal, saturating near -10000 where they differ.
  void grow(poplar::program::Sequence& prog) const final {
    AttentionMaskOp& myOp = getOp<AttentionMaskOp>();

    poplar::Type dataType = myOp.dataType;
    poplar::Graph& graph = Opx::graph();
    // input tensor shape [B, S]
    poplar::Tensor seqIndex = getInTensor(0);
    std::size_t batchSize = seqIndex.dim(0);
    std::size_t seqLength = seqIndex.dim(1);
    // Reshape to [B, S, 1] so subtracting a dimShuffled copy yields all
    // pairwise differences.
    seqIndex = seqIndex.reshape({batchSize, seqLength, 1});
    seqIndex = popops::cast(graph, seqIndex, dataType, prog, "input_mask_f");
    poplar::Tensor attentionMatrix = getInTensor(1);

    // Match the subtraction's operand order to the attention matrix's
    // detected innermost dim grouping, then shuffle back — presumably to
    // get a favourable element layout (TODO confirm intent).
    const auto dimOrdering =
        poputil::detectDimGroupings(graph, attentionMatrix);
    bool swapOrder = !dimOrdering.empty() && dimOrdering.front().first == 2;
    auto seqMask =
        swapOrder ? popops::sub(graph, seqIndex.dimShuffle({0, 2, 1}), seqIndex,
                                prog, "maskVal")
                        .dimShuffle({0, 2, 1})
                  : popops::sub(graph, seqIndex, seqIndex.dimShuffle({0, 2, 1}),
                                prog, "maskVal");
    // |diff| then tanh: 0 where equal, approaching 1 as |diff| grows.
    popops::absInPlace(graph, seqMask, prog);
    popops::tanhInPlace(graph, seqMask, prog);

    // Create constant tensor; mapped to a pseudo-randomly chosen tile.
    // NOTE(review): the default-seeded mt19937 makes the tile choice
    // deterministic across runs (same tile every compile).
    std::mt19937 randomEngine;
    unsigned totalTile = graph.getTarget().getTilesPerIPU();
    std::uniform_int_distribution<> distrib(0, totalTile - 1);
    int tileForConst = distrib(randomEngine);
    poplar::Tensor minValue = graph.addConstant(dataType, {}, -10000.0);
    graph.setTileMapping(minValue, tileForConst);

    // Create log mask: scale by -10000 and expose as [B, 1, S, S].
    popops::mulInPlace(graph, seqMask, minValue, prog);
    seqMask = seqMask.reshape({batchSize, 1, seqLength, seqLength});
    setOutTensor(0, seqMask);
  }
};

// Register the poplar implementation for the AttentionMask op.
static popart::popx::OpxCreator<AttentionMaskOpX> attentionMaskOpxCreator(
    CustomOperators::AttentionMask);

// Shape inference: the output shares input 1's dtype and has shape
// [B, 1, S, S], with B and S read from input 1 (assumed rank-4).
static popart::RegisterShapeInferenceFunction AttentionMaskShapeInfer(
    CustomOperators::AttentionMask, [](ShapeInferenceContext& ctx) {
      const auto& ref = ctx.inInfo(1);
      const auto batch = ref.shape().at(0);
      const auto seqLen = ref.shape().at(3);
      ctx.outInfo(0) = {ref.data_type(), Shape({batch, 1, seqLen, seqLen})};
    });
Loading

0 comments on commit 7f358b9

Please sign in to comment.