From 29fa4cf1c5a86515941bb5f69465d5657511ff18 Mon Sep 17 00:00:00 2001 From: limingxin Date: Wed, 23 Nov 2022 18:01:20 +0800 Subject: [PATCH 01/17] feat(compiler & runtime): support extern opr loader --- ci/test_tools.sh | 4 +- .../Dialect/Kernel/IR/KernelDialect.td | 13 + .../compiler/Target/MGB/dummy_loader.h | 134 +++++++++ compiler/include/megbrain/IR/ops.td | 13 + .../Conversion/MGBToKernel/MGBToKernel.cpp | 25 +- .../MGBToKernel/MGBToKernelHelper.h | 11 + .../lib/Dialect/Kernel/IR/KernelDialect.cpp | 5 + compiler/lib/Target/MGB/importer.cpp | 195 +++++++++++- compiler/lib/Target/TinyNN/exporter.cpp | 35 +++ immigration/include/extern_c_opr.h | 222 ++++++++++++++ runtime/CMakeLists.txt | 2 +- runtime/example/standard_OS/lite_main.c | 114 ++++++- runtime/schema/model.fbs | 15 +- runtime/src/cheader/model_reader.h | 32 +- runtime/src/vm.c | 1 + runtime/src/vm/extern_opr.c | 283 ++++++++++++++++++ runtime/src/vm/instruction.h | 15 +- runtime/test/CMakeLists.txt | 3 +- runtime/version.ld | 1 + script/build_and_test_not_standard_os.sh | 2 +- 20 files changed, 1099 insertions(+), 26 deletions(-) create mode 100644 compiler/include/compiler/Target/MGB/dummy_loader.h create mode 100644 immigration/include/extern_c_opr.h create mode 100644 runtime/src/vm/extern_opr.c diff --git a/ci/test_tools.sh b/ci/test_tools.sh index e3f5dfc1..3405de38 100755 --- a/ci/test_tools.sh +++ b/ci/test_tools.sh @@ -40,7 +40,7 @@ function compare_output_with_mgb(){ mkdir -p "${TINYNN_OUTPUT_DIR}" TINYMODEL_PATH=`find ${OUTPUT_DIR} -name "*.tiny"` TINYNN_SHAPE_STR=`echo $INPUT_DATA_SHAPE_STR | sed 's/[()]//g'` - $RUNTIME_BUILD_DIR/tinynn_test_lite ${TINYMODEL_PATH} "$TINYNN_OUTPUT_DIR" 0 $INPUT_DATA_STR ${TINYNN_SHAPE_STR} + $RUNTIME_BUILD_DIR/tinynn_test_lite -m ${TINYMODEL_PATH} -o "$TINYNN_OUTPUT_DIR" -l 0 -d $INPUT_DATA_STR -s ${TINYNN_SHAPE_STR} MGB_OUTPUT_DIR="$OUTPUT_DIR/mgb_out/" mkdir -p "${MGB_OUTPUT_DIR}" if [[ "$MODEL_PATH" == *".emod" ]];then @@ -69,7 +69,7 @@ function check_mem_leak_with_asan(){ cmake --build "$RUNTIME_BUILD_DIR_ASAN" --target tinynn_test_lite TINYNN_OUTPUT_ASAN_DIR="$OUTPUT_DIR/tinynn_out_asan" mkdir -p ${TINYNN_OUTPUT_ASAN_DIR} - $RUNTIME_BUILD_DIR_ASAN/tinynn_test_lite ${TINYMODEL_PATH} "$TINYNN_OUTPUT_ASAN_DIR" 0 $INPUT_DATA_STR $TINYNN_SHAPE_STR + $RUNTIME_BUILD_DIR_ASAN/tinynn_test_lite -m ${TINYMODEL_PATH} -o "$TINYNN_OUTPUT_ASAN_DIR" -l 0 -d $INPUT_DATA_STR -s $TINYNN_SHAPE_STR python3 $PROJECT_PATH/ci/compare_output_bin.py $TINYNN_OUTPUT_ASAN_DIR $MGB_OUTPUT_DIR --eps="$EPS" } diff --git a/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td b/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td index 815a5330..a93dbaba 100644 --- a/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td +++ b/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td @@ -90,6 +90,19 @@ def KernelCall: KernelBase<"KernelCall", [ ); } +def ExternOpr: KernelBase<"ExternOpr", [ + DeclareOpInterfaceMethods, + AttrSizedOperandSegments + ]> { + let arguments = (ins + Arg, "", [MemRead]>:$operands, + Arg, "", [MemWrite]>:$results, + StrAttr:$name, + StrAttr:$data, + UI32Attr:$data_len + ); +} + class InstructBase traits=[]>: KernelBase +#include +#include +#include +#include +#include +#include "megbrain/serialization/extern_c_opr.h" + +namespace { +std::map>, std::vector>> + name2outputinfo; +class MGBOprDescImpl { + static std::string loader_name; + + static inline const std::pair>, + std::vector>& + get_output_info(const std::string& loader_name) { + auto&& iter = 
name2outputinfo.find(loader_name); + if (iter != name2outputinfo.end()) + return iter->second; + else if (name2outputinfo.size() == 1) + return name2outputinfo.begin()->second; + else { + CC_ASSERT(0) + << "Please check loader name in command line args whether " + "consistent with loader name in dumped model.\n"; + return {}; + } + } + + static void release(MGBOprDesc* self) { + free(self->user_data); + delete self; + } + + static size_t hash(const MGBOprDesc* self) { return 1; } + + static int is_same(const MGBOprDesc* self, const MGBOprDesc* rhs) { + CC_ABORT << "The function 'is_same' is part of the dummy loader, just " + "for " + "compile but should NOT be called.\n"; + return 1; + } + + static void execute(const MGBOprDesc* self, const MGBTensor* input, + const MGBTensor* output) { + CC_ABORT << "The function 'execute' is part of the dummy loader, just " + "for " + "compile but should NOT be called.\n"; + } + + static void infer_shape(const MGBOprDesc* self, const MGBTensorShape* input, + MGBTensorShape* output) { + auto&& output_shapes = + get_output_info(reinterpret_cast(self->user_data)).first; + for (size_t i = 0; i < self->nr_output; ++i) { + output[i].ndim = output_shapes[i].size(); + for (size_t j = 0; j < output[i].ndim; ++j) + output[i].shape[j] = output_shapes[i][j]; + } + } + + static void infer_dtype(const struct MGBOprDesc* self, + const MGBDType* input, MGBDType* output) { + auto&& output_dtypes = + get_output_info(reinterpret_cast(self->user_data)) + .second; + for (size_t i = 0; i < self->nr_output; ++i) + output[i] = static_cast(output_dtypes[i]); + } + +public: + static MGBOprDesc* make(const std::string& loader_name) { + auto desc = std::make_unique(); + + uint32_t nr_output = get_output_info(loader_name).first.size(); + mgb_init_opr_desc(desc.get(), nr_output, "dummy"); +#define cb(func) desc->func = func; + MGB_OPR_DESC_FOREACH_MEM_FN(cb) +#undef cb + desc->infer_dtype = infer_dtype; + // copy loader name into desc->user_data + desc->user_data = malloc(loader_name.size() + 1); + memcpy(desc->user_data, loader_name.c_str(), loader_name.size()); + reinterpret_cast(desc->user_data)[loader_name.size()] = '\0'; + + return desc.release(); + } +}; + +class MGBOprLoaderImpl { + static std::map user_datas; + + static MGBOprDesc* create_desc(size_t nr_input, const void* buf, + size_t buf_len) { + std::string name((char*)buf + sizeof(size_t), *(size_t*)buf); + size_t data_len = buf_len - sizeof(size_t) - *(size_t*)buf; + void* user_data = malloc(sizeof(size_t) + data_len); + *(size_t*)(user_data) = data_len; + memmove(user_data + sizeof(size_t), + buf + sizeof(size_t) + *(size_t*)buf, data_len); + + user_datas[name] = user_data; + + return MGBOprDescImpl::make(name); + } + +public: + static std::map& get_user_datas() { return user_datas; } + static MGBOprLoader make() { return {"extern_opr_dummy", &create_desc}; } +}; +std::map MGBOprLoaderImpl::user_datas = {}; + +void mgb_c_opr_init_output_info( + const MGBExternCOprApi* (*get_api)(int), + const std::map>, + std::vector>>& output_info) { + name2outputinfo = std::move(output_info); + const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION); + assert(api); + MGBOprLoader loader = MGBOprLoaderImpl::make(); + api->register_loader(&loader); +} +} // namespace \ No newline at end of file diff --git a/compiler/include/megbrain/IR/ops.td b/compiler/include/megbrain/IR/ops.td index bce360a4..79c22e86 100644 --- a/compiler/include/megbrain/IR/ops.td +++ b/compiler/include/megbrain/IR/ops.td @@ -429,5 +429,18 @@ def 
FusedElemwise: MgbHashableOp<"FusedElemwise"> { MgbArrayAttr:$modes ); } + +def ExternOpr: MgbHashableOp<"ExternOpr"> { + let inputs = (ins Variadic:$input); + let extraArguments = (ins + MgbStringAttr:$name, + MgbStringAttr:$data, + MgbUI32Attr:$data_len, + MgbUI32Attr:$nr_input, + MgbUI32Attr:$nr_output + ); + + let results = (outs Variadic:$results); +} #endif // MGB_OPS diff --git a/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp b/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp index f435a1fe..a3aef624 100644 --- a/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp +++ b/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp @@ -233,7 +233,7 @@ class ConvertElemwise final : public OpConversionPattern { case Mode::EQ: return createOp(op, operands, rewriter); case Mode::SILU: - return createOp(op, operands, rewriter); + return createOp(op, operands, rewriter); default: CC_ABORT << "Unsupport Elemwise mode :" << static_cast(op.mode()) << "\n"; @@ -550,6 +550,27 @@ class GenericConverter : public OpConversionPattern { } }; +class ExternOprConverter : public OpConversionPattern { +public: + using OpAdaptor = typename MGB::ExternOpr::Adaptor; + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite( + MGB::ExternOpr op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + LOG_DEBUG << "Convert ExternOpr MGB dialect to Abstract kernel of " + "opr name: " + << op.getOperationName().str() << "\n"; + auto operands = adaptor.getOperands(); + CC_ASSERT(!isDynamicShape(operands)) + << "ExternOpr operands shape should not be dynamic.\n"; + auto attrs = ConvertAttr(op->getAttrDictionary(), + op->getContext()); + setOperandSegmentAttr(op->getContext(), attrs, + {op.nr_input(), op.nr_output()}); + return createOp(op, operands, rewriter, attrs); + } +}; + } // namespace void populateMGBToKernelConversionPatterns(TypeConverter& typeConverter, @@ -558,7 +579,7 @@ void populateMGBToKernelConversionPatterns(TypeConverter& typeConverter, ConvertParamStorage, ConvertParamProvider, ConvertElemwise, ConvertFusedElemwise, ConvertConvLike, ConvertReduce, ConvertReshape, ConvertSubtensor, ConvertSetSubtensor, - ConvertConcat, + ConvertConcat, ExternOprConverter, GenericConverter, GenericConverter ConvertAttr( return attrs; } +template <> +SmallVector ConvertAttr( + DictionaryAttr direct_attr, MLIRContext* context) { + SmallVector attrs; + GetParam("name"); + GetParam("data"); + GetParam("data_len"); + + return attrs; +} + template <> SmallVector ConvertAttr( DictionaryAttr direct_attr, MLIRContext* context) { diff --git a/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp b/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp index 691f1547..adfe3e9f 100644 --- a/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp +++ b/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp @@ -60,6 +60,11 @@ LogicalResult KernelCall::verifySymbolUses(SymbolTableCollection& symbolTable) { return success(); } +LogicalResult ExternOpr::verifySymbolUses(SymbolTableCollection& symbolTable) { + // TODO + return success(); +} + MemRefType Reshape::memoryForward(MemRefType inpType) { auto oupType = getResult().getType().dyn_cast(); if (!oupType) { diff --git a/compiler/lib/Target/MGB/importer.cpp b/compiler/lib/Target/MGB/importer.cpp index f0479a22..e8867f0b 100644 --- a/compiler/lib/Target/MGB/importer.cpp +++ b/compiler/lib/Target/MGB/importer.cpp @@ -19,6 +19,7 @@ #include "compiler/Common/MemoryStatus.h" #include "compiler/Dialect/MGB/IR/MGBDialect.h" #include 
"compiler/Target/Hako/hako_parse.h" +#include "compiler/Target/MGB/dummy_loader.h" #include "compiler/Target/MGB/helper.h" #include "compiler/Target/MGB/import.h" @@ -35,11 +36,31 @@ #include "megbrain/opr/misc.h" #include "megbrain/opr/nn_int.h" #include "megbrain/opr/tensor_manip.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/extern_c_opr_io.h" #include "megbrain/serialization/serializer.h" llvm::cl::opt hako_version( "hako", llvm::cl::desc("specific version used for encrypt"), llvm::cl::init(2)); +llvm::cl::opt ExternOprOutputShape( + "extern-opr-output-shapes", llvm::cl::Optional, + llvm::cl::desc("specific extern opr output shapes"), + llvm::cl::value_desc( + "loader_name_1=output_shape_1;output_shape_2;...:" + "loader_name_2=output_shape_1;output_shape_2;... " + "If only one loader, \"loader_name=\" can be omitted." + "e.g., " + "\"loader_1=(1,3,5,5);(1,1);(3,3):loader_2=(2,2);(1,1,3,3)\"")); +llvm::cl::opt ExternOprOutputDType( + "extern-opr-output-dtypes", llvm::cl::Optional, + llvm::cl::desc("specific extern opr output dtypes"), + llvm::cl::value_desc( + "Similar to --extern-opr-output-shapes but without " + "\"loader_name\"." + "The available values are float32, int32, uint8, float16, " + "int16. e.g., \"float32;int32;uint8:float16;int16\". Default " + "value is float32.")); using namespace mgb; using namespace llvm; @@ -139,6 +160,142 @@ std::vector read_file(std::string path) { return res; } +inline std::vector split(std::string str, + const std::string& delimiter) { + std::vector res; + size_t pos = 0; + while ((pos = str.find(delimiter)) != std::string::npos) { + res.emplace_back(std::move(str.substr(0, pos))); + str.erase(0, pos + delimiter.size()); + } + res.emplace_back(std::move(str)); + return res; +} + +inline void parse_extern_output_info() { + std::map>, + std::vector>> + name2outputinfo; + + std::string extern_opr_output_shapes = ExternOprOutputShape; + if (extern_opr_output_shapes.size()) { + auto&& output_shapes_loaders = split(extern_opr_output_shapes, ":"); + size_t nr_loader = output_shapes_loaders.size(); + + std::string extern_opr_output_dtypes = ExternOprOutputDType; + bool specify_dtype = (extern_opr_output_dtypes.size() != 0); + auto&& output_dtypes_loaders = split(extern_opr_output_dtypes, ":"); + if (specify_dtype) + CC_ASSERT(nr_loader == output_dtypes_loaders.size()); + + auto skip_whitespace = [](const std::string& str) { + int left = 0, right = str.size() - 1; + while (str[left] == ' ' || str[left] == '\t') + ++left; + while (str[right] == ' ' || str[right] == '\t') + --right; + return str.substr(left, right - left + 1); + }; + + auto parse_output_info = [=, &name2outputinfo]( + const std::string& output_shapes_str, + const std::string& output_dtypes_str, + const std::string& loader_name) { + auto&& output_shapes = split(output_shapes_str, ";"); + + std::vector uint_output_dtypes(output_shapes.size(), 0); + if (specify_dtype) { + auto&& output_dtypes = split(output_dtypes_str, ";"); + CC_ASSERT((output_shapes.size() == output_dtypes.size())) + << "Number of extern opr output shapes(" + << output_shapes.size() + << ") should equal to " + "number " + "of extern opr output dtypes(" + << output_dtypes.size() << ").\n"; + std::unordered_map dtype_str2uint{ + {"float32", 0}, + {"int32", 1}, + {"uint8", 2}, + {"float16", 3}, + {"int16", 4}}; + for (size_t i = 0; i < output_dtypes.size(); ++i) { + auto&& tmp_str = skip_whitespace(output_dtypes[i]); + if (dtype_str2uint.find(tmp_str) != dtype_str2uint.end()) + 
uint_output_dtypes[i] = dtype_str2uint.at(tmp_str); + else + CC_ASSERT(0) + << tmp_str + << " is invalid extern opr output dtype! Dtype " + "should be float32, int32, uint8, float16 " + "or " + "int16.\n"; + } + } + + std::vector> uint_output_shapes( + output_shapes.size()); + for (size_t i = 0; i < output_shapes.size(); ++i) { + auto&& tmp_str = skip_whitespace(output_shapes[i]); + CC_ASSERT((tmp_str[0] == '(' && + tmp_str[tmp_str.size() - 1] == ')')) + << "The output shape needs to be surrounded by " + "parentheses.\n"; + tmp_str = tmp_str.substr(1, tmp_str.size() - 2); + auto&& tmp_shape = split(tmp_str, ","); + CC_ASSERT((tmp_shape.size() <= MGB_TENSOR_MAX_NDIM)) + << "Maximum dimension of single output shape of extern " + "opr " + "is " + << MGB_TENSOR_MAX_NDIM << ".\n"; + uint_output_shapes[i].resize(tmp_shape.size()); + std::transform(tmp_shape.begin(), tmp_shape.end(), + uint_output_shapes[i].begin(), + [](const std::string& s) { + return static_cast(std::stoul(s)); + }); + } + + name2outputinfo[loader_name] = + std::make_pair(std::move(uint_output_shapes), + std::move(uint_output_dtypes)); + }; + + if (nr_loader == 1) { + auto&& name_and_shapes = split(output_shapes_loaders[0], "="); + bool specify_name = (name_and_shapes.size() == 2); + std::string&& loader_name = + (specify_name ? skip_whitespace(name_and_shapes[0]) : "_"); + const std::string& shapes = + specify_name ? name_and_shapes[1] : name_and_shapes[0]; + if (specify_dtype) { + parse_output_info(shapes, output_dtypes_loaders[0], + loader_name); + } else { + parse_output_info(shapes, "", loader_name); + } + } else if (nr_loader > 1) { + for (size_t i = 0; i < nr_loader; ++i) { + auto&& name_and_shapes = split(output_shapes_loaders[i], "="); + CC_ASSERT((name_and_shapes.size() == 2)) + << "When there are more than one loader, loader name " + "must be specified.\n"; + std::string&& loader_name = skip_whitespace(name_and_shapes[0]); + const std::string& shapes = name_and_shapes[1]; + if (specify_dtype) { + parse_output_info(shapes, output_dtypes_loaders[i], + loader_name); + } else { + parse_output_info(shapes, "", loader_name); + } + } + } + + mgb_c_opr_init_output_info(mgb_get_extern_c_opr_api_versioned, + name2outputinfo); + } +} + class Importer { using LoadResult = serialization::GraphLoader::LoadResult; using Options = MGBImporterOptions; @@ -152,8 +309,8 @@ class Importer { m_context->loadDialect(); } - mlir::LogicalResult import_mgb(std::string model_path, Options options - , int hako_ver = 0) { + mlir::LogicalResult import_mgb(std::string model_path, Options options, + int hako_ver = 0) { std::vector mdl_model_buffer; std::unique_ptr inp_file; hako_ver = hako_ver == 0 ? 
hako_version.getValue() : hako_ver; @@ -174,6 +331,9 @@ class Importer { CC_ASSERT(format.valid()) << "invalid model: unknown model format.\n"; m_loader = serialization::GraphLoader::make(std::move(inp_file), format.val()); + + parse_extern_output_info(); + LOG_DEBUG << "Process mgb graph\n"; process_graph(options); return mlir::verify(m_module); @@ -814,6 +974,35 @@ class Importer { std::vector{elemwise_exp, reduce_sum}, opr::Elemwise::Mode::TRUE_DIV); m_var2value.emplace(out, out_value); + } else if (auto extern_opr = + opr->try_cast_final()) { + auto user_datas = MGBOprLoaderImpl::get_user_datas(); + + void* _data = nullptr; + if (user_datas.find(opr->name()) != user_datas.end()) { + _data = user_datas[opr->name()]; + } + CC_ASSERT(_data) << "No data related to " << opr->name() << ".\n"; + std::string data( + reinterpret_cast(_data + sizeof(size_t)), + *(size_t*)(_data)); + free(_data); + + std::vector v_resultTypes(opr->output().size()); + for (int i = 0; i < opr->output().size(); ++i) { + v_resultTypes[i] = var_to_shaped_type(opr->output(i)); + } + + uint32_t nr_input = static_cast(opr->input().size()); + uint32_t nr_output = static_cast(opr->output().size()); + + auto values = m_builder.create( + m_builder.getUnknownLoc(), v_resultTypes, + var_array_to_value_array(opr->input()), opr->name(), data, + static_cast(data.size()), nr_input, nr_output); + for (int i = 0; i < opr->output().size(); ++i) { + m_var2value.emplace(opr->output(i), values.getResult(i)); + } } else { CC_ABORT << "unsupported mgb operator type " << opr->dyn_typeinfo()->name << "\n"; @@ -1200,7 +1389,7 @@ mlir::LogicalResult import_mgb(mlir::ModuleOp module, std::string model_path, LOG_DEBUG << "\n\t\t\t Begin Import MBG \t\t\t\n"; LOG_DEBUG << "load model from " << model_path << " with Options:\n\tuse_static_memory_plan=" - << options.use_naive_memory_plan + << options.use_static_memory_plan << "\n\toptimize_for_inference=" << options.optimize_for_inference << "\n\tuse_naive_memory_plan=" << options.use_naive_memory_plan << "\n\tgraph_opt_level=" diff --git a/compiler/lib/Target/TinyNN/exporter.cpp b/compiler/lib/Target/TinyNN/exporter.cpp index 274c5774..41dca537 100644 --- a/compiler/lib/Target/TinyNN/exporter.cpp +++ b/compiler/lib/Target/TinyNN/exporter.cpp @@ -294,6 +294,41 @@ class Exporter { MegCC::TensorType_WEIGHT, symbol2weight_id[op.name().str()])); }) + .Case([&](Kernel::ExternOpr op) { + kernel_exporter.addInst("EXTERN_OPR"); + + std::vector input_tensors, output_tensors; + for (auto&& i : op.operands()) { + auto&& tensor = value2typed_tensor.at( + i.getAsOpaquePointer()); + input_tensors.push_back(tensor.second); + } + + for (auto&& i : op.results()) { + auto&& tensor = value2typed_tensor.at( + i.getAsOpaquePointer()); + output_tensors.push_back(tensor.second); + } + + std::string name(op.name().data(), op.name().size()); + std::string data(op.data().data(), op.data().size()); + uint32_t data_len = data.size(); + + LOG_DEBUG << "Add ExternOpr instruction.\n"; + instructions_type.push_back( + MegCC::Instruction_ExternOpr); + instructions.push_back( + MegCC::CreateExternOpr( + m_fbs_builder, + m_fbs_builder.CreateVector( + input_tensors), + m_fbs_builder.CreateString(name), + m_fbs_builder.CreateString(data), + data_len, + m_fbs_builder.CreateVector( + output_tensors)) + .Union()); + }) .Case([&](Kernel::MemPlan op) { createTensor(op->getResult(0)); }) diff --git a/immigration/include/extern_c_opr.h b/immigration/include/extern_c_opr.h new file mode 100644 index 00000000..7d46af5f --- /dev/null +++ 
b/immigration/include/extern_c_opr.h @@ -0,0 +1,222 @@ +#ifndef MEGBRAIN_EXTERN_C_OPR_H +#define MEGBRAIN_EXTERN_C_OPR_H + +#include +#include +#include + +#ifdef MGE_DLL_EXPORT +#define MGB_PUBLIC __declspec(dllexport) +#else +#define MGB_PUBLIC __attribute__((visibility("default"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MGB_C_OPR_INIT_FUNC +#define MGB_C_OPR_INIT_FUNC mgb_c_opr_init +#endif + +#define INIT_FUNCS(s) #s +#define INIT_FUNC(s) INIT_FUNCS(s) +#define MGB_C_OPR_INIT_FUNC_STR INIT_FUNC(MGB_C_OPR_INIT_FUNC) + +#define MGB_EXTERN_C_OPR_VERSION 0x24 +#define MGB_TENSOR_MAX_NDIM 8 + +//! data types +typedef enum MGBDType { + MGB_DTYPE_FLOAT32, + MGB_DTYPE_INT32, + MGB_DTYPE_UINT8, + //! IEEE 754-based half-precision floating + MGB_DTYPE_FLOAT16, + MGB_DTYPE_INT16, +} MGBDType; + +typedef struct MGBTensorShape { + uint32_t ndim, shape[MGB_TENSOR_MAX_NDIM]; +} MGBTensorShape; + +typedef struct MGBTensorLayout { + uint32_t dtype; + MGBTensorShape shape; +} MGBTensorLayout; + +//! tensor representation +typedef struct MGBTensor { + MGBTensorLayout layout; + void* data; //!< the tensor value, accessible by caller CPU thread +} MGBTensor; + +//! extern device tenosr struct +typedef struct ExternDeviceTensor { + //! layout of device extern tensor, use to validity check with MGBTensor + MGBTensorLayout layout; + //! different NPU API has different type define so just define a void * to + //! compat all, need loader and SDK implement reinterpret_cast it + //! exampe for NNIE, device_ptr may define as + //! struct MemoryInfo { + //! HI_U64 phy_addr; + //! void* vir_addr; + //! size_t size = 0; + //! } + void* device_ptr; +} ExternDeviceTensor; + +//! for dynamic extern c opr param +typedef struct ExternCOprParam { + //! dump name of extern c opr in graph + //! example graph: + //! ExternCOpr1(3516:preprocess)->opr->ExternCOpr2(3559)->opr->ExternCOpr3(3516:det_face)... + //! extern_c_opr_dump_name config case: + //! when set 3516:preprocess, ExternCOpr1 will be config. + //! when set 3559, ExternCOpr2 will be config. + //! when set 3516:det_face, ExternCOpr3 will be config. + //! when set nullptr, will auto config the first ExternCOpr. + const char* extern_c_opr_dump_name; + + //! number of input/output, use to index and check + //! if set nr_input = 0, means do not provide input ExternDeviceTensor + //! if set nr_output = 0, means do not provide nr_output ExternDeviceTensor + size_t nr_input, nr_output; + + //! ptr of input/output ExternDeviceTensor + ExternDeviceTensor* input; + ExternDeviceTensor* output; + + //! device id + size_t device_id; + + //! extra info for misc dynamic config + uint8_t* extra_info; + //! size of extra_info + size_t extra_info_size; +} ExternCOprParam; + +/*! + * \brief operator descriptor + * + * Note: all the methods (except release) should be purely functional, so a + * descriptor can be shared by multiple operators + */ +typedef struct MGBOprDesc { + //! size of this MGBOprDesc object + uint32_t size; + + //! number of input/output vars + uint32_t nr_output; + + //! operator type name + const char* type_name; + + //! release this descriptor + void (*release)(struct MGBOprDesc* self); + + //! compute hash + size_t (*hash)(const struct MGBOprDesc* self); + + //! equality check + int (*is_same)(const struct MGBOprDesc* self, const struct MGBOprDesc* rhs); + + //! perform the computation + void (*execute)( + const struct MGBOprDesc* self, const MGBTensor* input, + const MGBTensor* output); + + //! 
infer output shapes from input shapes + void (*infer_shape)( + const struct MGBOprDesc* self, const MGBTensorShape* input, + MGBTensorShape* output); + + //! optional: infer output dtypes from input dtypes + void (*infer_dtype)( + const struct MGBOprDesc* self, const MGBDType* input, MGBDType* output); + + //! custom user data to be associated with this descriptor + void* user_data; + + //! dynamic extern c opr param + ExternCOprParam* dynamic_param; +} MGBOprDesc; + +//! foreach member function of MGBOprDesc to help initialization +#define MGB_OPR_DESC_FOREACH_MEM_FN(cb) \ + cb(release) cb(hash) cb(is_same) cb(execute) cb(infer_shape) + +//! operator loader +typedef struct MGBOprLoader { + //! name of the loader; must match the name given in + //! ExternCOprRunner::make_placeholder and would be written to graph dump + //! file + const char* name; + + /*! + * \brief create a new descriptor from saved buffer + * + * Note: there is no guarantee on the alignment of \p buf. + */ + MGBOprDesc* (*create_desc)(size_t nr_input, const void* buf, size_t buf_len); +} MGBOprLoader; + +//! APIs provided by megbrain +typedef struct MGBExternCOprApi { + /*! + * \brief register an operator loader + * + * content of the loader would be copied + * + * \return true if registration succeeds; false if duplicated name + */ + int (*register_loader)(const MGBOprLoader* loader); + + /*! + * \brief unregister a MGBOprLoader + * \return whether any loader is removed (i.e. whether the name exists) + */ + int (*unregister_loader)(const char* name); +} MGBExternCOprApi; + +//! get API ptr for specific version; return nullptr if version mismatch +MGB_PUBLIC const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version); + +#ifdef __cplusplus +} +#endif + +//! get the API ptr for current header version; return nullptr on mismatch +static inline const MGBExternCOprApi* mgb_get_extern_c_opr_api() { + return mgb_get_extern_c_opr_api_versioned(MGB_EXTERN_C_OPR_VERSION); +} + +static inline size_t mgb_get_dtype_size(MGBDType dtype) { + switch (dtype) { + case MGB_DTYPE_INT32: + return 4; + case MGB_DTYPE_FLOAT32: + return 4; + case MGB_DTYPE_UINT8: + return 1; + case MGB_DTYPE_FLOAT16: + case MGB_DTYPE_INT16: + return 2; + default: + __builtin_trap(); + return -1; + } +} + +static inline void mgb_init_opr_desc( + MGBOprDesc* desc, uint32_t nr_output, const char* type_name) { + memset(desc, 0, sizeof(MGBOprDesc)); + desc->size = sizeof(MGBOprDesc); + desc->nr_output = nr_output; + desc->type_name = type_name; +} + +#undef MGB_PUBLIC +#endif // MEGBRAIN_EXTERN_C_OPR_H + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index af9ad2e3..30930510 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -183,7 +183,7 @@ target_include_directories( if(NOT TINYNN_BUILD_FOR_NOT_STANDARD_OS) add_executable(tinynn_test_lite example/standard_OS/lite_main.c) - target_link_libraries(tinynn_test_lite TinyNN m) + target_link_libraries(tinynn_test_lite TinyNN m dl) endif() if(TINYNN_ACHIEVE_ALL AND NOT TINYNN_BUILD_FOR_NOT_STANDARD_OS) diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index 77ec8975..4035a68c 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -7,7 +7,9 @@ */ #include +#include #include +#include "extern_c_opr.h" #include "lite-c/common_enum_c.h" #include "lite-c/global_c.h" #include "lite-c/network_c.h" @@ -156,30 +158,115 @@ static 
TinyNnCallBack g_cb = { .tinynn_fread_cb = fread, }; #endif + +void usage(){ + fprintf(stderr, + "Usage:\n" + "\t--input-model/-m: input model path\n" + "\t--output-dir/-o: output file path\n" + "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" + "\t--input-data/-d: var=path/to/data_file\n" + "\t--data-shape/-s: data shape\n" + "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" + "\t--c-opr-init-interface/-i: the init API of your loader\n" + ); +} + +#if defined(_WIN32) +#include +#include +#define RTLD_LAZY 0 + +static void* dlopen(const char* file, int) { + return (void*)(LoadLibrary(file)); +} + +static void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return (void*)symbol; +} + +#else +#include +#endif + int main(int argc, char** argv) { LITE_set_log_level(WARN); #if TINYNN_CALLBACK_ENABLE register_tinynn_cb(TINYNN_CB_VERSION, g_cb); #endif - if (argc < 2) { - fprintf(stderr, "input error, please run with:\n"); - fprintf(stderr, - "tinynn_test " - " " - "..." - "\n"); - return -1; + char* model_path = NULL; + char* output_dir = NULL; + int print_out = 0; + char* data_str = NULL; + char* data_shape_str = NULL; + char* extern_so = NULL; + const char* c_opr_lib_interface = "mgb_c_opr_init"; + + const struct option long_options[] = { + {"input-model", required_argument, 0, 'm'}, + {"output-dir", required_argument, 0, 'o'}, + {"log-level", required_argument, 0, 'l'}, + {"input-data", required_argument, 0, 'd'}, + {"data-shape", required_argument, 0, 's'}, + {"c-opr-lib", required_argument, 0, 'c'}, + {"c-opr-init-interface", required_argument, 0, 'i'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + const char* shortopt = "m:o:l:d:s:c:i:h"; + int c, option_idx = 0; + while(1) { + c = getopt_long(argc, argv, shortopt, long_options, &option_idx); + if(c == -1){ + break; + } + switch(c){ + case 'm': + model_path = optarg; + break; + case 'o': + output_dir = optarg; + break; + case 'l': + print_out = atoi(optarg); + break; + case 'd': + data_str = optarg; + break; + case 's': + data_shape_str = optarg; + break; + case 'c': + extern_so = optarg; + break; + case 'i': + c_opr_lib_interface = optarg; + break; + case 'h': + usage(); + exit(0); + break; + default: + abort(); + } } - const char* model_path = argv[1]; - const char* output_dir = argc > 2 ? argv[2] : NULL; - int print_out = argc > 3 ? atoi(argv[3]) : 0; + if (print_out == 2) { LITE_set_log_level(INFO); } else if (print_out == 3) { LITE_set_log_level(DEBUG); } - char* data_str = argc > 4 ? argv[4] : NULL; - char* data_shape_str = argc > 5 ? 
argv[5] : NULL; + + if(extern_so){ + void* handle = dlopen(extern_so, RTLD_LAZY); + EXAMPLE_ASSERT(handle, "load loader failed.\n"); + void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; + *(void**)&func = dlsym(handle, c_opr_lib_interface); + EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); + func(mgb_get_extern_c_opr_api_versioned); + } + LiteNetwork model; LITE_CAPI_CHECK( LITE_make_network(&model, *default_config(), *default_network_io()), @@ -283,6 +370,7 @@ int main(int argc, char** argv) { } LITE_CAPI_CHECK(LITE_destroy_network(model), "delete model failed\n"); + return 0; } diff --git a/runtime/schema/model.fbs b/runtime/schema/model.fbs index aadaefcc..abcadad1 100644 --- a/runtime/schema/model.fbs +++ b/runtime/schema/model.fbs @@ -176,6 +176,18 @@ table Reshape { output: int; } +table ExternOpr { + // the input tensor idx + input: [int]; + // opr name + name: string; + // opr user data + data: string; + data_len: uint32; + // the output tensor idx + output: [int]; +} + enum ArithMode : byte { ROUND = 0, NEGATE, @@ -348,9 +360,10 @@ union Instruction { IndexingMultiAxis = 14, ArgSort = 15, Reshape = 16, + ExternOpr = 17, // terminator to mark the end of instruction definitions // all instruction types should be placed before here - INSTRUCTION_TABLE_END = 17 + INSTRUCTION_TABLE_END = 18 } // device base data for a Model diff --git a/runtime/src/cheader/model_reader.h b/runtime/src/cheader/model_reader.h index dabd191d..cc8fe563 100644 --- a/runtime/src/cheader/model_reader.h +++ b/runtime/src/cheader/model_reader.h @@ -71,6 +71,10 @@ typedef const struct MegCC_Reshape_table *MegCC_Reshape_table_t; typedef struct MegCC_Reshape_table *MegCC_Reshape_mutable_table_t; typedef const flatbuffers_uoffset_t *MegCC_Reshape_vec_t; typedef flatbuffers_uoffset_t *MegCC_Reshape_mutable_vec_t; +typedef const struct MegCC_ExternOpr_table *MegCC_ExternOpr_table_t; +typedef struct MegCC_ExternOpr_table *MegCC_ExternOpr_mutable_table_t; +typedef const flatbuffers_uoffset_t *MegCC_ExternOpr_vec_t; +typedef flatbuffers_uoffset_t *MegCC_ExternOpr_mutable_vec_t; typedef const struct MegCC_Arithmetic_table *MegCC_Arithmetic_table_t; typedef struct MegCC_Arithmetic_table *MegCC_Arithmetic_mutable_table_t; typedef const flatbuffers_uoffset_t *MegCC_Arithmetic_vec_t; @@ -240,6 +244,15 @@ typedef flatbuffers_uoffset_t *MegCC_Model_mutable_vec_t; #endif #define MegCC_Reshape_type_hash ((flatbuffers_thash_t)0xe65906ba) #define MegCC_Reshape_type_identifier "\xba\x06\x59\xe6" +#ifndef MegCC_ExternOpr_file_identifier +#define MegCC_ExternOpr_file_identifier flatbuffers_identifier +#endif +/* deprecated, use MegCC_ExternOpr_file_identifier */ +#ifndef MegCC_ExternOpr_identifier +#define MegCC_ExternOpr_identifier flatbuffers_identifier +#endif +#define MegCC_ExternOpr_type_hash ((flatbuffers_thash_t)0x6183fc9d) +#define MegCC_ExternOpr_type_identifier "\x9d\xfc\x83\x61" #ifndef MegCC_Arithmetic_file_identifier #define MegCC_Arithmetic_file_identifier flatbuffers_identifier #endif @@ -805,6 +818,20 @@ __flatbuffers_define_vector_field(0, MegCC_Reshape, inputs, flatbuffers_int32_ve __flatbuffers_define_vector_field(1, MegCC_Reshape, input_types, MegCC_TensorType_vec_t, 0) __flatbuffers_define_scalar_field(2, MegCC_Reshape, output, flatbuffers_int32, int32_t, INT32_C(0)) +struct MegCC_ExternOpr_table { uint8_t unused__; }; + +static inline size_t MegCC_ExternOpr_vec_len(MegCC_ExternOpr_vec_t vec) +__flatbuffers_vec_len(vec) +static inline MegCC_ExternOpr_table_t 
MegCC_ExternOpr_vec_at(MegCC_ExternOpr_vec_t vec, size_t i) +__flatbuffers_offset_vec_at(MegCC_ExternOpr_table_t, vec, i, 0) +__flatbuffers_table_as_root(MegCC_ExternOpr) + +__flatbuffers_define_vector_field(0, MegCC_ExternOpr, input, flatbuffers_int32_vec_t, 0) +__flatbuffers_define_string_field(1, MegCC_ExternOpr, name, 0) +__flatbuffers_define_string_field(2, MegCC_ExternOpr, data, 0) +__flatbuffers_define_scalar_field(3, MegCC_ExternOpr, data_len, flatbuffers_uint32, uint32_t, UINT32_C(0)) +__flatbuffers_define_vector_field(4, MegCC_ExternOpr, output, flatbuffers_int32_vec_t, 0) + struct MegCC_Arithmetic_table { uint8_t unused__; }; static inline size_t MegCC_Arithmetic_vec_len(MegCC_Arithmetic_vec_t vec) @@ -992,7 +1019,8 @@ __flatbuffers_define_union(flatbuffers_, MegCC_Instruction) #define MegCC_Instruction_IndexingMultiAxis ((MegCC_Instruction_union_type_t)UINT8_C(14)) #define MegCC_Instruction_ArgSort ((MegCC_Instruction_union_type_t)UINT8_C(15)) #define MegCC_Instruction_Reshape ((MegCC_Instruction_union_type_t)UINT8_C(16)) -#define MegCC_Instruction_INSTRUCTION_TABLE_END ((MegCC_Instruction_union_type_t)UINT8_C(17)) +#define MegCC_Instruction_ExternOpr ((MegCC_Instruction_union_type_t)UINT8_C(17)) +#define MegCC_Instruction_INSTRUCTION_TABLE_END ((MegCC_Instruction_union_type_t)UINT8_C(18)) static inline const char *MegCC_Instruction_type_name(MegCC_Instruction_union_type_t type) { @@ -1014,6 +1042,7 @@ static inline const char *MegCC_Instruction_type_name(MegCC_Instruction_union_ty case MegCC_Instruction_IndexingMultiAxis: return "IndexingMultiAxis"; case MegCC_Instruction_ArgSort: return "ArgSort"; case MegCC_Instruction_Reshape: return "Reshape"; + case MegCC_Instruction_ExternOpr: return "ExternOpr"; case MegCC_Instruction_INSTRUCTION_TABLE_END: return "INSTRUCTION_TABLE_END"; default: return ""; } @@ -1039,6 +1068,7 @@ static inline int MegCC_Instruction_is_known_type(MegCC_Instruction_union_type_t case MegCC_Instruction_IndexingMultiAxis: return 1; case MegCC_Instruction_ArgSort: return 1; case MegCC_Instruction_Reshape: return 1; + case MegCC_Instruction_ExternOpr: return 1; case MegCC_Instruction_INSTRUCTION_TABLE_END: return 1; default: return 0; } diff --git a/runtime/src/vm.c b/runtime/src/vm.c index b02891b5..8618946e 100644 --- a/runtime/src/vm.c +++ b/runtime/src/vm.c @@ -18,6 +18,7 @@ void register_all(VM* vm) { register_dimshuffle(vm); register_broadcast_shape_of(vm); register_reshape(vm); + register_extern_opr(vm); } VM* vm_global_inst() { diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c new file mode 100644 index 00000000..e39ca310 --- /dev/null +++ b/runtime/src/vm/extern_opr.c @@ -0,0 +1,283 @@ +/** + * \file runtime/src/vm/extern_opr.c + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "vm.h" +#include "utils.h" +#include "extern_c_opr.h" + +#if ENABLE_INST_EXTERN_OPR + +#include +#include +#include +#include + +#include "init.h" +#include "parse.h" +#include "vm/common.h" +#include "vm/instruction.h" +#include "vm/registry.h" + +typedef struct LoaderMap { + MGBOprLoader loader; +} LoaderMap; + +typedef struct LoaderMapVec { + LoaderMap* loader_map; + size_t size; + size_t capacity; +} LoaderMapVec; + +static LoaderMapVec loader_maps; + +static int insert_loader(LoaderMapVec* lm, const MGBOprLoader* loader) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, loader->name) == 0) { + return 0; + } + } + + if (lm->capacity == 0) { + lm->capacity = 2; + lm->loader_map = tinynn_malloc(sizeof(LoaderMap) * lm->capacity); + } + if (lm->size >= lm->capacity) { + lm->capacity *= 2; + LoaderMap* tmp = tinynn_malloc(sizeof(LoaderMap) * lm->capacity); + memcpy(tmp, lm->loader_map, sizeof(LoaderMap) * lm->size); + tinynn_free(lm->loader_map); + lm->loader_map = tmp; + } + + lm->loader_map[lm->size].loader = *loader; + ++lm->size; + return 1; +} + +static int register_loader(const MGBOprLoader* loader) { + return insert_loader(&loader_maps, loader); +} + +static int delete_loader(LoaderMapVec* lm, const char* name) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, name) == 0) { + memmove(lm->loader_map + i, lm->loader_map + i + 1, + lm->size - i - 1); + --lm->size; + return 1; + } + } + return 0; +} + +static int unregister_loader(const char* name) { + return delete_loader(&loader_maps, name); +} + +static LoaderMap* find_loader_by_name(const LoaderMapVec* lm, + const char* name) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, name) == 0) { + return lm->loader_map + i; + } + } + return NULL; +} + +static void free_loader_maps(LoaderMapVec* lm) { + if (lm->loader_map) { + tinynn_free(lm->loader_map); + lm->loader_map = NULL; + } +} + +//! get API ptr for specific version; return nullptr if version mismatch +const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { + static MGBExternCOprApi api; + api.unregister_loader = unregister_loader; + TINYNN_ASSERT_MSG(version >= 0x24, "Extern opr loader version must greater than 0x24.\n"); + + if (version != MGB_EXTERN_C_OPR_VERSION) { + return NULL; + } + + api.register_loader = register_loader; + return &api; +} + +// Convert Tensor to MGBTensor, except MGBTensor.data. 
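+// Dims are copied element-wise and the dtype enum is mapped
+// (TinyNN_FLOAT -> MGB_DTYPE_FLOAT32, TinyNN_FLOAT16 -> MGB_DTYPE_FLOAT16,
+// TinyNN_INT -> MGB_DTYPE_INT32, TinyNN_INT16 -> MGB_DTYPE_INT16,
+// TinyNN_UINT8 -> MGB_DTYPE_UINT8); MGBTensor2Tensor below performs the
+// reverse mapping, and the data pointers are only filled in just before
+// desc->execute() is called.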
+static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ + mgb_tensor->layout.shape.ndim = tensor->layout.nr_dim; + for(int i = 0; i < tensor->layout.nr_dim; ++i){ + mgb_tensor->layout.shape.shape[i] = tensor->layout.dims[i]; + } + switch(tensor->dtype.type_enum){ + case TinyNN_FLOAT: + mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT32; + break; + case TinyNN_FLOAT16: + mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT16; + break; + case TinyNN_INT: + mgb_tensor->layout.dtype = MGB_DTYPE_INT32; + break; + case TinyNN_INT16: + mgb_tensor->layout.dtype = MGB_DTYPE_INT16; + break; + case TinyNN_UINT8: + mgb_tensor->layout.dtype = MGB_DTYPE_UINT8; + break; + default: + TINYNN_ASSERT_MSG(0, "Unsupport data type\n"); + } +} + +static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ + tensor->layout.nr_dim = mgb_tensor->layout.shape.ndim; + for(int i = 0; i < mgb_tensor->layout.shape.ndim; ++i){ + tensor->layout.dims[i] = mgb_tensor->layout.shape.shape[i]; + } + + switch(mgb_tensor->layout.dtype){ + case MGB_DTYPE_FLOAT32: + tensor->dtype.type_enum = TinyNN_FLOAT; + break; + case MGB_DTYPE_FLOAT16: + tensor->dtype.type_enum = TinyNN_FLOAT16; + break; + case MGB_DTYPE_INT32: + tensor->dtype.type_enum = TinyNN_INT; + break; + case MGB_DTYPE_INT16: + tensor->dtype.type_enum = TinyNN_INT16; + break; + case MGB_DTYPE_UINT8: + tensor->dtype.type_enum = TinyNN_UINT8; + break; + default: + TINYNN_ASSERT_MSG(0, "Unsupport data type\n"); + } +} + +static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, + VM* vm) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + DeviceModel* model = get_active_device_model(vm); + ns(ExternOpr_table_t) fbs_extern_opr = (ns(ExternOpr_table_t))(fbs_inst); + inst->tag = TinyNN_INST_EXTERN_OPR; + + flatbuffers_int32_vec_t fbs_inputs = ns(ExternOpr_input(fbs_extern_opr)); + extern_opr->nr_input = flatbuffers_int32_vec_len(fbs_inputs); + extern_opr->inputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_input); + for(int i = 0; i < extern_opr->nr_input; ++i){ + extern_opr->inputs[i] = model->tensors + fbs_inputs[i]; + } + + flatbuffers_int32_vec_t fbs_outputs = ns(ExternOpr_output(fbs_extern_opr)); + extern_opr->nr_output = flatbuffers_int32_vec_len(fbs_outputs); + extern_opr->outputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->outputs[i] = model->tensors + fbs_outputs[i]; + } + + const char* name = ns(ExternOpr_name(fbs_extern_opr)); + const void* data = ns(ExternOpr_data(fbs_extern_opr)); + size_t data_len = ns(ExternOpr_data_len(fbs_extern_opr)); + + LoaderMap* loader_map = find_loader_by_name(&loader_maps, name); + TINYNN_ASSERT_MSG(loader_map, "Wrong loader.\n"); + extern_opr->desc = loader_map->loader.create_desc(extern_opr->nr_input, + data, data_len); + + extern_opr->mgb_inputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); + MGBTensorShape* inputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); + MGBDType* inputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); + for(int i = 0; i < extern_opr->nr_input; ++i){ + Tensor2MGBTensor(extern_opr->inputs[i], extern_opr->mgb_inputs + i); + inputs_shape[i] = extern_opr->mgb_inputs[i].layout.shape; + inputs_type[i] = extern_opr->mgb_inputs[i].layout.dtype; + } + + extern_opr->mgb_outputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); + MGBTensorShape* outputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); + MGBDType* 
outputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); + + extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, outputs_shape); + if(extern_opr->desc->infer_dtype){ + extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, outputs_type); + }else{ + for(int i = 0; i < extern_opr->nr_output; ++i){ + outputs_type[i] = inputs_type[0]; + } + } + + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs[i].layout.dtype = outputs_type[i]; + extern_opr->mgb_outputs[i].layout.shape.ndim = outputs_shape[i].ndim; + for(int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j){ + extern_opr->mgb_outputs[i].layout.shape.shape[j] = outputs_shape[i].shape[j]; + } + } + + tinynn_free(outputs_shape); + tinynn_free(outputs_type); + + tinynn_free(inputs_shape); + tinynn_free(inputs_type); + + return TinyNN_SUCCESS; +} + +static TinyNNStatus execute(Instruction* inst, VM* vm) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + + for(int i = 0; i < extern_opr->nr_input; ++i){ + extern_opr->mgb_inputs[i].data = extern_opr->inputs[i]->ptr; + } + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs[i].data = extern_opr->outputs[i]->ptr; + } + extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, extern_opr->mgb_outputs); + for(int i = 0; i < extern_opr->nr_output; ++i){ + MGBTensor2Tensor(extern_opr->mgb_outputs + i, extern_opr->outputs[i]); + } + + return TinyNN_SUCCESS; +} + +static TinyNNStatus destruct(VM* vm, Instruction* inst) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + + FREE(extern_opr->inputs); + FREE(extern_opr->outputs); + FREE(extern_opr->mgb_outputs); + FREE(extern_opr->mgb_inputs); + + free_loader_maps(&loader_maps); + + return TinyNN_SUCCESS; +} + +void register_extern_opr(VM* vm) { + vm_register_instruction_load(vm, ns(Instruction_ExternOpr), &load); + vm_register_instruction_call(vm, TinyNN_INST_EXTERN_OPR, &execute); + vm_register_instruction_destruct(vm, TinyNN_INST_EXTERN_OPR, &destruct); +} +#else +void register_extern_opr(VM* vm) {} + +const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int i) { + TINYNN_ASSERT_MSG( + 0, + "Should NOT execute here!!!\n" + "Maybe there is no extern opr in model, " + "but command line argument --c-opr-lib/-c is provided.\n"); + return NULL; +} +#endif +// vim: syntax=cpp.doxygen diff --git a/runtime/src/vm/instruction.h b/runtime/src/vm/instruction.h index 0896839c..3fa4fbd3 100644 --- a/runtime/src/vm/instruction.h +++ b/runtime/src/vm/instruction.h @@ -12,6 +12,7 @@ #include "data_struct.h" #include "model_reader.h" #include "runtime_inst_switch.h" +#include "extern_c_opr.h" // clang-format off #define FOR_EACH_INSTRUCTION_TYPE(cb) \ @@ -30,7 +31,8 @@ cb(TinyNN_INST_TYPECVT)\ cb(TinyNN_INST_INDEXING_MULTI_AXIS)\ cb(TinyNN_INST_ARGSORT)\ - cb(TinyNN_INST_RESHAPE) + cb(TinyNN_INST_RESHAPE)\ + cb(TinyNN_INST_EXTERN_OPR) typedef enum { TinyNN_INST_NONE = 0, @@ -202,6 +204,16 @@ typedef struct { Tensor* output; } Reshape; +typedef struct { + int32_t nr_input; + int32_t nr_output; + Tensor** inputs; + MGBTensor* mgb_inputs; + MGBTensor* mgb_outputs; + Tensor** outputs; + MGBOprDesc* desc; +} ExternOpr; + typedef struct Instruction { InstructionType tag; union { @@ -221,6 +233,7 @@ typedef struct Instruction { IndexingMultiAxis indexing_multi_axis; ArgSort argsort; Reshape reshape; + ExternOpr extern_opr; } workload; #if TINYNN_PROFILE_KERNEL float time_ms; diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt index 
7cf9db71..1d0b0759 100644 --- a/runtime/test/CMakeLists.txt +++ b/runtime/test/CMakeLists.txt @@ -68,7 +68,8 @@ target_include_directories( ./runtime/ ${SCHEMA_OUTPUT} ${PROJECT_SOURCE_DIR}/../../third_party/flatcc/include - ${PROJECT_SOURCE_DIR}/../../third_part/gtest/include) + ${PROJECT_SOURCE_DIR}/../../third_part/gtest/include + ${PROJECT_SOURCE_DIR}/../../immigration/include) target_link_libraries(TinyNNTest gtest) diff --git a/runtime/version.ld b/runtime/version.ld index 356a3a34..76275193 100644 --- a/runtime/version.ld +++ b/runtime/version.ld @@ -4,6 +4,7 @@ global: default_config; default_network_io; register_tinynn_cb; + mgb_get_extern_c_opr_api_versioned; local: diff --git a/script/build_and_test_not_standard_os.sh b/script/build_and_test_not_standard_os.sh index 9437ef54..cdfdbb44 100755 --- a/script/build_and_test_not_standard_os.sh +++ b/script/build_and_test_not_standard_os.sh @@ -28,7 +28,7 @@ cmake --build "$MEGCC_BUILD_DIR" -j$(nproc) --target mgb-to-tinynn --target mgb- function check_key_words() { #elf self mangle words, we do not care!! - white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE Date: Wed, 28 Dec 2022 17:20:05 +0800 Subject: [PATCH 02/17] feat(compiler): add quant fuse_and_relu and Convbackdata naive kernel --- .../BareMetal/ConvBackDataKernel.cpp | 52 +++-- .../KernelGen/BareMetal/ElemwiseMultiType.cpp | 14 +- compiler/test/kernel/opr/naive/conv.cpp | 190 ++++++++++++------ .../kernel/opr/naive/elemwise_multitype.cpp | 3 +- 4 files changed, 174 insertions(+), 85 deletions(-) diff --git a/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp b/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp index 2cd81319..9a453f0b 100644 --- a/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp +++ b/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp @@ -11,7 +11,6 @@ #include #include -#include "Activation.h" #include "ConvBackDataKernel.h" #include "FormatHelper.h" #include "Utils/StringTemplate.h" @@ -64,6 +63,19 @@ std::string gen_inline_addr(std::string format_str, std::string sparse) { return ss.str(); } +std::string gen_dep() { + return R"( + static inline int8_t fp32_to_int8(float src){ + int res = roundf(src); + if(res > 127){ + res=127; + }else if(res < -128){ + res=-128; + } + return (int8_t)(res); + } + )"; +} std::string get_format(TContext* ctx) { auto format_str = ctx->getAttrStr("format"); return format_str; @@ -83,12 +95,17 @@ bool ConvBackDataGeneral::IsAvailable(TContext* ctx) const { bool param_mode_ok = (ctx->getAttrStr("format") == "NCHW" || ctx->getAttrStr("format") == "NCHW44") && ctx->getAttrStr("mode") == "CROSS_CORRELATION"; - bool type_float_ok = ctx->getAttrInt("nr_operands") >= 3 && + bool type_float_ok = ctx->getAttrInt("nr_operands") == 3 && ((ctx->getAttrOprand("operand:0").dtype == "f32" && ctx->getAttrOprand("operand:1").dtype == "f32" && ctx->getAttrOprand("operand:2").dtype == "f32")); + bool type_qint_ok = + ctx->getAttrInt("nr_operands") == 3 && + (Utils::is_quant_dtype(ctx->getAttrOprand("operand:0").dtype, 8) && + Utils::is_quant_dtype(ctx->getAttrOprand("operand:1").dtype, 8) && + Utils::is_quant_dtype(ctx->getAttrOprand("operand:2").dtype, 8)); - return param_mode_ok && (type_float_ok); + return param_mode_ok && (type_float_ok || type_qint_ok); } std::string ConvBackDataGeneral::GetKernelSymbol(TContext* ctx) const { @@ -136,8 +153,13 @@ std::string 
ConvBackDataGeneral::GetKernelBody(TContext* context) const { auto flt_specifier = Utils::cvt_dtype_specifier(flt_dtype); auto dst_specifier = Utils::cvt_dtype_specifier(dst_dtype); std::string acc_specifier = "float"; + std::string convert = ""; + std::string compute_kern = "(*sval) + dval * fval"; if (src_specifier == "int8_t" && flt_specifier == "int8_t") { - acc_specifier = "int"; + convert = "fp32_to_int8"; + compute_kern = + "((*sval) * scale + dval * dst_scale * fval * " + "flt_scale)/scale"; } uint32_t spatial_start = 2; @@ -168,10 +190,10 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { ss << R"( #include )"; - ss << GenActivation::gen_func_call_with_typecvt_dep( - noline_mode, acc_specifier, dst_specifier) - << "\n"; ss << gen_inline_addr(filter_format_str, sparse_str); + if (src_specifier == "int8_t" && flt_specifier == "int8_t") { + ss << gen_dep(); + } ss << GenCommonRet() << " " << GetKernelSignature(context) << "{\n"; ss << "const uint32_t spatial_start = " << spatial_start << ";\n"; ss << "const uint32_t channel_pos = " << channel_pos << ";\n"; @@ -244,7 +266,7 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { uint32_t oc_idx = group_idx * ocpg + ocpg_idx; for (uint32_t oh_idx = 0; oh_idx < oh; ++oh_idx) { for (uint32_t ow_idx = 0; ow_idx < ow; ++ow_idx) { - ${acc_specifier} dval = dst_ptr[${dst_layout_iter_symbol}(batch_idx, oc_idx, oh_idx, + ${dst_specifier} dval = dst_ptr[${dst_layout_iter_symbol}(batch_idx, oc_idx, oh_idx, ow_idx, dst_layout.stride, true)]; for (uint32_t fh_idx = 0; fh_idx < fh; ++fh_idx) { @@ -258,14 +280,16 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { ++icpg_idx) { uint32_t ic_idx = group_idx * icpg + icpg_idx; - ${acc_specifier}* sval = &src_ptr[${src_layout_iter_symbol}( + ${src_specifier}* sval = &src_ptr[${src_layout_iter_symbol}( batch_idx, ic_idx, ih_idx, iw_idx, src_layout.stride, false)]; - ${acc_specifier} fval = flt_ptr[${filter_iter_symbol}( + ${flt_specifier} fval = flt_ptr[${filter_iter_symbol}( group_idx, ocpg_idx, icpg_idx, fh_idx, fw_idx, filter_stride)]; - *sval += dval * fval; + ${acc_specifier} tmp_mid_val0 = ${compute_kern}; + ${src_specifier} tmp_mid_val = ${convert}(tmp_mid_val0); + *sval = tmp_mid_val; } } } @@ -290,14 +314,12 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { .add("filter_iter_symbol", "get_filter_addr_" + filter_format_str + "_" + sparse_str) - .add("act_func", GenActivation::gen_func_call_with_typecvt( - noline_mode, "dval", acc_specifier, - dst_specifier, "scale", - "flt_scale", "dst_scale")) .add("src_specifier", src_specifier) .add("flt_specifier", flt_specifier) .add("dst_specifier", dst_specifier) .add("acc_specifier", acc_specifier) + .add("convert", convert) + .add("compute_kern", compute_kern) .render(body_template); return ss.str(); } diff --git a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp index 5d384819..b5cec2c9 100644 --- a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp +++ b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp @@ -35,7 +35,14 @@ std::string gen_dep(std::string mode) { std::string gen_binary(std::string mode) { if (mode == "QADD") { - return "fp32_to_int8((scale_0 * val_0 + scale_1 * val_1) * scale_div)"; + return "float out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " + "val_1) * scale_div);"; + } else if (mode == "QFUSE_ADD_RELU") { + return R"( + float val0 = scale_0 * val_0; + float 
val1 = scale_1 * val_1; + float out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); + )"; } else { CC_ABORT << "not support mode " << mode.c_str() << "\n"; } @@ -48,7 +55,7 @@ bool ElemwiseMultiTypeKernel::IsAvailable(TContext* context) const { auto mode = context->getAttrStr("mode"); auto nr_operands = context->getAttrInt("nr_operands"); bool nr_operands_ok = nr_operands == 3; - bool mode_ok_binary = mode == "QADD"; + bool mode_ok_binary = mode == "QADD" || mode == "QFUSE_ADD_RELU"; return nr_operands_ok && (mode_ok_binary); } @@ -99,7 +106,8 @@ std::string ElemwiseMultiTypeKernel::GetKernelBody(TContext* context) const { for(size_t i = 0; i < nr_elem; ++i){ ${op0_specifier} val_0 = input_0[i]; ${op1_specifier} val_1 = input_1[i]; - output_data[i] = ${act}; + ${act}; + output_data[i] = out_val; } return TinyNN_SUCCESS; } diff --git a/compiler/test/kernel/opr/naive/conv.cpp b/compiler/test/kernel/opr/naive/conv.cpp index a234282d..838b81dc 100644 --- a/compiler/test/kernel/opr/naive/conv.cpp +++ b/compiler/test/kernel/opr/naive/conv.cpp @@ -11,7 +11,58 @@ using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; - +namespace { +void nchw_backdata(Checker& checker) { + ConvolutionBackwardData::Param param; + param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; + param.format = ConvolutionBackwardData::Param::Format::NCHW; + checker.set_epsilon(1e-4); + for (size_t n : {2}) + for (size_t oc : {1, 4}) + for (size_t ic : {1, 4}) + for (size_t hw : {7, 12}) + for (size_t kernel : {1, 3}) + for (size_t pad : {(size_t)0, kernel / 2}) + for (size_t stride : {1, 2}) { + param.pad_h = pad; + param.pad_w = pad; + param.stride_h = stride; + param.stride_w = stride; + param.sparse = + ConvBiasForward::Param::Sparse::DENSE; + checker.set_param(param); + checker.execs({{oc, ic, kernel, kernel}, + {n, oc, hw, hw}, + {n, ic, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2}}); + if (ic == oc) { + size_t group = oc; + param.sparse = ConvolutionBackwardData:: + Param::Sparse::GROUP; + checker.set_param(param); + checker.execs( + {{group, 1, 1, kernel, kernel}, + {n, oc, hw, hw}, + {n, ic, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2}}); + } + } +} +} // namespace TEST(NAIVE, ConvBiasNCHWQS8) { Checker checker(Arch::BAREMETAL); checker.set_kernel_symbol("kernel_.*"); @@ -222,42 +273,7 @@ TEST(NAIVE, ConvBackDataNCHW) { Checker checker(Arch::BAREMETAL); checker.set_kernel_symbol("kernel_.*"); ConvolutionBackwardData::Param param; - ConstRNG seq(2.0); - ConstRNG const_rng(1.0); - - checker.set_rng(1, &seq); - checker.set_rng(0, &const_rng); - - param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; - param.format = ConvolutionBackwardData::Param::Format::NCHW; - checker.set_epsilon(1e-4); - for (size_t n : {2}) - for (size_t oc : {1, 4}) - for (size_t ic : {1, 4}) - for (size_t hw : {7, 12}) - for (size_t kernel : {1, 3}) - for (size_t pad : {(size_t)0, kernel / 2}) - for (size_t stride : {1, 2}) { - param.pad_h = pad; - param.pad_w = pad; - param.stride_h = stride; - param.stride_w = stride; - param.sparse = ConvBiasForward::Param:: - Sparse::DENSE; - checker.set_param(param); - checker.execs({{oc, ic, kernel, kernel}, - {n, oc, hw, hw}, - {n, ic, (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, 
(hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2}}); - if (ic == oc) { - size_t group = oc; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::GROUP; - checker.set_param(param); - checker.execs({{ group, 1, 1, kernel, kernel}, - {n, oc, hw, hw}, - {n, ic, (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2}}); - } - } + nchw_backdata(checker); } TEST(NAIVE, ConvBackDataNCHW44) { @@ -267,35 +283,77 @@ TEST(NAIVE, ConvBackDataNCHW44) { checker.set_epsilon(1e-4); param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; param.format = ConvolutionBackwardData::Param::Format::NCHW44; - for (size_t n : {12}) - for (size_t oc : {4, 12}) - for (size_t ic : {4, 12}) - for (size_t hw : {7, 12}) - for (size_t kernel : {1, 3}) - for (size_t pad : {(size_t)0, kernel / 2}) - for (size_t stride : {1, 2}) { - param.pad_h = pad; - param.pad_w = pad; - param.stride_h = stride; - param.stride_w = stride; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::DENSE; + for (size_t n : {12}) + for (size_t oc : {4, 12}) + for (size_t ic : {4, 12}) + for (size_t hw : {7, 12}) + for (size_t kernel : {1, 3}) + for (size_t pad : {(size_t)0, kernel / 2}) + for (size_t stride : {1, 2}) { + param.pad_h = pad; + param.pad_w = pad; + param.stride_h = stride; + param.stride_w = stride; + param.sparse = ConvolutionBackwardData::Param:: + Sparse::DENSE; + checker.set_param(param); + checker.execs( + {{oc / 4, ic / 4, kernel, kernel, 4, 4}, + {n, oc / 4, hw, hw, 4}, + {n, ic / 4, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2, + 4}}); + if (ic == oc) { + size_t group = oc; + param.sparse = ConvolutionBackwardData:: + Param::Sparse::GROUP; checker.set_param(param); - checker.execs({{oc / 4, ic / 4, kernel, - kernel, 4, 4}, - {n, oc / 4, hw, hw, 4}, - {n, ic / 4 , (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2, 4} - }); - if (ic == oc) { - size_t group = oc; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::GROUP; - checker.set_param(param); - checker.execs({{group / 4, 1, 1, kernel, - kernel, 4}, - {n, oc / 4, hw, hw, 4}, - {n, ic / 4 , (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2, 4} - }); - } + checker.execs( + {{group / 4, 1, 1, kernel, kernel, + 4}, + {n, oc / 4, hw, hw, 4}, + {n, ic / 4, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2, + 4}}); } + } +} + +TEST(NAIVE, ConvBackDataNCHWQS8) { + Checker checker(Arch::BAREMETAL); + checker.set_kernel_symbol("kernel_.*"); + ConvolutionBackwardData::Param param; + + checker.set_dtype(0, dtype::QuantizedS8(1.0f)); + checker.set_dtype(1, dtype::QuantizedS8(2.0f)); + checker.set_dtype(2, dtype::QuantizedS8(2.0f)); + nchw_backdata(checker); +} + +TEST(NAIVE, ConvBackDataNCHWQS8Overflow) { + Checker checker(Arch::BAREMETAL); + checker.set_kernel_symbol("kernel_.*"); + ConvolutionBackwardData::Param param; + UniformIntRNG qint_rng(30, 50); + checker.set_rng(0, &qint_rng); + checker.set_rng(1, &qint_rng); + + checker.set_dtype(0, dtype::QuantizedS8(1.0f)); + checker.set_dtype(1, dtype::QuantizedS8(2.0f)); + checker.set_dtype(2, dtype::QuantizedS8(2.0f)); + nchw_backdata(checker); } diff --git a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp index 
82b29831..4e3c1fa0 100644 --- a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp +++ b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp @@ -20,7 +20,8 @@ TEST(NAIVE, ElementwiseMultitypeBinary) { checker.set_dtype(1, dtype::QuantizedS8(2.f)); checker.set_dtype(2, dtype::QuantizedS8(3.f)); ElemwiseMultiType::Param param; - for (auto mode : {MODE::QADD}) { + + for (auto mode : {MODE::QADD, MODE::QFUSE_ADD_RELU}) { param.mode = mode; checker.set_param(param); checker.execs({{1}, {1}, {}}); From 61b1d8e925270db5c4be9cf7fa257b6ab4cd3150 Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Thu, 29 Dec 2022 13:25:30 +0800 Subject: [PATCH 03/17] feat(misc): misc opt 1: config run iter count 2: make compiler more friendly for debugging --- compiler/CMakeLists.txt | 2 +- runtime/example/standard_OS/lite_main.c | 99 +++++++++++++++---------- runtime/src/lite/network.c | 4 +- 3 files changed, 61 insertions(+), 44 deletions(-) diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt index 9bdd7419..d904a0bf 100644 --- a/compiler/CMakeLists.txt +++ b/compiler/CMakeLists.txt @@ -115,7 +115,7 @@ set(TCC_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/../third_party/tcc/include) set(RUNTIME_SRC_DIR ${PROJECT_SOURCE_DIR}/../runtime/src) set(RUNTIME_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/../runtime/include) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -g") include(cmake/GenLiteSchema.cmake) gen_lite_schema() diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index 4035a68c..ce7fe294 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -6,8 +6,8 @@ * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ -#include #include +#include #include #include "extern_c_opr.h" #include "lite-c/common_enum_c.h" @@ -58,27 +58,31 @@ static void write_file(const char* file_name, void* ptr, size_t length) { } static inline void run_model(LiteNetwork model, const char* output_dir, - int instance_cnt, const int print_out) { + int instance_cnt, const int print_out, + const size_t warmup_count, + const size_t iter_count) { + size_t number = iter_count; + size_t warmup = warmup_count; #if TINYNN_DUMP_TENSOR || DEBUG_MODE - const int number = 1; - const int warmup = 0; -#else - const int number = 100; - const int warmup = 20; + number = 1; + warmup = 0; + printf("(DEBUG or TINYNN_DUMP_TENSOR enable)overwriting run iter to: %zu, " + "warmup count to: %zu\n", + number, warmup); #endif - for (int i = 0; i < warmup; i++) { + for (size_t i = 0; i < warmup; i++) { LITE_CAPI_CHECK(LITE_forward(model), "run model failed\n"); LITE_CAPI_CHECK(LITE_wait(model), "wait model failed\n"); - printf("warmup iter %d finished.\n\n", i); + printf("warmup iter %zu finished.\n", i); } struct timeval start; struct timeval end; gettimeofday(&start, NULL); - for (int i = 0; i < number; i++) { + for (size_t i = 0; i < number; i++) { LITE_CAPI_CHECK(LITE_forward(model), "run model failed\n"); LITE_CAPI_CHECK(LITE_wait(model), "wait model failed\n"); - printf("execute iter %d finished.\n", i); + printf("execute iter %zu finished.\n", i); } gettimeofday(&end, NULL); unsigned long diff = @@ -159,17 +163,19 @@ static TinyNnCallBack g_cb = { }; #endif -void usage(){ - fprintf(stderr, - "Usage:\n" - "\t--input-model/-m: input model path\n" - "\t--output-dir/-o: output file path\n" - "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" - "\t--input-data/-d: var=path/to/data_file\n" - "\t--data-shape/-s: 
data shape\n" - "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" - "\t--c-opr-init-interface/-i: the init API of your loader\n" - ); +void usage() { + fprintf(stderr, + "Usage:\n" + "\t--input-model/-m: input model path\n" + "\t--output-dir/-o: output file path\n" + "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" + "\t--input-data/-d: var=path/to/data_file, create by: " + "python3 compiler/script/debug/gen_input.py\n" + "\t--data-shape/-s: data shape\n" + "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" + "\t--c-opr-init-interface/-i: the init API of your loader\n" + "\t--warmup-count/-w: warmup count before run model\n" + "\t--iter-count/-t: iter run model\n"); } #if defined(_WIN32) @@ -202,26 +208,29 @@ int main(int argc, char** argv) { char* data_shape_str = NULL; char* extern_so = NULL; const char* c_opr_lib_interface = "mgb_c_opr_init"; + size_t warmup_count = 1; + size_t iter = 10; const struct option long_options[] = { - {"input-model", required_argument, 0, 'm'}, - {"output-dir", required_argument, 0, 'o'}, - {"log-level", required_argument, 0, 'l'}, - {"input-data", required_argument, 0, 'd'}, - {"data-shape", required_argument, 0, 's'}, - {"c-opr-lib", required_argument, 0, 'c'}, - {"c-opr-init-interface", required_argument, 0, 'i'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - const char* shortopt = "m:o:l:d:s:c:i:h"; + {"input-model", required_argument, 0, 'm'}, + {"output-dir", required_argument, 0, 'o'}, + {"log-level", required_argument, 0, 'l'}, + {"input-data", required_argument, 0, 'd'}, + {"data-shape", required_argument, 0, 's'}, + {"c-opr-lib", required_argument, 0, 'c'}, + {"c-opr-init-interface", required_argument, 0, 'i'}, + {"warmup-count", required_argument, 0, 'w'}, + {"iter-count", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0}}; + const char* shortopt = "m:o:l:d:s:c:i:w:t:h"; int c, option_idx = 0; - while(1) { + while (1) { c = getopt_long(argc, argv, shortopt, long_options, &option_idx); - if(c == -1){ + if (c == -1) { break; } - switch(c){ + switch (c) { case 'm': model_path = optarg; break; @@ -243,6 +252,12 @@ int main(int argc, char** argv) { case 'i': c_opr_lib_interface = optarg; break; + case 'w': + warmup_count = atoi(optarg); + break; + case 't': + iter = atoi(optarg); + break; case 'h': usage(); exit(0); @@ -251,14 +266,14 @@ int main(int argc, char** argv) { abort(); } } - + if (print_out == 2) { LITE_set_log_level(INFO); } else if (print_out == 3) { LITE_set_log_level(DEBUG); } - if(extern_so){ + if (extern_so) { void* handle = dlopen(extern_so, RTLD_LAZY); EXAMPLE_ASSERT(handle, "load loader failed.\n"); void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; @@ -266,7 +281,7 @@ int main(int argc, char** argv) { EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); func(mgb_get_extern_c_opr_api_versioned); } - + LiteNetwork model; LITE_CAPI_CHECK( LITE_make_network(&model, *default_config(), *default_network_io()), @@ -354,7 +369,8 @@ int main(int argc, char** argv) { nr_input, input_cnt); } } - run_model(model, output_dir, instance_cnt, print_out); + run_model(model, output_dir, instance_cnt, print_out, warmup_count, + iter); for (size_t i = 0; i < nr_input; ++i) { free(data[i]); } @@ -366,7 +382,8 @@ int main(int argc, char** argv) { } //! 
if no input data set, just run the model with random input data if (instance_cnt == 0) { - run_model(model, output_dir, instance_cnt, print_out); + run_model(model, output_dir, instance_cnt, print_out, warmup_count, + iter); } LITE_CAPI_CHECK(LITE_destroy_network(model), "delete model failed\n"); diff --git a/runtime/src/lite/network.c b/runtime/src/lite/network.c index 2d8a8bf7..35b6f6e0 100644 --- a/runtime/src/lite/network.c +++ b/runtime/src/lite/network.c @@ -201,7 +201,7 @@ int LITE_forward(const LiteNetwork network) { Layout in_layout = opr->inputs[0]->layout; Layout out_layout = opr->outputs[0]->layout; LOG_ERROR( - " instruction %s \n%f \t" + " instruction: %s \nuse %fms \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)] \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)]\n", opr->type, inst->time_ms / inst->time_count, @@ -335,7 +335,7 @@ int LITE_destroy_network(LiteNetwork network) { //! preprocessed weight for (int i = 0; i < model->nr_processed_weight; i++) { Tensor* weight = model->processed_weights + i; - if(!weight->is_shared) + if (!weight->is_shared) model->device.free(weight->ptr); } FREE(model->processed_weights); From 87235b4244e23fc33013c81ecdac15e701571e3e Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Thu, 29 Dec 2022 13:40:08 +0800 Subject: [PATCH 04/17] feat(misc): increase priority of gemv/gevm --- compiler/lib/KernelGen/KernelGen.cpp | 32 ++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index f7be9a13..20e68428 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -32,15 +32,29 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { //! arm64v7 is used by tinycv, nn opr should be armv64 or armv7, not arm64v7 auto deduce_func = GetDeduceLayout(kernel_type); if (arch == Arch::ARM64 || arch == Arch::ARM64V7) { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); - a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), - armcommon_kerns.end()); - a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); - return {a64_kerns, deduce_func}; + if (kernel_type == KernelPack::KernType::MatrixMulKernel) { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + armcommon_kerns.insert(armcommon_kerns.end(), a64_kerns.begin(), + a64_kerns.end()); + armcommon_kerns.insert(armcommon_kerns.end(), gi_kerns.begin(), + gi_kerns.end()); + return {armcommon_kerns, deduce_func}; + } else { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), + armcommon_kerns.end()); + a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); + return {a64_kerns, deduce_func}; + } + } else if (arch == Arch::ARMV7) { auto a32_kerns = Armv7::ArchKernelPack::GetKernel(kernel_type); From 63f677ce44706b13dea11ff527229e246511cabc Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Fri, 30 Dec 2022 17:58:52 +0800 Subject: [PATCH 05/17] feat(compiler): add qrelu 
mode of elemwisemultitype --- .../KernelGen/BareMetal/ElemwiseMultiType.cpp | 57 +++++++++++++++++-- .../kernel/opr/naive/elemwise_multitype.cpp | 16 ++++++ 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp index b5cec2c9..94785a79 100644 --- a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp +++ b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp @@ -32,16 +32,26 @@ std::string gen_dep(std::string mode) { } )"; } +std::string gen_unary(std::string mode) { + if (mode == "QRELU") { + return "int8_t out_val = fp32_to_int8(((scale_0 * val_0) > 0?(scale_0 " + "* " + "val_0 ):0) * scale_div)"; + } else { + CC_ABORT << "not support mode " << mode.c_str() << "\n"; + } + return ""; +} std::string gen_binary(std::string mode) { if (mode == "QADD") { - return "float out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " + return "int8_t out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " "val_1) * scale_div);"; } else if (mode == "QFUSE_ADD_RELU") { return R"( float val0 = scale_0 * val_0; float val1 = scale_1 * val_1; - float out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); + int8_t out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); )"; } else { CC_ABORT << "not support mode " << mode.c_str() << "\n"; @@ -54,9 +64,11 @@ std::string gen_binary(std::string mode) { bool ElemwiseMultiTypeKernel::IsAvailable(TContext* context) const { auto mode = context->getAttrStr("mode"); auto nr_operands = context->getAttrInt("nr_operands"); - bool nr_operands_ok = nr_operands == 3; - bool mode_ok_binary = mode == "QADD" || mode == "QFUSE_ADD_RELU"; - return nr_operands_ok && (mode_ok_binary); + bool nr_operands_ok = nr_operands == 2 || nr_operands == 3; + bool mode_ok_unary = nr_operands == 2 && mode == "QRELU"; + bool mode_ok_binary = + nr_operands == 3 && (mode == "QADD" || mode == "QFUSE_ADD_RELU"); + return nr_operands_ok && (mode_ok_unary || mode_ok_binary); } std::string ElemwiseMultiTypeKernel::GetKernelSymbol(TContext* context) const { @@ -74,8 +86,41 @@ std::string ElemwiseMultiTypeKernel::GetKernelBody(TContext* context) const { writer << gen_dep(mode); writer << GenCommonRet() << " "; writer << GetKernelSignature(context); + if (context->getAttrInt("nr_operands") == 2) { + auto op0 = context->getAttrOprand("operand:0"); + auto dst = context->getAttrOprand("operand:1"); + CC_ASSERT(Utils::is_quant_dtype(op0.dtype, 8) && + Utils::is_quant_dtype(dst.dtype, 8)); + auto op0_specifier = Utils::cvt_dtype_specifier(op0.dtype); + auto dst_specifier = Utils::cvt_dtype_specifier(dst.dtype); + std::string binary_str = R"({ + ${op0_specifier}* input_0 = (${op0_specifier}*)inputs[0]->ptr; + float scale_0 = inputs[0]->dtype.param.scale; + TINYNN_ASSERT(input_0); + ${dst_specifier}* output_data = (${dst_specifier}*)outputs[0]->ptr; + float scale_dst = outputs[0]->dtype.param.scale; + TINYNN_ASSERT(output_data); + float scale_div = 1.f / scale_dst; - if (context->getAttrInt("nr_operands") == 3) { + Layout in_layout = inputs[0]->layout; + size_t nr_elem = 1; + for (int i = 0; i < in_layout.nr_dim; ++i) { + nr_elem *= in_layout.dims[i]; + } + for(size_t i = 0; i < nr_elem; ++i){ + ${op0_specifier} val_0 = input_0[i]; + ${act}; + output_data[i] = out_val; + } + return TinyNN_SUCCESS; + } + )"; + writer << StringTemplate::StringTemplateArgs() + .add("op0_specifier", op0_specifier) + .add("dst_specifier", dst_specifier) + .add("act", 
gen_unary(mode)) + .render(binary_str); + } else if (context->getAttrInt("nr_operands") == 3) { auto op0 = context->getAttrOprand("operand:0"); auto op1 = context->getAttrOprand("operand:1"); auto dst = context->getAttrOprand("operand:2"); diff --git a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp index 4e3c1fa0..38ec4cce 100644 --- a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp +++ b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp @@ -12,6 +12,22 @@ using namespace megdnn; using namespace megcc::test; using MODE = ElemwiseMultiType::Param::Mode; +TEST(NAIVE, ElementwiseMultitypeUnary) { + Checker checker; + checker.set_kernel_symbol("kernel_.*"); + checker.set_epsilon(1e-4); + checker.set_dtype(0, dtype::QuantizedS8(1.f)); + checker.set_dtype(1, dtype::QuantizedS8(2.f)); + ElemwiseMultiType::Param param; + for (auto mode : {MODE::QRELU}) { + param.mode = mode; + checker.set_param(param); + checker.execs({{1}, {}}); + checker.execs({{1, 10}, {}}); + checker.execs({{1, 10, 12, 13}, {}}); + } +} + TEST(NAIVE, ElementwiseMultitypeBinary) { Checker checker; checker.set_kernel_symbol("kernel_.*"); From bfdeb57f26b73bdfac153e53788509227b4e3b7f Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Tue, 20 Dec 2022 15:14:03 +0800 Subject: [PATCH 06/17] feat(compiler/tools): add kernel export tools --- compiler/include/compiler/Common/Logger.h | 6 +- compiler/lib/Common/Logger.cpp | 18 + .../ConvKernel/Int8/Int8DotConvNchwNchw44.cpp | 1 + .../ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp | 4 +- .../ConvKernel/F32ConvNCHWNCHW443x3s2.cpp | 5 +- .../ConvKernel/Fp32ConvNchwNchw44.cpp | 42 +- compiler/tools/CMakeLists.txt | 1 + compiler/tools/kernel_exporter/CMakeLists.txt | 7 + .../tools/kernel_exporter/config_attr.cpp | 1076 +++++++++++++++++ compiler/tools/kernel_exporter/config_attr.h | 33 + .../tools/kernel_exporter/exporter_imp.cpp | 134 ++ compiler/tools/kernel_exporter/exporter_imp.h | 73 ++ .../tools/kernel_exporter/tinynn-exporter.cpp | 53 + compiler/tools/kernel_exporter/utils.cpp | 60 + compiler/tools/kernel_exporter/utils.h | 14 + script/build_and_test_not_standard_os.sh | 2 +- script/release_megcc.sh | 7 + 17 files changed, 1517 insertions(+), 19 deletions(-) create mode 100644 compiler/tools/kernel_exporter/CMakeLists.txt create mode 100644 compiler/tools/kernel_exporter/config_attr.cpp create mode 100644 compiler/tools/kernel_exporter/config_attr.h create mode 100644 compiler/tools/kernel_exporter/exporter_imp.cpp create mode 100644 compiler/tools/kernel_exporter/exporter_imp.h create mode 100644 compiler/tools/kernel_exporter/tinynn-exporter.cpp create mode 100644 compiler/tools/kernel_exporter/utils.cpp create mode 100644 compiler/tools/kernel_exporter/utils.h diff --git a/compiler/include/compiler/Common/Logger.h b/compiler/include/compiler/Common/Logger.h index a510cd01..868b9347 100644 --- a/compiler/include/compiler/Common/Logger.h +++ b/compiler/include/compiler/Common/Logger.h @@ -20,6 +20,8 @@ enum class LogLevel : uint32_t { void SetLogLevel(LogLevel); +void setAssertThrow(bool); + LogLevel GetLogLevel(); class Logger { @@ -59,8 +61,8 @@ class Logger { class LogFatal : public Logger { public: - LogFatal() : Logger(LogLevel::ERROR) {} - ~LogFatal() { abort(); } + LogFatal(); + ~LogFatal(); }; #define LOG_DEBUG megcc::Logger::debug() diff --git a/compiler/lib/Common/Logger.cpp b/compiler/lib/Common/Logger.cpp index daf78066..f8dc1968 100644 --- a/compiler/lib/Common/Logger.cpp +++ b/compiler/lib/Common/Logger.cpp 
@@ -7,15 +7,22 @@ */ #include "compiler/Common/Logger.h" +#include using namespace megcc; static LogLevel GlobalLogLevel = LogLevel::WARN; +static bool g_is_assert_throw = false; + void megcc::SetLogLevel(LogLevel level) { GlobalLogLevel = level; } +void megcc::setAssertThrow(bool is_throw) { + g_is_assert_throw = is_throw; +} + LogLevel megcc::GetLogLevel() { return GlobalLogLevel; } @@ -36,4 +43,15 @@ Logger Logger::error() { return Logger(LogLevel::ERROR); } +LogFatal::LogFatal() : Logger(LogLevel::ERROR) { +#if __EXCEPTIONS + if (g_is_assert_throw) { + throw std::exception(); + } +#endif +} +LogFatal::~LogFatal() { + abort(); +} + // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp index 5f2d5611..0d255fc1 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp @@ -44,6 +44,7 @@ bool ConvDotNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvDotNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT((src_tensor.shape.size()) > 0) << "src_tensor.shape.size > 0"; uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp index 1fadf62f..71fc0fad 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp @@ -49,6 +49,8 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -241,7 +243,7 @@ std::string render_kernel(TContext* ctx) { std::string mode = ctx->haveAttr("nonlineMode") ? 
ctx->getAttrStr("nonlineMode") : "IDENTITY"; - + auto activate_gen = create_activation_gener_instrinsic(mode); auto src_tensor = ctx->getAttrOprand("operand:0"); diff --git a/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp b/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp index 1c508cf8..a38958b5 100644 --- a/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp +++ b/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp @@ -50,6 +50,8 @@ bool ConvFloatNCHWNCHW443x3s2::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW443x3s2::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -117,7 +119,8 @@ std::string ConvFloatNCHWNCHW443x3s2::GetInitBody(TContext* ctx) const { return writer.str(); } -std::string ConvFloatNCHWNCHW443x3s2::GetWorkspaceBody(TContext* context) const { +std::string ConvFloatNCHWNCHW443x3s2::GetWorkspaceBody( + TContext* context) const { std::stringstream ss; ss << R"( static inline int round_up(int x, int d){ diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp index 0a3a1e67..86fba8ad 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp @@ -47,6 +47,8 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -151,7 +153,8 @@ std::string render_init(int c_idx, int nr_ow, bool with_bias) { for (int src_idx = 0; src_idx < nr_ow; ++src_idx) { if (with_bias) { ss << "c[" << c_idx << "][" << src_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + " << c_idx << " * 4));"; + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + " + << c_idx << " * 4));"; } else { ss << "c[" << c_idx << "][" << src_idx << "] = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.f));"; @@ -172,20 +175,23 @@ std::string render_core(int src_reg_size, int filter_size, bool is_big_oc, } else { for (int src_idx = 0; src_idx < src_reg_size; ++src_idx) { fw_ss << "src[" << src_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + ${fh_idx} * packed_iw + " + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + " + "${fh_idx} * packed_iw + " << src_idx << "* ${simd_len}));\n"; } } for (int fw_idx = 0; fw_idx < filter_size; ++fw_idx) { fw_ss << "weight[0][" << fw_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + ${fh_idx} * ${ld_weight_fw} + " + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + " + "${fh_idx} * ${ld_weight_fw} + " << fw_idx << " * ${simd_len}));\n"; } if (is_big_oc) { for (int fw_idx = 0; fw_idx < filter_size; ++fw_idx) { fw_ss << "weight[1][" << fw_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + ${ld_weight_oc} + " + << "] = 
GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + " + "${ld_weight_oc} + " "${fh_idx} * " "${ld_weight_fw} + " << fw_idx << " * ${simd_len}));\n"; @@ -195,14 +201,22 @@ std::string render_core(int src_reg_size, int filter_size, bool is_big_oc, auto src_idx = fw_idx; auto weight_idx = fw_idx; for (int i = 0; i < nr_ow; ++i) { - fw_ss << "c[0][" << i << "] = GiFloat32Type2FixLenType(GiSimdFmaLane(GiFixLenType2GiFloat32Type(c[0][" << i - << "]), GiFixLenType2GiFloat32Type(weight[0][" << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i + fw_ss << "c[0][" << i + << "] = " + "GiFloat32Type2FixLenType(GiSimdFmaLane(" + "GiFixLenType2GiFloat32Type(c[0][" + << i << "]), GiFixLenType2GiFloat32Type(weight[0][" + << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i << " * ${stride} + " << src_idx << ") / 4]), " << (i * stride + src_idx) % 4 << "));"; if (is_big_oc) { - fw_ss << "c[1][" << i << "] = GiFloat32Type2FixLenType(GiSimdFmaLane(GiFixLenType2GiFloat32Type(c[1][" << i - << "]), GiFixLenType2GiFloat32Type(weight[1][" << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i - << " * ${stride} + " << src_idx << ") / 4]), " + fw_ss << "c[1][" << i + << "] = " + "GiFloat32Type2FixLenType(GiSimdFmaLane(" + "GiFixLenType2GiFloat32Type(c[1][" + << i << "]), GiFixLenType2GiFloat32Type(weight[1][" + << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" + << i << " * ${stride} + " << src_idx << ") / 4]), " << (i * stride + src_idx) % 4 << "));"; } } @@ -223,11 +237,11 @@ std::string render_store(int nr_ow, int c_idx, const std::string& store_offset, const ActivationGenIntrinsicBase& act) { std::stringstream ss; for (int ow_idx = 0; ow_idx < nr_ow; ++ow_idx) { - ss << act.GenIntrinsicFloatStore("GiFixLenType2GiFloat32Type(c[" + std::to_string(c_idx) + "][" + - std::to_string(ow_idx) + "])", - "dst_ptr + " + store_offset + " + " + - std::to_string(ow_idx) + - " * simd_len"); + ss << act.GenIntrinsicFloatStore( + "GiFixLenType2GiFloat32Type(c[" + std::to_string(c_idx) + "][" + + std::to_string(ow_idx) + "])", + "dst_ptr + " + store_offset + " + " + std::to_string(ow_idx) + + " * simd_len"); } return ss.str(); } diff --git a/compiler/tools/CMakeLists.txt b/compiler/tools/CMakeLists.txt index a8642194..c8197ed3 100644 --- a/compiler/tools/CMakeLists.txt +++ b/compiler/tools/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(tinynn-exporter) add_subdirectory(hako-to-mgb) add_subdirectory(dump-kernel) add_subdirectory(megcc-translate) +add_subdirectory(kernel_exporter) \ No newline at end of file diff --git a/compiler/tools/kernel_exporter/CMakeLists.txt b/compiler/tools/kernel_exporter/CMakeLists.txt new file mode 100644 index 00000000..453f6d02 --- /dev/null +++ b/compiler/tools/kernel_exporter/CMakeLists.txt @@ -0,0 +1,7 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +add_llvm_executable(kernel_exporter exporter_imp.cpp config_attr.cpp utils.cpp tinynn-exporter.cpp) +llvm_update_compile_flags(kernel_exporter) +target_link_libraries(kernel_exporter PRIVATE ${dialect_libs} KernelGen Common) +target_compile_options(Common PUBLIC -fexceptions) +target_compile_options(kernel_exporter PUBLIC -fexceptions) +mlir_check_all_link_libraries(kernel_exporter) diff --git a/compiler/tools/kernel_exporter/config_attr.cpp b/compiler/tools/kernel_exporter/config_attr.cpp new file mode 100644 index 00000000..6e72067c --- /dev/null +++ b/compiler/tools/kernel_exporter/config_attr.cpp @@ -0,0 +1,1076 @@ +/** + * \file + * compiler/tools/kernel_exporter/config_attr.cpp + * + * This file is 
part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "config_attr.h" +#include +#include "compiler/Common/TContext.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "megbrain/common.h" +#include "megbrain/reflection.h" +#include "megdnn/basic_types.h" +#include "megdnn/dtype.h" +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs/cv.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/imgproc.h" +#include "megdnn/oprs/linalg.h" +#include "megdnn/oprs/nn.h" +#include "megdnn/oprs/nn_int.h" +#include "utils.h" + +#define megcore_check(expr) \ + do { \ + megcoreStatus_t _err = (expr); \ + if (_err != megcoreSuccess) { \ + fprintf(stderr, "mgb failed : line=%d %s:%d\n", (int)_err, \ + __FILE__, __LINE__); \ + abort(); \ + } \ + } while (0) + +namespace { +#define DEFINE_DNNPARAM2STR(cls) \ + std::string dnnparam_2_str(cls value) { \ + return mgb::reflection::nameOfEnumValue(value); \ + } + +DEFINE_DNNPARAM2STR(ConvParam::Format) +DEFINE_DNNPARAM2STR(ConvParam::Sparse) +DEFINE_DNNPARAM2STR(ConvParam::Mode) +DEFINE_DNNPARAM2STR(megdnn::ElemwiseForward::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::ElemwiseMultiType::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::PoolingForward::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::MatrixMulForward::Param::Format) +DEFINE_DNNPARAM2STR(megdnn::MatrixMulForward::Param::ComputeMode) +DEFINE_DNNPARAM2STR(megdnn::Reduce::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::Reduce::Param::DataType) +DEFINE_DNNPARAM2STR(megdnn::WarpPerspectiveForward::Param::BorderMode) +DEFINE_DNNPARAM2STR(megdnn::WarpPerspectiveForward::Param::InterpolationMode) +DEFINE_DNNPARAM2STR(megdnn::CvtColor::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::Argsort::Param::Order) +#undef DEFINE_DNNPARAM2STR + +int get_int() { + llvm::outs() << "please input a int number" + << "\n"; + std::string ret; + std::string num; + std::cin >> num; + int n = num.size(); + for (int i = 0; i < n; i++) { + if (num[i] >= '0' && num[i] <= '9') { + ret.push_back(num[i]); + } + } + llvm::outs() << "input: " << stoi(ret) << "\n"; + + return stoi(ret); +} + +float get_float() { + llvm::outs() << "please input a float number" + << "\n"; + std::string ret; + std::string num; + std::cin >> num; + int n = num.size(); + for (int i = 0; i < n; i++) { + if ((num[i] >= '0' && num[i] <= '9') || num[i] == '.') { + ret.push_back(num[i]); + } + } + llvm::outs() << "input: " << stof(ret) << "\n"; + + return stof(ret); +} + +std::string support_map_to_msg(const std::map& m) { + std::string msg = "\n"; + for (const auto& i : m) { + msg += std::to_string(i.first); + msg += " = "; + msg += i.second; + msg += ",\n"; + } + + return msg; +} + +std::pair> support_dtype() { + std::map enum2dtype{ + {0, "f32"}, {1, "si8"}, {2, "i32"}, {3, "i16"}, + {4, "ui8"}, {5, "qsi8"}, {6, "qsi32"}}; + + return {support_map_to_msg(enum2dtype), enum2dtype}; +} + +std::pair> support_format() { + std::map format2enum{ + {0, "NCHW"}, {7, "NCHW44"}, {8, "NCHW44_DOT"}}; + + return {support_map_to_msg(format2enum), format2enum}; +} + +} // namespace + +namespace megcc { +namespace exporter { +#define FILL_MAP(_map_name, _parm_name, _attr_name) \ + _map_name[#_attr_name] = CCAttr(_parm_name._attr_name) +#define FILL_MAP_EX(_map_name, _parm_name, _attr_name, _helper_fun) \ + _map_name[#_attr_name] = CCAttr(_helper_fun(_parm_name._attr_name)) +using KernType = KernelGen::KernelPack::KernType; +template +class ParamHelper { 
+public: + using Param = typename Opr::Param; + ParamHelper() { + megcore_check(megcoreCreateDeviceHandle(&m_device_handle, + megcorePlatformCPU)); + megcore_check(megcoreCreateComputingHandle(&m_compute_handle, + m_device_handle)); + m_dnn_handle = megdnn::Handle::make(m_compute_handle, 2); + } + + ~ParamHelper() { + megcore_check(megcoreDestroyComputingHandle(m_compute_handle)); + megcore_check(megcoreDestroyDeviceHandle(m_device_handle)); + } + + Param create_param() { + auto opr = m_dnn_handle->create_operator(); + return opr->param(); + } + +protected: + megcoreDeviceHandle_t m_device_handle; + megcoreComputingHandle_t m_compute_handle; + std::unique_ptr m_dnn_handle; +}; + +std::vector config_attr(KPT k_type, std::string k_name, + bool use_default_attr) { +#define DEC_DTYPE() \ + auto dtypes = support_dtype(); \ + llvm::outs() << "please config \"src type\" " \ + << "support one of: " << dtypes.first << "\n"; \ + auto dtype_enum = get_int(); \ + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { \ + llvm::outs() << "invalid input" \ + << "\n"; \ + abort(); \ + } \ + std::string dtype_input = dtypes.second[dtype_enum] + +#define DEC_FORMAT() \ + auto formats = support_format(); \ + llvm::outs() << "please config \"format\" " \ + << "support one of: " << formats.first << "\n"; \ + auto format_input = get_int(); \ + if (dtypes.second.find(format_input) == formats.second.end()) { \ + llvm::outs() << "invalid input" \ + << "\n"; \ + abort(); \ + } + + std::vector ret; + std::unordered_map attr_map; + if (!use_default_attr) { + llvm::outs() << "+++++++++++++++++++++++++++++++++++++\n"; + llvm::outs() << " please config attr for " << k_name << "\n"; + llvm::outs() << "+++++++++++++++++++++++++++++++++++++\n"; + } + switch (k_type) { + case KPT::TopK: { + megcc::CCOperand cc_operand; + attr_map["nr_operands"] = megcc::CCAttr(1); + if (use_default_attr) { + attr_map["k"] = megcc::CCAttr(10); + attr_map["mode"] = megcc::CCAttr("KTH_ONLY"); + cc_operand.dtype = "f32"; + } else { + llvm::outs() << "please config \"k\"" + << "\n"; + auto int_input = get_int(); + attr_map["k"] = megcc::CCAttr(int_input); + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "KTH_ONLY"}, + {1, "VALUE_IDX_NOSORT"}, + {2, "VALUE_IDX_SORTED"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + std::string input_str = m.second[mode_enum]; + attr_map["mode"] = megcc::CCAttr(input_str); + + DEC_DTYPE(); + cc_operand.dtype = dtype_input; + } + attr_map[llvm::formatv("operand:{0}", 0)] = cc_operand; + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ElemwiseKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + if (use_default_attr) { + param.mode = megdnn::Elemwise::Mode::RELU; + attr_map["mode"] = CCAttr(dnnparam_2_str(param.mode)); + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + res.dtype = "f32"; + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + EXPORT_ERR( + "ElemwiseKernel have so many case , it`s hard to user " + "dynamic config, not support now"); + } + } break; + case KPT::ElemwiseMultiKernel: { + auto&& m_helper = 
ParamHelper(); + auto param = m_helper.create_param(); + if (use_default_attr) { + param.mode = megdnn::ElemwiseMultiType::Mode::QADD; + attr_map["mode"] = CCAttr(dnnparam_2_str(param.mode)); + attr_map["nr_operands"] = megcc::CCAttr(3); + megcc::CCOperand res; + res.dtype = "qsi8"; + attr_map["operand:0"] = megcc::CCAttr(res); + res.dtype = "qsi8"; + attr_map["operand:1"] = megcc::CCAttr(res); + res.dtype = "qsi8"; + attr_map["operand:2"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + EXPORT_ERR( + "ElemwiseMultiType have so many case , it`s hard to " + "user " + "dynamic config, not support now"); + } + } break; + case KPT::PoolingKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! init default attr + res.dtype = "f32"; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = 1; + param.pad_w = 1; + param.window_h = 3; + param.window_w = 3; + param.format = ConvParam::Format::NCHW; + param.mode = megdnn::param::PoolingV0::Mode::MAX; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + llvm::outs() << "please config \"stride_h\"" + << "\n"; + auto int_input = get_int(); + param.stride_h = int_input; + llvm::outs() << "please config \"stride_w\"" + << "\n"; + int_input = get_int(); + param.stride_w = int_input; + llvm::outs() << "please config \"pad_h\"" + << "\n"; + int_input = get_int(); + param.pad_h = int_input; + llvm::outs() << "please config \"pad_w\"" + << "\n"; + int_input = get_int(); + param.pad_w = int_input; + llvm::outs() << "please config \"window_h\"" + << "\n"; + int_input = get_int(); + param.window_h = int_input; + llvm::outs() << "please config \"window_w\"" + << "\n"; + int_input = get_int(); + param.window_w = int_input; + + DEC_FORMAT(); + param.format = static_cast(format_input); + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "MAX"}, + {1, "AVERAGE"}, + {2, "AVERAGE_COUNT_EXCLUDE_PADDING"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + + auto m = support_mode(); + llvm::outs() << "please config \"mode\"" + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = + static_cast(mode_enum); + } + FILL_MAP(attr_map, param, stride_h); + FILL_MAP(attr_map, param, stride_w); + FILL_MAP(attr_map, param, pad_h); + FILL_MAP(attr_map, param, pad_w); + FILL_MAP(attr_map, param, window_h); + FILL_MAP(attr_map, param, window_w); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(3); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + attr_map["operand:2"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::MatrixMulKernel: + case KPT::BatchMatmulKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.transposeA = false; + param.transposeB = false; + param.format = megdnn::param::MatrixMul::Format::DEFAULT; + param.compute_mode = + megdnn::param::MatrixMul::ComputeMode::DEFAULT; + FILL_MAP(attr_map, param, transposeA); + FILL_MAP(attr_map, param, transposeB); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, compute_mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"transposeA\"" + << "\n"; + auto int_input = get_int(); + param.transposeA = int_input; + llvm::outs() << "please config \"transposeB\"" + << "\n"; + int_input = get_int(); + param.transposeB = int_input; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{{0, "DEFAULT"}, + {1, "MK4"}, + {2, "MK8"}, + {3, "MK4_DOT"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"format\"" + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.format = static_cast( + mode_enum); + param.compute_mode = + static_cast(0); + } + FILL_MAP(attr_map, param, transposeA); + FILL_MAP(attr_map, param, transposeB); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, compute_mode, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::MatrixInvKernel: + case KPT::RelayoutKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ReduceKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.axis = 1; + param.mode = megdnn::param::Reduce::Mode::SUM; + param.data_type = megdnn::param::Reduce::DataType::DEFAULT; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\"" + << "\n"; + auto int_input = get_int(); + param.axis = int_input; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "SUM"}, {1, "SUM_SQR"}, {2, "PRODUCT"}, + {3, "MIN"}, {4, "MAX"}, {5, "MEAN"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = + static_cast(mode_enum); + param.data_type = + static_cast(0); + } + + FILL_MAP(attr_map, param, axis); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, data_type, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::IndexingMultiAxisKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::IndexingOneHotKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! init default attr + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\"" + << "\n"; + auto int_input = get_int(); + param.axis = int_input; + } + + FILL_MAP(attr_map, param, axis); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::WarpPerspectiveKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.border_val = 0.1; + param.bmode = megdnn::param::WarpPerspective::BorderMode:: + BORDER_CONSTANT; + param.imode = + megdnn::param::WarpPerspective::InterpolationMode::AREA; + param.format = megdnn::param::WarpPerspective::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + param.border_val = 0.1; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "REPLICATE/BORDER_REPLICATE"}, + {1, "REFLECT/BORDER_REFLECT"}, + {2, "REFLECT_101/BORDER_REFLECT_101"}, + {3, "WRAP/BORDER_WRAP"}, + {4, "CONSTANT/BORDER_CONSTANT"}, + {5, "TRANSPARENT/BORDER_TRANSPARENT"}, + {6, "ISOLATED/BORDER_ISOLATED"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"bmode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.bmode = + megdnn::param::WarpPerspective::BorderMode(mode_enum); + auto support_imode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto im = support_imode(); + llvm::outs() << "please config \"imode\" " + << "support one of: " << im.first << "\n"; + auto imode_enum = get_int(); + if (im.second.find(imode_enum) == im.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = megdnn::param::WarpPerspective::InterpolationMode( + imode_enum); + DEC_FORMAT(); + param.format = + megdnn::param::WarpPerspective::Format(format_input); + } + + FILL_MAP(attr_map, param, border_val); + FILL_MAP_EX(attr_map, param, bmode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::WarpAffineKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.border_val = 0.1; + param.border_mode = + megdnn::param::WarpAffine::BorderMode::BORDER_CONSTANT; + param.imode = + megdnn::param::WarpAffine::InterpolationMode::AREA; + param.format = megdnn::param::WarpAffine::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + param.border_val = 0.1; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "REPLICATE/BORDER_REPLICATE"}, + {1, "REFLECT/BORDER_REFLECT"}, + {2, "REFLECT_101/BORDER_REFLECT_101"}, + {3, "WRAP/BORDER_WRAP"}, + {4, "CONSTANT/BORDER_CONSTANT"}, + {5, "TRANSPARENT/BORDER_TRANSPARENT"}, + {6, "ISOLATED/BORDER_ISOLATED"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"bmode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.border_mode = + megdnn::param::WarpAffine::BorderMode(mode_enum); + auto support_imode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto im = support_imode(); + llvm::outs() << "please config \"imode\" " + << "support one of: " << im.first << "\n"; + auto imode_enum = get_int(); + if (im.second.find(imode_enum) == im.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = megdnn::param::WarpAffine::InterpolationMode( + imode_enum); + DEC_FORMAT(); + param.format = megdnn::param::WarpAffine::Format(format_input); + } + + FILL_MAP(attr_map, param, border_val); + FILL_MAP_EX(attr_map, param, border_mode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::TypeCvtKernel: { + if (use_default_attr) { + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + res.dtype = "ui8"; + attr_map["operand:0"] = megcc::CCAttr(res); + res.dtype = "f32"; + attr_map["operand:1"] = megcc::CCAttr(res); + } else { + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + auto dtypes = support_dtype(); + llvm::outs() << "please config \"src type\" " + << "support one of: " << dtypes.first << "\n"; + auto dtype_enum = get_int(); + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + std::string str_input = dtypes.second[dtype_enum]; + res.dtype = str_input; + attr_map["operand:0"] = megcc::CCAttr(res); + llvm::outs() << "please config \"dst type\" " + << "support one of: " << dtypes.first << "\n"; + dtype_enum = get_int(); + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + str_input = dtypes.second[dtype_enum]; + res.dtype = str_input; + attr_map["operand:1"] = megcc::CCAttr(res); + } + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::PowCKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + param.exp = 2; + res.dtype = "f32"; + + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"exp float value\" " + << "\n"; + float f_input = get_float(); + param.exp = f_input; + } + FILL_MAP(attr_map, param, exp); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::CVTransposeKernel: + case KPT::FlipKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ResizeKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.imode = megdnn::param::Resize::InterpolationMode::NEAREST; + param.format = megdnn::param::Resize::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = + megdnn::param::Resize::InterpolationMode(mode_enum); + DEC_FORMAT(); + param.format = megdnn::param::Resize::Format(format_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::RotateKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.clockwise = true; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"clockwise\" " + << "0 means false, other wise means true: " + << "\n"; + int int_input = get_int(); + param.clockwise = int_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, clockwise); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::RoiCopyKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.row_from = 1; + param.row_to = 1; + param.col_from = 1; + param.col_to = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"row_from\" " + << "\n"; + int int_input = get_int(); + param.row_from = int_input; + llvm::outs() << "please config \"row_to\" " + << "\n"; + int_input = get_int(); + param.row_to = int_input; + llvm::outs() << "please config \"col_from\" " + << "\n"; + int_input = get_int(); + param.col_from = int_input; + llvm::outs() << "please config \"col_to\" " + << "\n"; + int_input = get_int(); + param.col_to = int_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + 
attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, row_from); + FILL_MAP(attr_map, param, row_to); + FILL_MAP(attr_map, param, col_from); + FILL_MAP(attr_map, param, col_to); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::CvtColorKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.mode = megdnn::param::CvtColor::Mode::RGB2YUV; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "RGB2GRAY"}, {1, "RGB2YUV"}, {2, "YUV2RGB"}, + {3, "YUV2RGB"}, {4, "RGBA2RGB"}, {5, "RGBA2BGR"}, + {6, "RGBA2GRAY"}, {7, "RGB2BGR"}, {8, "BGR2GRAY"}, + {9, "BGR2RGB"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = megdnn::param::CvtColor::Mode(mode_enum); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ArgSortKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.order = megdnn::param::Argsort::Order::ASCENDING; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{{0, "ASCENDING"}, + {1, "DESCENDING"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"order\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.order = megdnn::param::Argsort::Order(mode_enum); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["order"] = CCAttr(dnnparam_2_str(param.order)); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ArgmaxKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\" " + << "\n"; + int int_input = get_int(); + param.axis = int32_t(int_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, axis); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ConcatKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\" " + << "\n"; + int int_input = get_int(); + param.axis = int32_t(int_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, axis); + megcc::CodeGenContext ctx(attr_map); + 
ret.push_back(ctx); + } break; + case KPT::ConvKernel: + case KPT::ConvBackDataKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + uint32_t kernel_h = 0, kernel_w = 0; + if (use_default_attr) { + res.dtype = "f32"; + + kernel_h = 3; + kernel_w = 3; + param.sparse = ConvParam::Sparse::DENSE; + param.format = ConvParam::Format::NCHW; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = 1; + param.pad_w = 1; + param.dilate_h = 1; + param.dilate_w = 1; + param.mode = ConvParam::Mode::CONVOLUTION; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"kernel_h\" " + << "\n"; + int int_input = get_int(); + kernel_h = int_input; +#define CB(name) \ + llvm::outs() << "please config: " << #name << "\n"; \ + int_input = get_int(); \ + name = int_input + + CB(kernel_w); + CB(param.stride_h); + CB(param.stride_w); + CB(param.pad_h); + CB(param.pad_w); + CB(param.dilate_h); + CB(param.dilate_w); +#undef CB + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "DENSE"}, + {1, "GROUP"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"sparse\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.sparse = ConvParam::Sparse(mode_enum); + + DEC_FORMAT(); + param.format = ConvParam::Format(format_input); + + param.mode = ConvParam::Mode(1); + } + attr_map["nr_operands"] = megcc::CCAttr(3); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + attr_map["operand:2"] = megcc::CCAttr(res); + attr_map["kernel_h"] = CCAttr(kernel_h); + attr_map["kernel_w"] = CCAttr(kernel_w); + FILL_MAP(attr_map, param, stride_h); + FILL_MAP(attr_map, param, stride_w); + FILL_MAP(attr_map, param, pad_h); + FILL_MAP(attr_map, param, pad_w); + FILL_MAP(attr_map, param, dilate_h); + FILL_MAP(attr_map, param, dilate_w); + FILL_MAP_EX(attr_map, param, sparse, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + default: + EXPORT_ERR(ssprintf("config_attr not imp for: %s", k_name.c_str())); + break; + } + + return ret; +#undef DEC_DTYPE +} + +} // namespace exporter +} // namespace megcc diff --git a/compiler/tools/kernel_exporter/config_attr.h b/compiler/tools/kernel_exporter/config_attr.h new file mode 100644 index 00000000..19adf2d7 --- /dev/null +++ b/compiler/tools/kernel_exporter/config_attr.h @@ -0,0 +1,33 @@ +/** + * \file + * compiler/tools/kernel_exporter/config_attr.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#pragma once + +//#include + +#include "compiler/KernelGen/KernelGen.h" +#include "megbrain/common.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/nn.h" + +namespace { +using ConvParam = megdnn::ConvolutionForward::Param; +using ConvBiasParam = megdnn::ConvBiasForward::Param; +using KPT = megcc::KernelGen::KernelPack::KernType; +using KA = megcc::KernelGen::Arch; + +} // namespace + +namespace megcc { +namespace exporter { + +std::vector config_attr(KPT k_type, std::string k_name, + bool use_default_attr); + +} // namespace exporter +} // namespace megcc diff --git a/compiler/tools/kernel_exporter/exporter_imp.cpp b/compiler/tools/kernel_exporter/exporter_imp.cpp new file mode 100644 index 00000000..f74551a6 --- /dev/null +++ b/compiler/tools/kernel_exporter/exporter_imp.cpp @@ -0,0 +1,134 @@ +/** + * \file compiler/tools/kernel_exporter/exporter_imp.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "exporter_imp.h" + +KPT KernelExporter::kernel_name_to_type() { + KPT ret; + auto m_find = m_kern_name2type.find(m_kernel_name); + if (m_find == m_kern_name2type.end()) { + EXPORT_ERR( + ssprintf("do not support kernel name: %s, support lists:\n%s", + m_kernel_name.c_str(), support_kernels().c_str())); + } else { + ret = m_find->second; + } + + return ret; +} + +KA KernelExporter::get_arch_type() { + KA ret; + auto m_find = m_name2arch.find(m_kernel_arch); + if (m_find == m_name2arch.end()) { + EXPORT_ERR(ssprintf("do not support arch: %s, support archs:\n%s", + m_kernel_arch.c_str(), support_archs().c_str())); + } else { + ret = m_find->second; + } + + return ret; +} + +std::pair, + const megcc::KernelGen::DeduceFunc*> +KernelExporter::get_kernels() { + KPT k_type = kernel_name_to_type(); + KA arch_type = get_arch_type(); + return megcc::KernelGen::KernelPack::GetKernel(k_type, arch_type); +} + +void KernelExporter::gen_kenrels() { + auto kernels = get_kernels().first; + if (kernels.size() <= 0) { + EXPORT_ERR(ssprintf("ERR: can not find any KernelFunc for: %s", + m_kernel_name.c_str())); + } + + auto attrs = megcc::exporter::config_attr( + kernel_name_to_type(), m_kernel_name, m_use_default_attr); + std::string common_header = R"( +#include +#include +#include +)"; + for (auto& i : kernels) { + for (auto& ctx : attrs) { + auto gen = [&]() { + bool is_cv = !i->GetCVKernelSymbol(&ctx).empty(); + auto kernel_file_name = i->GetKernelSymbol(&ctx) + ".c"; + if (is_cv) { + kernel_file_name = i->GetCVKernelSymbol(&ctx) + ".c"; + } + std::stringstream ss; + auto file_path = kernel_file_name; + llvm::outs() << "\n"; + llvm::outs() << "\n"; + ss << common_header; + if (is_cv) { + ss << i->GetCVKernelBody(&ctx) << "\n"; + } else { + ss << i->GetKernelBody(&ctx) << "\n"; + for (auto& d : i->GetDependInternalSymbol(&ctx)) { + ss << d.kernel_body; + } + } + if (m_print_to_console) { + std::cout << ss.rdbuf() << "\n"; + }; + std::ofstream out_file(file_path); + out_file << ss.str(); + out_file.close(); + llvm::outs() << "====>get kernel to: " << file_path << "\n"; + }; + + try { + gen(); + } catch (...) 
{ + } + } + } + + llvm::outs() << "Export tinynnkernel done.\n"; +} + +std::map KernelExporter::m_kern_name2type{ + {"ConvKernel", KPT::ConvKernel}, + {"ElemwiseKernel", KPT::ElemwiseKernel}, + {"ElemwiseMultiKernel", KPT::ElemwiseMultiKernel}, + {"PoolingKernel", KPT::PoolingKernel}, + {"MatrixMulKernel", KPT::MatrixMulKernel}, + {"MatrixInvKernel", KPT::MatrixInvKernel}, + {"RelayoutKernel", KPT::RelayoutKernel}, + {"ReduceKernel", KPT::ReduceKernel}, + {"IndexingMultiAxisKernel", KPT::IndexingMultiAxisKernel}, + {"IndexingOneHotKernel", KPT::IndexingOneHotKernel}, + {"WarpPerspectiveKernel", KPT::WarpPerspectiveKernel}, + {"WarpAffineKernel", KPT::WarpAffineKernel}, + {"TypeCvtKernel", KPT::TypeCvtKernel}, + {"TopK", KPT::TopK}, + {"BatchMatmulKernel", KPT::BatchMatmulKernel}, + {"PowCKernel", KPT::PowCKernel}, + {"CVTransposeKernel", KPT::CVTransposeKernel}, + {"FlipKernel", KPT::FlipKernel}, + {"ResizeKernel", KPT::ResizeKernel}, + {"RotateKernel", KPT::RotateKernel}, + {"RoiCopyKernel", KPT::RoiCopyKernel}, + {"CvtColorKernel", KPT::CvtColorKernel}, + {"ArgSortKernel", KPT::ArgSortKernel}, + {"ArgmaxKernel", KPT::ArgmaxKernel}, + {"ConcatKernel", KPT::ConcatKernel}, + {"ConvBackDataKernel", KPT::ConvBackDataKernel} + +}; + +std::map KernelExporter::m_name2arch{ + {"BAREMETAL", KA::BAREMETAL}, + {"ARM64", KA::ARM64}, + {"ARMV7", KA::ARMV7}, +}; diff --git a/compiler/tools/kernel_exporter/exporter_imp.h b/compiler/tools/kernel_exporter/exporter_imp.h new file mode 100644 index 00000000..6ba734ab --- /dev/null +++ b/compiler/tools/kernel_exporter/exporter_imp.h @@ -0,0 +1,73 @@ +/** + * \file compiler/tools/kernel_exporter/exporter_imp.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include + +#include "config_attr.h" +#include "utils.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Parser.h" + +#include "compiler/Common/Logger.h" +#include "compiler/Common/Version.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace llvm; + +class KernelExporter { + static std::map m_kern_name2type; + static std::map m_name2arch; + + std::string m_kernel_name; + std::string m_kernel_arch; + bool m_use_default_attr; + bool m_print_to_console; + + KPT kernel_name_to_type(); + KA get_arch_type(); + + std::pair, + const megcc::KernelGen::DeduceFunc*> + get_kernels(); + +public: + KernelExporter(std::string kernel_name, std::string kernel_arch, + bool use_default_attr, bool print_to_console) + : m_kernel_name{kernel_name}, + m_kernel_arch{kernel_arch}, + m_use_default_attr(use_default_attr), + m_print_to_console(print_to_console) { + std::string attr = "use kernel default attr"; + if (!m_use_default_attr) { + attr = "use user config attr"; + } + llvm::outs() << "try export tinynn kernel of " << m_kernel_name << "(" + << m_kernel_arch << ")" + << "\n"; + llvm::outs() << "kernel attr: " << attr << "\n"; + llvm::outs() << "print to console: " << m_print_to_console << "\n"; + megcc::setAssertThrow(true); + }; + +#define MAPKEY2STR(m) \ + std::string ret; \ + for (auto i : m) { \ + ret += i.first; \ + ret += "\n"; \ + } \ + return ret; + + static std::string support_kernels() { MAPKEY2STR(m_kern_name2type); } + + static std::string support_archs() { MAPKEY2STR(m_name2arch); } + + void gen_kenrels(); +}; diff --git a/compiler/tools/kernel_exporter/tinynn-exporter.cpp b/compiler/tools/kernel_exporter/tinynn-exporter.cpp new file mode 100644 index 00000000..5246583c --- /dev/null +++ b/compiler/tools/kernel_exporter/tinynn-exporter.cpp @@ -0,0 +1,53 @@ +/** + * \file compiler/tools/kernel_exporter/tinynn-exporter.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include + +#include "config_attr.h" +#include "utils.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Parser.h" + +#include "exporter_imp.h" + +using namespace llvm; + +int main(int argc, char** argv) { + auto k_name_desc = "input kernel name, valid option:\n" + + KernelExporter::support_kernels(); + cl::opt KernelName("kernel", cl::Required, + cl::desc(k_name_desc)); + auto arch_desc = "the platform arch, valid options:\n" + + KernelExporter::support_archs(); + cl::opt kernelArch("arch", cl::Required, cl::desc(arch_desc)); + cl::opt use_default_attr( + "use_default_attr", + cl::desc("Use a default attribute to generate kernel, if not " + "config, user need dynamic config it")); + cl::opt print_to_console("print_to_console", + cl::desc("Print kernel body to console")); + cl::opt Verbose( + "verbose", + cl::desc("log more detail information when compiler model")); + + cl::AddExtraVersionPrinter( + [](raw_ostream& oss) { oss << megcc::getMegccVersionString(); }); + cl::ParseCommandLineOptions(argc, argv); + if (Verbose) { + megcc::SetLogLevel(megcc::LogLevel::DEBUG); + } + KernelExporter exporter(KernelName.getValue(), kernelArch.getValue(), + use_default_attr.getValue(), + print_to_console.getValue()); + exporter.gen_kenrels(); + + return 0; +} diff --git a/compiler/tools/kernel_exporter/utils.cpp b/compiler/tools/kernel_exporter/utils.cpp new file mode 100644 index 00000000..a798feb8 --- /dev/null +++ b/compiler/tools/kernel_exporter/utils.cpp @@ -0,0 +1,60 @@ +/** + * \file compiler/tools/kernel_exporter/utils.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "utils.h" +#include +#include + +inline constexpr const char* convert_fmt_str(const char* fmt) { + return fmt; +} + +std::string svsprintf(const char* fmt, va_list ap_orig) { + fmt = convert_fmt_str(fmt); + int size = 100; /* Guess we need no more than 100 bytes */ + char* p; + + if ((p = (char*)malloc(size)) == nullptr) + goto err; + + for (;;) { + va_list ap; + va_copy(ap, ap_orig); + int n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n < 0) + goto err; + + if (n < size) { + std::string rst(p); + free(p); + return rst; + } + + size = n + 1; + + char* np = (char*)realloc(p, size); + if (!np) { + free(p); + goto err; + } else + p = np; + } + +err: + fprintf(stderr, "could not allocate memory for svsprintf; fmt=%s\n", fmt); + __builtin_trap(); +} + +std::string ssprintf(const char* fmt, ...) { + va_list ap; + va_start(ap, fmt); + auto rst = svsprintf(fmt, ap); + va_end(ap); + return rst; +} diff --git a/compiler/tools/kernel_exporter/utils.h b/compiler/tools/kernel_exporter/utils.h new file mode 100644 index 00000000..48f3d8f8 --- /dev/null +++ b/compiler/tools/kernel_exporter/utils.h @@ -0,0 +1,14 @@ +/** + * \file compiler/tools/kernel_exporter/utils.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#include + +#define EXPORT_ERR(msg) \ + llvm::outs() << msg << "\n"; \ + __builtin_trap(); + +std::string ssprintf(const char* fmt, ...); diff --git a/script/build_and_test_not_standard_os.sh b/script/build_and_test_not_standard_os.sh index cdfdbb44..12c9e161 100755 --- a/script/build_and_test_not_standard_os.sh +++ b/script/build_and_test_not_standard_os.sh @@ -28,7 +28,7 @@ cmake --build "$MEGCC_BUILD_DIR" -j$(nproc) --target mgb-to-tinynn --target mgb- function check_key_words() { #elf self mangle words, we do not care!! - white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE " + white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE MGE<" elf_file=$1 if [ ! -f ${elf_file} ];then echo "ERR: can not find ${elf_file}" diff --git a/script/release_megcc.sh b/script/release_megcc.sh index 9b2c2f2e..eef6c540 100755 --- a/script/release_megcc.sh +++ b/script/release_megcc.sh @@ -32,10 +32,17 @@ pushd ${OUT_DIR}/build_host cmake ${COMPILER_PATH} -G Ninja ninja cp tools/mgb-to-tinynn/mgb-to-tinynn ${OUT_DIR}/bin/ + strip mgb-to-tinynn cp tools/mgb-runner/mgb-runner ${OUT_DIR}/bin/ + strip mgb-runner cp tools/mgb-importer/mgb-importer ${OUT_DIR}/bin/ + strip mgb-importer + cp tools/kernel_exporter/kernel_exporter ${OUT_DIR}/bin/ + strip kernel_exporter cp tools/hako-to-mgb/hako-to-mgb ${OUT_DIR}/bin/ + strip hako-to-mgb cp tools/megcc-opt/megcc-opt ${OUT_DIR}/bin/ + strip megcc-opt popd pushd ${PROJECT_PATH}/compiler GIT_ID=`git rev-parse --short HEAD` From 50559109b17344cc2069274137f703cbb2036eb9 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Tue, 27 Dec 2022 14:34:02 +0800 Subject: [PATCH 07/17] feat(compiler): optimize arm64 sigmoid --- .../KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp | 137 +++++++ .../KernelGen/Arm/Arm64/Elemwise/Elemwise.h | 33 ++ .../Arm64/ElemwiseHelper/ElemwiseHelper.cpp | 45 ++ .../Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h | 103 +++++ .../Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp | 383 ++++++++++++++++++ .../lib/KernelGen/Arm/Arm64/KernelPack.cpp | 5 +- compiler/test/kernel/opr/arm/Elementwise.cpp | 13 + .../kernel/opr/arm/benchmark_elemwise.cpp | 9 + 8 files changed, 727 insertions(+), 1 deletion(-) create mode 100644 compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp create mode 100644 compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp diff --git a/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp new file mode 100644 index 00000000..cf6235e5 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp @@ -0,0 +1,137 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "Elemwise.h" +#include "../ElemwiseHelper/ElemwiseHelper.h" +#include "Arm/ArmCommon/InternalKernel.h" +#include "Utils/SymbolHelper.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; + +bool ElemwiseKernel::IsAvailable(TContext* ctx) const { + //! TODO: now only support float type + int nr_operands = ctx->getAttrInt("nr_operands"); + bool type_ok = true; + for (int i = 0; i < nr_operands; i++) { + type_ok &= (ctx->getAttrOprand("operand:" + std::to_string(i)).dtype == + "f32"); + } + auto mode = ctx->getAttrStr("mode"); + bool mode_ok = mode == "SIGMOID"; + bool ok_input = nr_operands == 2; + bool usable = type_ok && mode_ok && ok_input; + return usable; +} + +std::string ElemwiseKernel::GetKernelSymbol(TContext* context) const { + std::stringstream ss; + ss << "Arm64_kernel_elementwise"; + ss << "_" << context->getAttrStr("mode"); + int nr_operands = context->getAttrInt("nr_operands"); + if (nr_operands == 2) { + ss << "_unary_vec_vec"; + } else { + //! Not implement ternary elemwise kernel + ss << "_invalid_nr_operands_"; + } + //! TODO: add ternary elemwise + ss << "_" << SymbolHelper::gen_io_str(context); + return ss.str(); +} + +std::string ElemwiseKernel::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + int nr_operands = ctx->getAttrInt("nr_operands"); + auto mode = ctx->getAttrStr("mode"); + std::vector operands; + for (int i = 0; i < nr_operands; i++) { + operands.push_back(ctx->getAttrOprand("operand:" + std::to_string(i))); + } + auto ElemwiseImpl = ElemwiseHelperFunc::CreateGenHelper(mode, operands); + auto InternalKernelFunc = ArmCommon::ExpNeonKernel(); + + CC_ASSERT(ElemwiseImpl) << "ElemwiseHelper Create error!\n"; + writer << R"( + #include + #include + #include + #include "tensor_util.h" + )"; + writer << "\n\n"; + writer << "extern " << InternalKernelFunc.GetKernelSignature(ctx) << ";\n"; + writer << R"( + static const struct { + float lower_range; + float upper_range; + float alpha_9; + float alpha_7; + float alpha_5; + float alpha_3; + float alpha_1; + float beta_10; + float beta_8; + float beta_6; + float beta_4; + float beta_2; + float beta_0; + float one_half; +} sigmoid_constants = { + -18.0f, + 18.0f, + 4.37031012579801e-11f, + 1.15627324459942e-07f, + 6.08574864600143e-05f, + 8.51377133304701e-03f, + 2.48287947061529e-01f, + 6.10247389755681e-13f, + 5.76102136993427e-09f, + 6.29106785017040e-06f, + 1.70198817374094e-03f, + 1.16817656904453e-01f, + 9.93151921023180e-01f, + 0.5f, +}; + )"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + //! 
input + output = 2, unary case + if (nr_operands == 2) { + writer << R"( + float* input_data0 = inputs[0]->ptr; + TINYNN_ASSERT(input_data0); + float* output_data = outputs[0]->ptr; + ${ElemwiseImpl(input_data0, output_data)}; + )"; + } else { + CC_ABORT << "not support ternary elemwise.\n"; + } + writer << "\nreturn TinyNN_SUCCESS;\n}"; + + std::stringstream ss; + auto ImpleGen = [=](std::vector strs) { + return ElemwiseImpl->GenCodeBody(strs); + }; + ss << StringTemplate::StringTemplateArgs() + .add("ElemwiseImpl", ImpleGen) + .render(writer.str()); + return ss.str(); +} + +std::vector ElemwiseKernel::GetDependInternalSymbol( + TContext* ctx) const { + std::vector depends; + ArmCommon::ExpNeonKernel kern; + depends.emplace_back(kern.GetKernelSymbol(ctx), kern.GetKernelBody(ctx), + kern.GetBodyGuardBegin(ctx), + kern.GetBodyGuardEnd(ctx)); + return depends; +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h new file mode 100644 index 00000000..787f5dc4 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h @@ -0,0 +1,33 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include "compiler/KernelGen/KernelGen.h" + +namespace megcc { +namespace KernelGen { +namespace Arm64 { + +class ElemwiseKernel : public KernelFunc { +public: + virtual ~ElemwiseKernel(){}; + bool IsAvailable(TContext* context) const override; + std::string GetKernelSymbol(TContext* context) const override; + std::string GetKernelBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; +}; + +} // namespace Arm64 +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp new file mode 100644 index 00000000..a212120e --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp @@ -0,0 +1,45 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "ElemwiseHelper.h" +#include "Utils/SymbolHelper.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; + +#define CASE_DISPATCH(_mode, _helper_name) \ + if (mode == _mode) { \ + return std::make_shared<_helper_name>(); \ + } + +#define CASE_DISPATCH_ARG(_mode, _helper_name, ...) 
\ + if (mode == _mode) { \ + return std::make_shared<_helper_name>(__VA_ARGS__); \ + } + +std::shared_ptr ElemwiseHelperFunc::CreateGenHelper( + std::string mode, std::vector operands) { + size_t nr_operands = operands.size(); + if (nr_operands == 2) { + CASE_DISPATCH("SIGMOID", ElemwiseGenUnarySigmoid); + } else { + CC_ABORT << mode << " not Implement now\n"; + } + return nullptr; +} + +#undef CASE_DISPATCH +#undef CASE_DISPATCH_ARG + +std::string ElemwiseHelperFunc::BcastType2String(BcastType bcast_type) { + return ArmCommon::ElemwiseHelperFunc::BcastType2String(bcast_type); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h new file mode 100644 index 00000000..ff81f71b --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h @@ -0,0 +1,103 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include +#include "Arm/ArmCommon/ArmSimdHelper.h" +#include "Arm/ArmCommon/ElemwiseHelper/ElemwiseHelper.h" +#include "Common/ElemwiseCommon.h" +#include "Utils/StringTemplate.h" +#include "Utils/SymbolHelper.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace Arm64 { + +class ElemwiseGenBase { +public: + //! gen the code out side the compute kernel, just address offset, for loop + virtual std::string GenCodeBody(std::vector) const = 0; + + //! Gen elemwise kernel asm computing init code, init for the necessary simd + //! variable, such as zero in Relu + virtual std::string GenKernelAsmInit(std::vector) const = 0; + + //! Gen the simd elemwise compute code, and the degree of unroll is specific + //! by first param + virtual std::string GenKernelSimdUnroll(std::vector) const = 0; + + //! Gen the naive C elemwise compute code, and the degree of unroll is + //! specific by first param + virtual std::string GenKernelNaiveUnroll( + std::vector) const = 0; + + virtual ~ElemwiseGenBase() {} +}; + +//! The Unary elemwise kernel base +class ElemwiseGenUnary : public ElemwiseGenBase { +public: + std::string m_src_dtype; + std::string m_dst_dtype; + bool m_inline_mode; + std::unique_ptr m_src_simd; + std::unique_ptr m_dst_simd; + bool m_i32_to_qs8; + std::unique_ptr m_common_sigmoid_gen; + ElemwiseGenUnary(std::string src_dtype = "f32", + std::string dst_dtype = "f32", bool inline_mode = false) + : m_src_dtype(src_dtype), + m_dst_dtype(dst_dtype), + m_inline_mode(inline_mode) { + m_src_simd = std::make_unique(src_dtype); + m_dst_simd = std::make_unique(dst_dtype); + m_common_sigmoid_gen = + std::make_unique( + src_dtype, dst_dtype, inline_mode); + m_i32_to_qs8 = Utils::is_int_dtype(m_src_dtype, 32) && + Utils::is_int_dtype(m_dst_dtype, 8); + }; + std::string GenCodeBody(std::vector) const override; + virtual std::string GenInlineName() const = 0; +}; + +//! 
create the elemwise helper implement according to the mode and operand +struct ElemwiseHelperFunc { + static std::shared_ptr CreateGenHelper( + std::string mode, std::vector operands); + static std::string BcastType2String(BcastType bcast_type); +}; + +/************************************Unary***********************************/ + +#define DEFINE_NNARY_OP(_name) \ + class _name : public ElemwiseGenUnary { \ + public: \ + _name(std::string src_dtype = "f32", std::string dst_dtype = "f32", \ + bool inline_mode = false) \ + : ElemwiseGenUnary(SymbolHelper::gen_valid_dtype(src_dtype), \ + SymbolHelper::gen_valid_dtype(dst_dtype), \ + inline_mode) {} \ + std::string GenKernelAsmInit(std::vector) const override; \ + std::string GenKernelSimdUnroll( \ + std::vector) const override; \ + std::string GenKernelNaiveUnroll( \ + std::vector) const override; \ + std::string GenInlineName() const override; \ + }; + +DEFINE_NNARY_OP(ElemwiseGenUnarySigmoid) +#undef DEFINE_NNARY_OP + +} // namespace Arm64 +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp new file mode 100644 index 00000000..b88f6307 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp @@ -0,0 +1,383 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "ElemwiseHelper.h" +#include "Utils/SymbolHelper.h" +#include "Utils/Utils.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; +std::string ElemwiseGenUnary::GenCodeBody(std::vector strs) const { + std::stringstream body_ss; + if (m_inline_mode) { + body_ss << R"(static inline void ${inline_func_name}(const ${src_specifier}* src, ${dst_specifier}* dst, size_t nr_elem)"; + body_ss << "){"; + } else { + body_ss << R"( + Layout in_layout = inputs[0]->layout; + size_t nr_elem = 1; + for (int i = 0; i < in_layout.nr_dim; ++i) { + nr_elem *= in_layout.dims[i]; + } + const ${src_specifier} * src = ${source}; + ${dst_specifier}* dst = ${dst}; + )"; + } + body_ss << R"( + + ${kernel_init()} + + size_t index = offset; + for(; index + 7 < nr_elem; index += 8) { + ${src_simd_specifier} vsrc0 = ${src_ld1q}(src); + ${src_simd_specifier} vsrc1 = ${src_ld1q}(src + 4); + ${kernel_simd_unroll(2, vsrc0, vdst0, vsrc1, vdst1)} + ${dst_store(dst, vdst0)}; + ${dst_store(dst + 4, vdst1)}; + src += 8; + dst += 8; + } + for(; index + 3 < nr_elem; index += 4) { + ${src_simd_specifier} vsrc0 = ${src_ld1q}(src); + ${kernel_simd_unroll(1, vsrc0, vdst0)} + ${dst_store(dst, vdst0)}; + src += 4; + dst += 4; + } + for(; index < nr_elem; index++) { + ${kernel_naive_unroll(1, src, dst)} + src += 1; + dst += 1; + })"; + if (m_inline_mode) { + body_ss << "}"; + } + auto kernel_init = [this](std::vector strs) { + return GenKernelAsmInit(strs); + }; + auto kernel_simd_unroll = [this](std::vector strs) { + return GenKernelSimdUnroll(strs); + }; + auto kernel_naive_unroll = [this](std::vector strs) { + return GenKernelNaiveUnroll(strs); + }; + std::stringstream ss; + auto body_render = StringTemplate::StringTemplateArgs() + .add("kernel_init", kernel_init) + .add("kernel_simd_unroll", kernel_simd_unroll) + .add("kernel_naive_unroll", kernel_naive_unroll) + 
.add("src_specifier", + Utils::cvt_dtype_specifier(m_src_dtype)) + .add("dst_specifier", + Utils::cvt_dtype_specifier(m_dst_dtype)) + .add("src_ld1q", m_src_simd->get_ld1q_symbol()) + .add("dst_store", + [=](std::string ptr, std::string dst_reg) { + return m_dst_simd->get_st1q_symbol() + + "(" + ptr + "," + dst_reg + + ")\n"; + }) + .add("dst_st1q", m_dst_simd->get_st1q_symbol()) + .add("src_simd_specifier", + m_src_simd->get_specifier_q_symbol()); + + if (m_inline_mode) { + body_render.add("inline_func_name", GenInlineName()); + } else { + auto input = strs[0]; + auto output = strs[1]; + body_render.add("source", input).add("dst", output); + } + ss << body_render.render(body_ss.str()); + + return ss.str(); +} + +//! Sigmoid +std::string ElemwiseGenUnarySigmoid::GenInlineName() const { + return "ElemwiseGenUnarySigmoid"; +} +std::string ElemwiseGenUnarySigmoid::GenKernelAsmInit( + std::vector) const { + std::stringstream writer; + writer << R"( + size_t x6_iter = nr_elem / (4 * 6); + size_t offset = x6_iter * 4 * 6; + float32x4_t lower_range; + float32x4_t upper_range; + float32x4_t alpha_9; + float32x4_t alpha_7; + float32x4_t alpha_5; + float32x4_t alpha_3; + float32x4_t alpha_1; + float32x4_t beta_10; + float32x4_t beta_8; + float32x4_t beta_6; + float32x4_t beta_4; + float32x4_t beta_2; + float32x4_t beta_0; + float32x4_t one_half; + + const float* const_ptr = &(sigmoid_constants.lower_range); + if (x6_iter > 0) { + /** + * q0 - q5 : squared + * q6 - q11 : p + * q12- q17 : val(temp), q + * q18- q31 : const + */ + asm volatile( + "ld1r {%[lower_range].4s}, [%[const_ptr]], #4\n" + "ld1r {%[upper_range].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_9].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_7].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_5].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_3].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_1].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_10].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_8].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_6].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_4].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_2].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_0].4s}, [%[const_ptr]], #4\n" + "ld1r {%[one_half].4s}, [%[const_ptr]], #4\n" + + "1:\n" + "ldr q12, [%[a_ptr]] \n" + "ldr q13, [%[a_ptr], #16]\n" + "ldr q14, [%[a_ptr], #32]\n" + "ldr q15, [%[a_ptr], #48]\n" + "ldr q16, [%[a_ptr], #64]\n" + "ldr q17, [%[a_ptr], #80]\n" + // auto val = vmaxq_f32(vdupq_n_f32(sigmoid_constants.lower_range), + // src); + "fmax v12.4s, v12.4s, %[lower_range].4s\n" + "fmax v13.4s, v13.4s, %[lower_range].4s\n" + "fmax v14.4s, v14.4s, %[lower_range].4s\n" + "fmax v15.4s, v15.4s, %[lower_range].4s\n" + "fmax v16.4s, v16.4s, %[lower_range].4s\n" + "fmax v17.4s, v17.4s, %[lower_range].4s\n" + "add %[a_ptr], %[a_ptr], #96\n" + + // val = vminq_f32(vdupq_n_f32(sigmoid_constants.upper_range), val); + "fmin v12.4s, v12.4s, %[upper_range].4s\n" + "fmin v13.4s, v13.4s, %[upper_range].4s\n" + "fmin v14.4s, v14.4s, %[upper_range].4s\n" + "fmin v15.4s, v15.4s, %[upper_range].4s\n" + "fmin v16.4s, v16.4s, %[upper_range].4s\n" + "fmin v17.4s, v17.4s, %[upper_range].4s\n" + + //! 
auto squared = vmulq_f32(val, val); + "fmul v0.4s, v12.4s, v12.4s\n" + "fmul v1.4s, v13.4s, v13.4s\n" + "fmul v2.4s, v14.4s, v14.4s\n" + "fmul v3.4s, v15.4s, v15.4s\n" + "fmul v4.4s, v16.4s, v16.4s\n" + "fmul v5.4s, v17.4s, v17.4s\n" + // auto p = fma_ps_f32( + // vdupq_n_f32(sigmoid_constants.alpha_7), squared, + // vdupq_n_f32(sigmoid_constants.alpha_9)); + "fmul v6.4s, v0.4s, %[alpha_9].4s\n" + "fmul v7.4s, v1.4s, %[alpha_9].4s\n" + "fmul v8.4s, v2.4s, %[alpha_9].4s\n" + "fmul v9.4s, v3.4s, %[alpha_9].4s\n" + "fmul v10.4s, v4.4s, %[alpha_9].4s\n" + "fmul v11.4s, v5.4s, %[alpha_9].4s\n" + "fadd v6.4s, v6.4s, %[alpha_7].4s\n" + "fadd v7.4s, v7.4s, %[alpha_7].4s\n" + "fadd v8.4s, v8.4s, %[alpha_7].4s\n" + "fadd v9.4s, v9.4s, %[alpha_7].4s\n" + "fadd v10.4s, v10.4s, %[alpha_7].4s\n" + "fadd v11.4s, v11.4s, %[alpha_7].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_5), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_5].4s\n" + "fadd v7.4s, v7.4s, %[alpha_5].4s\n" + "fadd v8.4s, v8.4s, %[alpha_5].4s\n" + "fadd v9.4s, v9.4s, %[alpha_5].4s\n" + "fadd v10.4s, v10.4s, %[alpha_5].4s\n" + "fadd v11.4s, v11.4s, %[alpha_5].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_3), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_3].4s\n" + "fadd v7.4s, v7.4s, %[alpha_3].4s\n" + "fadd v8.4s, v8.4s, %[alpha_3].4s\n" + "fadd v9.4s, v9.4s, %[alpha_3].4s\n" + "fadd v10.4s, v10.4s, %[alpha_3].4s\n" + "fadd v11.4s, v11.4s, %[alpha_3].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_1), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_1].4s\n" + "fadd v7.4s, v7.4s, %[alpha_1].4s\n" + "fadd v8.4s, v8.4s, %[alpha_1].4s\n" + "fadd v9.4s, v9.4s, %[alpha_1].4s\n" + "fadd v10.4s, v10.4s, %[alpha_1].4s\n" + "fadd v11.4s, v11.4s, %[alpha_1].4s\n" + + // p = vmulq_f32(p, val); + "fmul v6.4s, v6.4s, v12.4s\n" + "fmul v7.4s, v7.4s, v13.4s\n" + "fmul v8.4s, v8.4s, v14.4s\n" + "fmul v9.4s, v9.4s, v15.4s\n" + "fmul v10.4s, v10.4s, v16.4s\n" + "fmul v11.4s, v11.4s, v17.4s\n" + + // auto q = fma_ps_f32( + // vdupq_n_f32(sigmoid_constants.beta_8), squared, + // vdupq_n_f32(sigmoid_constants.beta_10)); + "fmul v12.4s, v0.4s, %[beta_10].4s\n" + "fmul v13.4s, v1.4s, %[beta_10].4s\n" + "fmul v14.4s, v2.4s, %[beta_10].4s\n" + "fmul v15.4s, v3.4s, %[beta_10].4s\n" + "fmul v16.4s, v4.4s, %[beta_10].4s\n" + "fmul v17.4s, v5.4s, %[beta_10].4s\n" + "fadd v12.4s, v12.4s, %[beta_8].4s\n" + "fadd v13.4s, v13.4s, %[beta_8].4s\n" + "fadd v14.4s, v14.4s, %[beta_8].4s\n" + "fadd v15.4s, v15.4s, %[beta_8].4s\n" + "fadd v16.4s, v16.4s, %[beta_8].4s\n" + "fadd v17.4s, v17.4s, %[beta_8].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_6), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_6].4s\n" + "fadd v13.4s, v13.4s, %[beta_6].4s\n" + "fadd v14.4s, 
v14.4s, %[beta_6].4s\n" + "fadd v15.4s, v15.4s, %[beta_6].4s\n" + "fadd v16.4s, v16.4s, %[beta_6].4s\n" + "fadd v17.4s, v17.4s, %[beta_6].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_4), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_4].4s\n" + "fadd v13.4s, v13.4s, %[beta_4].4s\n" + "fadd v14.4s, v14.4s, %[beta_4].4s\n" + "fadd v15.4s, v15.4s, %[beta_4].4s\n" + "fadd v16.4s, v16.4s, %[beta_4].4s\n" + "fadd v17.4s, v17.4s, %[beta_4].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_2), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_2].4s\n" + "fadd v13.4s, v13.4s, %[beta_2].4s\n" + "fadd v14.4s, v14.4s, %[beta_2].4s\n" + "fadd v15.4s, v15.4s, %[beta_2].4s\n" + "fadd v16.4s, v16.4s, %[beta_2].4s\n" + "fadd v17.4s, v17.4s, %[beta_2].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_0), q, squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_0].4s\n" + "fadd v13.4s, v13.4s, %[beta_0].4s\n" + "fadd v14.4s, v14.4s, %[beta_0].4s\n" + "fadd v15.4s, v15.4s, %[beta_0].4s\n" + "fadd v16.4s, v16.4s, %[beta_0].4s\n" + "fadd v17.4s, v17.4s, %[beta_0].4s\n" + + // vaddq_f32(div_ps_f32(p, q), + // vdupq_n_f32(sigmoid_constants.one_half)); + "fdiv v12.4s, v6.4s, v12.4s\n" + "fdiv v13.4s, v7.4s, v13.4s\n" + "fdiv v14.4s, v8.4s, v14.4s\n" + "fdiv v15.4s, v9.4s, v15.4s\n" + "fdiv v16.4s, v10.4s, v16.4s\n" + "fdiv v17.4s, v11.4s, v17.4s\n" + "subs %w[x6_iter], %w[x6_iter], #1\n" + "fadd v12.4s, v12.4s, %[one_half].4s\n" + "fadd v13.4s, v13.4s, %[one_half].4s\n" + "fadd v14.4s, v14.4s, %[one_half].4s\n" + "fadd v15.4s, v15.4s, %[one_half].4s\n" + "fadd v16.4s, v16.4s, %[one_half].4s\n" + "fadd v17.4s, v17.4s, %[one_half].4s\n" + + // save it + "str q12, [%[d_ptr]] \n" + "str q13, [%[d_ptr], #16]\n" + "str q14, [%[d_ptr], #32]\n" + "str q15, [%[d_ptr], #48]\n" + "str q16, [%[d_ptr], #64]\n" + "str q17, [%[d_ptr], #80]\n" + "add %[d_ptr], %[d_ptr], #96\n" + + "bne 1b\n" + + "2:\n" + : [a_ptr] "+r"(src), [d_ptr] "+r"(dst), [const_ptr] "+r"(const_ptr), + [x6_iter] "+r"(x6_iter), [lower_range] "=w"(lower_range), + [alpha_9] "=w"(alpha_9), [upper_range] "=w"(upper_range), + [alpha_7] "=w"(alpha_7), [alpha_5] "=w"(alpha_5), + [alpha_3] "=w"(alpha_3), [alpha_1] "=w"(alpha_1), + [beta_10] "=w"(beta_10), [beta_8] "=w"(beta_8), + [beta_6] "=w"(beta_6), [beta_4] "=w"(beta_4), + [beta_2] "=w"(beta_2), [beta_0] "=w"(beta_0), + [one_half] "=w"(one_half) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "x1", "x2", "x8", + "x9", "cc", "memory"); + } + + )"; + writer << "\nfloat32x4_t ones = vdupq_n_f32(1.f);"; + return writer.str(); +} + +std::string ElemwiseGenUnarySigmoid::GenKernelSimdUnroll( + std::vector strs) const { + return m_common_sigmoid_gen->GenKernelSimdUnroll(strs); +} + +std::string ElemwiseGenUnarySigmoid::GenKernelNaiveUnroll( + std::vector strs) const { + return m_common_sigmoid_gen->GenKernelNaiveUnroll(strs); +} + 
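+// The inline assembly emitted above evaluates the same rational polynomial
+// approximation encoded by the sigmoid_constants table in Elemwise.cpp.
+// In scalar form (reference sketch only):
+//   x   = clamp(src, lower_range, upper_range)                 // [-18, 18]
+//   s   = x * x
+//   p   = ((((alpha_9*s + alpha_7)*s + alpha_5)*s + alpha_3)*s + alpha_1) * x
+//   q   = ((((beta_10*s + beta_8)*s + beta_6)*s + beta_4)*s + beta_2)*s + beta_0
+//   dst = p / q + one_half
+// Each assembly iteration processes 24 floats (6 NEON q registers); the tail
+// elements fall back to the ArmCommon sigmoid helpers via
+// GenKernelSimdUnroll / GenKernelNaiveUnroll.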
+// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp b/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp index abaa184a..0c1906f6 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp @@ -10,10 +10,10 @@ #include #include "ConvKernel.h" +#include "Elemwise/Elemwise.h" #include "InternalKernel/InternalKernel.h" #include "KernelPack.h" #include "MatMulKernel/Fp32MatMul.h" - using namespace megcc; using namespace KernelGen; using namespace Arm64; @@ -40,6 +40,9 @@ struct AllA64Kernel { std::make_shared(), std::make_shared(), std::make_shared()}; + + inner_map[KernelPack::KernType::ElemwiseKernel] = { + std::make_shared()}; } std::unordered_map>> diff --git a/compiler/test/kernel/opr/arm/Elementwise.cpp b/compiler/test/kernel/opr/arm/Elementwise.cpp index 55a39b47..8e5e86d1 100644 --- a/compiler/test/kernel/opr/arm/Elementwise.cpp +++ b/compiler/test/kernel/opr/arm/Elementwise.cpp @@ -28,6 +28,19 @@ TEST(AARCH64, ElementwiseUnique) { } } +TEST(AARCH64, ElementwiseUnique_asm) { + Checker checker(Arch::ARM64); + checker.set_kernel_symbol("Arm64_kernel_elementwise.+"); + ElemwiseForward::Param param; + for (auto mode : {MODE::SIGMOID}) { + param.mode = mode; + checker.set_param(param); + checker.execs({{1, 10}, {}}); + checker.execs({{1, 10, 12, 13}, {}}); + checker.execs({{10, 8, 2, 1}, {}}); + } +} + TEST(AARCH64, ElementwiseBinary) { Checker checker(Arch::ARM64); ElemwiseForward::Param param; diff --git a/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp b/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp index 475cea93..1bb8baf6 100644 --- a/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp +++ b/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp @@ -23,4 +23,13 @@ TEST(AARCH64, BenchmarkElemwise) { benchmarker.execs({{1, 3, 160, 160}, {}}).print(); benchmarker.execs({{1, 3, 160, 160}, {}}).print(); } +TEST(AARCH64, BenchmarkElemwise_asm) { + Benchmarker benchmarker(Arch::ARM64); + benchmarker.set_kernel_symbol("Arm64.*"); + ElemwiseForward::Param param; + param.mode = MODE::SIGMOID; + benchmarker.set_param(param); + benchmarker.execs({{1, 3, 160, 160}, {}}).print(); + benchmarker.execs({{1, 3, 160, 160}, {}}).print(); +} #endif From 85f8ee1c916671b644695996e3ad66e4fde8ae32 Mon Sep 17 00:00:00 2001 From: limingxin Date: Thu, 29 Dec 2022 13:43:18 +0800 Subject: [PATCH 08/17] feat(compiler): support to set ENV and loader path using cmdline arg when compile --- .../compiler/Target/MGB/dummy_loader.h | 84 ++++++-- compiler/lib/Target/MGB/importer.cpp | 57 +++++- compiler/lib/Target/TinyNN/exporter.cpp | 22 ++- runtime/example/standard_OS/lite_main.c | 4 +- runtime/src/vm/extern_opr.c | 183 ++++++++++++++---- runtime/version.ld | 2 +- script/ppl_build.sh | 2 +- script/ppl_gen.sh | 18 +- 8 files changed, 294 insertions(+), 78 deletions(-) diff --git a/compiler/include/compiler/Target/MGB/dummy_loader.h b/compiler/include/compiler/Target/MGB/dummy_loader.h index bc809af2..49cce414 100644 --- a/compiler/include/compiler/Target/MGB/dummy_loader.h +++ b/compiler/include/compiler/Target/MGB/dummy_loader.h @@ -8,22 +8,28 @@ #include #include -#include #include #include +#include #include #include "megbrain/serialization/extern_c_opr.h" namespace { -std::map>, std::vector>> - name2outputinfo; -class MGBOprDescImpl { - static std::string loader_name; +struct LoaderInfo { + std::unordered_map>, + std::vector>> + m_name_2_outputinfo; + std::unordered_map m_envs; + std::pair m_loader_path_with_interface; +}; 
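+// Dump-time information gathered for extern oprs:
+//  - m_name_2_outputinfo: per-loader output shapes and dtypes (as ints), used
+//    by the dummy MGBOprDesc to answer infer_shape/infer_dtype;
+//  - m_envs: environment variables the runtime should set before invoking the
+//    real loader;
+//  - m_loader_path_with_interface: loader library path plus its init symbol
+//    (falls back to "mgb_c_opr_init" when no interface is given).
+// The ENV and loader-path parts are packed by make_extra_data() and appended
+// to each extern opr's data blob so the runtime can recover them.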
+static LoaderInfo loaderInfo; +class MGBOprDescImpl { static inline const std::pair>, std::vector>& get_output_info(const std::string& loader_name) { + const auto& name2outputinfo = loaderInfo.m_name_2_outputinfo; auto&& iter = name2outputinfo.find(loader_name); if (iter != name2outputinfo.end()) return iter->second; @@ -99,6 +105,58 @@ class MGBOprDescImpl { class MGBOprLoaderImpl { static std::map user_datas; + // extra_data format: + // total_len + // nr_env + // ENV_len_1:ENV_1:VALUE_len_1:VALUE_1 + // ENV_len_2.... + // loader_path_len:loader_path:interface_len:interface + static std::shared_ptr extra_data; + + static void make_extra_data() { + // calculate len + size_t len = 0; + size_t nr_env = loaderInfo.m_envs.size(); + len += sizeof(nr_env); // nr_env + for (const auto& env : loaderInfo.m_envs) { + size_t env_len = env.first.size(), value_len = env.second.size(); + len += sizeof(env_len) + env_len + sizeof(value_len) + + value_len; // ENV_len_x + ENV_x + VALUE_len_x + VALUE_x + } + len += sizeof(size_t) + + loaderInfo.m_loader_path_with_interface.first.size() + + sizeof(size_t) + + loaderInfo.m_loader_path_with_interface.second + .size(); // loader_path_len + loader_path + + // interface_len + interface + + extra_data = std::shared_ptr(malloc(sizeof(size_t) + len), free); + // fill memory + void* tmp_p = extra_data.get(); + *(size_t*)(tmp_p) = len; + tmp_p += sizeof(size_t); + *(size_t*)tmp_p = nr_env; + tmp_p += sizeof(size_t); + for (const auto& env : loaderInfo.m_envs) { + *(size_t*)tmp_p = env.first.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, env.first.c_str(), env.first.size()); + tmp_p += env.first.size(); + *(size_t*)tmp_p = env.second.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, env.second.c_str(), env.second.size()); + tmp_p += env.second.size(); + } + *(size_t*)tmp_p = loaderInfo.m_loader_path_with_interface.first.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, loaderInfo.m_loader_path_with_interface.first.c_str(), + loaderInfo.m_loader_path_with_interface.first.size()); + tmp_p += loaderInfo.m_loader_path_with_interface.first.size(); + *(size_t*)tmp_p = loaderInfo.m_loader_path_with_interface.second.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, loaderInfo.m_loader_path_with_interface.second.c_str(), + loaderInfo.m_loader_path_with_interface.second.size()); + } static MGBOprDesc* create_desc(size_t nr_input, const void* buf, size_t buf_len) { @@ -116,16 +174,16 @@ class MGBOprLoaderImpl { public: static std::map& get_user_datas() { return user_datas; } - static MGBOprLoader make() { return {"extern_opr_dummy", &create_desc}; } + static void* get_extra_data() { return extra_data.get(); } + static MGBOprLoader make() { + make_extra_data(); + return {"extern_opr_dummy", &create_desc}; + } }; std::map MGBOprLoaderImpl::user_datas = {}; +std::shared_ptr MGBOprLoaderImpl::extra_data = {}; -void mgb_c_opr_init_output_info( - const MGBExternCOprApi* (*get_api)(int), - const std::map>, - std::vector>>& output_info) { - name2outputinfo = std::move(output_info); +static void dummy_mgb_c_opr_init(const MGBExternCOprApi* (*get_api)(int)) { const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION); assert(api); MGBOprLoader loader = MGBOprLoaderImpl::make(); diff --git a/compiler/lib/Target/MGB/importer.cpp b/compiler/lib/Target/MGB/importer.cpp index e8867f0b..b7e19f13 100644 --- a/compiler/lib/Target/MGB/importer.cpp +++ b/compiler/lib/Target/MGB/importer.cpp @@ -61,6 +61,17 @@ llvm::cl::opt ExternOprOutputDType( "The available values are float32, 
int32, uint8, float16, " "int16. e.g., \"float32;int32;uint8:float16;int16\". Default " "value is float32.")); +llvm::cl::opt ExternOprLoaderPathWithInterface( + "loader-path-with-interface", llvm::cl::Optional, + llvm::cl::desc("specific extern opr loader path with interface. If " + "\"interface\" " + "is not provided, using \"mgb_c_opr_init\" default."), + llvm::cl::value_desc("loader_path:interface")); +llvm::cl::opt ExternOprLoaderEnv( + "set-extern-opr-env", llvm::cl::Optional, + llvm::cl::desc("set ENV for all extern opr loader, must surrounded by " + "\" if set multiple ENV."), + llvm::cl::value_desc("\"ENV_1=VALUE_1;ENV_2=VALUE_2...\"")); using namespace mgb; using namespace llvm; @@ -172,10 +183,8 @@ inline std::vector split(std::string str, return res; } -inline void parse_extern_output_info() { - std::map>, - std::vector>> - name2outputinfo; +inline void parse_extern_loader_info() { + auto& name2outputinfo = loaderInfo.m_name_2_outputinfo; std::string extern_opr_output_shapes = ExternOprOutputShape; if (extern_opr_output_shapes.size()) { @@ -290,9 +299,34 @@ inline void parse_extern_output_info() { } } } + } + + // parse ENV + std::string env = ExternOprLoaderEnv; + if (env.size()) { + auto&& env_values = split(env, ";"); + for (auto&& env_value : env_values) { + auto&& env_value_vec = split(env_value, "="); + CC_ASSERT((env_value_vec.size() == 2)) + << "Wrong format. Set ENV using \"ENV=VALUE\""; + loaderInfo.m_envs[env_value_vec[0]] = env_value_vec[1]; + } + } - mgb_c_opr_init_output_info(mgb_get_extern_c_opr_api_versioned, - name2outputinfo); + // parse loader path and interface + std::string loaderPathWithInterface = ExternOprLoaderPathWithInterface; + if (loaderPathWithInterface.size()) { + auto&& loaderPath_interface = split(loaderPathWithInterface, ":"); + CC_ASSERT((loaderPath_interface.size() <= 2)) + << "Wrong format. 
Specify loader path and interface using " + "loader_path[:interface]"; + loaderInfo.m_loader_path_with_interface.first = loaderPath_interface[0]; + if (loaderPath_interface.size() == 1 || loaderPath_interface[1] == "") { + loaderInfo.m_loader_path_with_interface.second = "mgb_c_opr_init"; + } else { + loaderInfo.m_loader_path_with_interface.second = + loaderPath_interface[1]; + } } } @@ -332,7 +366,8 @@ class Importer { m_loader = serialization::GraphLoader::make(std::move(inp_file), format.val()); - parse_extern_output_info(); + parse_extern_loader_info(); + dummy_mgb_c_opr_init(mgb_get_extern_c_opr_api_versioned); LOG_DEBUG << "Process mgb graph\n"; process_graph(options); @@ -977,6 +1012,7 @@ class Importer { } else if (auto extern_opr = opr->try_cast_final()) { auto user_datas = MGBOprLoaderImpl::get_user_datas(); + void* extra_data = MGBOprLoaderImpl::get_extra_data(); void* _data = nullptr; if (user_datas.find(opr->name()) != user_datas.end()) { @@ -986,6 +1022,11 @@ class Importer { std::string data( reinterpret_cast(_data + sizeof(size_t)), *(size_t*)(_data)); + uint32_t data_len = static_cast(data.size()); + if (extra_data) + data += std::string(reinterpret_cast( + extra_data + sizeof(size_t)), + *(size_t*)(extra_data)); free(_data); std::vector v_resultTypes(opr->output().size()); @@ -999,7 +1040,7 @@ class Importer { auto values = m_builder.create( m_builder.getUnknownLoc(), v_resultTypes, var_array_to_value_array(opr->input()), opr->name(), data, - static_cast(data.size()), nr_input, nr_output); + data_len, nr_input, nr_output); for (int i = 0; i < opr->output().size(); ++i) { m_var2value.emplace(opr->output(i), values.getResult(i)); } diff --git a/compiler/lib/Target/TinyNN/exporter.cpp b/compiler/lib/Target/TinyNN/exporter.cpp index 41dca537..5cad1962 100644 --- a/compiler/lib/Target/TinyNN/exporter.cpp +++ b/compiler/lib/Target/TinyNN/exporter.cpp @@ -312,7 +312,7 @@ class Exporter { std::string name(op.name().data(), op.name().size()); std::string data(op.data().data(), op.data().size()); - uint32_t data_len = data.size(); + uint32_t data_len = op.data_len(); LOG_DEBUG << "Add ExternOpr instruction.\n"; instructions_type.push_back( @@ -462,7 +462,7 @@ class Exporter { output_tensor = tensor.second; auto descs = llvm::to_vector<4>( op.descs().getAsRange()); - auto flags= llvm::to_vector<4>( + auto flags = llvm::to_vector<4>( op.flags().getAsRange()); std::vector> descs_; std::vector> flags_; @@ -478,7 +478,8 @@ class Exporter { auto descs_fbs = m_fbs_builder.CreateVector(descs_); auto flags_fbs = m_fbs_builder.CreateVector(flags_); - MegCC::SubTensorBuilder subtensor_builder(m_fbs_builder); + MegCC::SubTensorBuilder subtensor_builder( + m_fbs_builder); subtensor_builder.add_inputs(input_tensors_); subtensor_builder.add_input_types(input_types_); subtensor_builder.add_output(output_tensor); @@ -486,8 +487,10 @@ class Exporter { subtensor_builder.add_flags(flags_fbs); LOG_DEBUG << "Add subtensor instruction.\n"; - instructions_type.push_back(MegCC::Instruction_SubTensor); - instructions.push_back(subtensor_builder.Finish().Union()); + instructions_type.push_back( + MegCC::Instruction_SubTensor); + instructions.push_back( + subtensor_builder.Finish().Union()); }) .Case([&](Kernel::SetSubtensorIns op) { kernel_exporter.addInst("SETSUBTENSOR"); @@ -593,7 +596,8 @@ class Exporter { auto&& out_tensor = value2typed_tensor.at( op.result().getAsOpaquePointer()); LOG_DEBUG << "Add Broadcast instruction.\n"; - instructions_type.push_back(MegCC::Instruction_BroadCast); + 
instructions_type.push_back( + MegCC::Instruction_BroadCast); instructions.push_back( MegCC::CreateBroadCast( m_fbs_builder, input_tensors_, @@ -751,7 +755,7 @@ class Exporter { auto&& out_tensor = value2typed_tensor.at( op.result().getAsOpaquePointer()); - auto mat_id= op.mat_idx(); + auto mat_id = op.mat_idx(); auto member = llvm::to_vector<4>( mat_id.getAsRange()); std::vector mat_id_v; @@ -1008,8 +1012,8 @@ class Exporter { m_fbs_builder.CreateString(name)); } - Offset indexdesc_to_fbs(ArrayAttr desc){ - CC_ASSERT(desc.size()==5); + Offset indexdesc_to_fbs(ArrayAttr desc) { + CC_ASSERT(desc.size() == 5); auto member = llvm::to_vector<5>(desc.getAsRange()); return MegCC::CreateIndexDesc(m_fbs_builder, member[0].getInt(), member[1].getInt(), member[2].getInt(), diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index ce7fe294..ff469b35 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -196,6 +196,8 @@ static void* dlsym(void* handle, const char* name) { #include #endif +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int version); + int main(int argc, char** argv) { LITE_set_log_level(WARN); #if TINYNN_CALLBACK_ENABLE @@ -279,7 +281,7 @@ int main(int argc, char** argv) { void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; *(void**)&func = dlsym(handle, c_opr_lib_interface); EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); - func(mgb_get_extern_c_opr_api_versioned); + func(megcc_get_extern_c_opr_api_versioned); } LiteNetwork model; diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c index e39ca310..4af4aec0 100644 --- a/runtime/src/vm/extern_opr.c +++ b/runtime/src/vm/extern_opr.c @@ -6,9 +6,9 @@ * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ -#include "vm.h" -#include "utils.h" #include "extern_c_opr.h" +#include "utils.h" +#include "vm.h" #if ENABLE_INST_EXTERN_OPR @@ -97,10 +97,11 @@ static void free_loader_maps(LoaderMapVec* lm) { } //! get API ptr for specific version; return nullptr if version mismatch -const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int version) { static MGBExternCOprApi api; api.unregister_loader = unregister_loader; - TINYNN_ASSERT_MSG(version >= 0x24, "Extern opr loader version must greater than 0x24.\n"); + TINYNN_ASSERT_MSG(version >= 0x24, + "Extern opr loader version must greater than 0x24.\n"); if (version != MGB_EXTERN_C_OPR_VERSION) { return NULL; @@ -111,12 +112,12 @@ const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { } // Convert Tensor to MGBTensor, except MGBTensor.data. 
-static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ +static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor) { mgb_tensor->layout.shape.ndim = tensor->layout.nr_dim; - for(int i = 0; i < tensor->layout.nr_dim; ++i){ + for (int i = 0; i < tensor->layout.nr_dim; ++i) { mgb_tensor->layout.shape.shape[i] = tensor->layout.dims[i]; } - switch(tensor->dtype.type_enum){ + switch (tensor->dtype.type_enum) { case TinyNN_FLOAT: mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT32; break; @@ -137,13 +138,13 @@ static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ } } -static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ +static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor) { tensor->layout.nr_dim = mgb_tensor->layout.shape.ndim; - for(int i = 0; i < mgb_tensor->layout.shape.ndim; ++i){ + for (int i = 0; i < mgb_tensor->layout.shape.ndim; ++i) { tensor->layout.dims[i] = mgb_tensor->layout.shape.shape[i]; } - switch(mgb_tensor->layout.dtype){ + switch (mgb_tensor->layout.dtype) { case MGB_DTYPE_FLOAT32: tensor->dtype.type_enum = TinyNN_FLOAT; break; @@ -164,6 +165,26 @@ static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ } } +#if defined(_WIN32) +#include +#include +#define RTLD_LAZY 0 + +static void* dlopen(const char* file, int) { + return (void*)(LoadLibrary(file)); +} + +static void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return (void*)symbol; +} + +#else +#include +#endif + +static int has_set_env_and_loader = 0; + static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, VM* vm) { ExternOpr* extern_opr = &inst->workload.extern_opr; @@ -174,53 +195,140 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, flatbuffers_int32_vec_t fbs_inputs = ns(ExternOpr_input(fbs_extern_opr)); extern_opr->nr_input = flatbuffers_int32_vec_len(fbs_inputs); extern_opr->inputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_input); - for(int i = 0; i < extern_opr->nr_input; ++i){ + for (int i = 0; i < extern_opr->nr_input; ++i) { extern_opr->inputs[i] = model->tensors + fbs_inputs[i]; } flatbuffers_int32_vec_t fbs_outputs = ns(ExternOpr_output(fbs_extern_opr)); extern_opr->nr_output = flatbuffers_int32_vec_len(fbs_outputs); - extern_opr->outputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->outputs = + tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->outputs[i] = model->tensors + fbs_outputs[i]; } - const char* name = ns(ExternOpr_name(fbs_extern_opr)); + char* name = ns(ExternOpr_name(fbs_extern_opr)); const void* data = ns(ExternOpr_data(fbs_extern_opr)); size_t data_len = ns(ExternOpr_data_len(fbs_extern_opr)); + int idx = 0; + while (name[idx] != '\0' && name[idx] != ':') + ++idx; + name[idx] = '\0'; + + if (!has_set_env_and_loader) { + const void* extra_data = data + data_len; + // parse and set ENV + size_t nr_env = *(size_t*)extra_data; + extra_data += sizeof(size_t); + for (int i = 0; i < nr_env; ++i) { + size_t env_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* env = (char*)tinynn_malloc(env_len + 1); + memcpy(env, extra_data, env_len); + env[env_len] = '\0'; + extra_data += env_len; + + size_t value_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* value = (char*)tinynn_malloc(value_len + 1); + 
memcpy(value, extra_data, value_len); + value[value_len] = '\0'; + extra_data += value_len; + + TINYNN_ASSERT_MSG((!setenv(env, value, 1)), + "setenv failed.\n"); // 1 means overwrite when + // 'env' does exist. + LOG_DEBUG("Set ENV: %s=%s\n", env, value); + + tinynn_free(env); + tinynn_free(value); + } + + // load loader + size_t loader_path_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + if (loader_path_len) { + char* loader_path = tinynn_malloc(loader_path_len + 1); + memcpy(loader_path, extra_data, loader_path_len); + extra_data += loader_path_len; + loader_path[loader_path_len] = '\0'; + LOG_DEBUG("Try to load loader in path %s.\n", loader_path); + void* handle = dlopen(loader_path, RTLD_LAZY); + // if dlopen failed, but loader path is NOT absolute path. + if (!handle && loader_path[0] != '/') { + // try current path + char* extend_loader_path = tinynn_malloc(loader_path_len + 3); + extend_loader_path[0] = '.'; + extend_loader_path[1] = '/'; + memcpy(extend_loader_path + 2, loader_path, + loader_path_len + 1); + LOG_DEBUG( + "Load loader in path %s failed. Now try to load loader " + "in path %s.\n", + loader_path, extend_loader_path); + handle = dlopen(extend_loader_path, RTLD_LAZY); + tinynn_free(extend_loader_path); + } + tinynn_free(loader_path); + TINYNN_ASSERT_MSG(handle, + "Load loader failed. Can NOT find loader file in " + "given path.\n"); + + size_t interface_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* c_opr_lib_interface = tinynn_malloc(interface_len + 1); + memcpy(c_opr_lib_interface, extra_data, interface_len); + c_opr_lib_interface[interface_len] = '\0'; + void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; + *(void**)&func = dlsym(handle, c_opr_lib_interface); + tinynn_free(c_opr_lib_interface); + TINYNN_ASSERT_MSG(func, "load init interface of loader failed.\n"); + func(megcc_get_extern_c_opr_api_versioned); + } + has_set_env_and_loader = 1; + } LoaderMap* loader_map = find_loader_by_name(&loader_maps, name); TINYNN_ASSERT_MSG(loader_map, "Wrong loader.\n"); extern_opr->desc = loader_map->loader.create_desc(extern_opr->nr_input, - data, data_len); - - extern_opr->mgb_inputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); - MGBTensorShape* inputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); - MGBDType* inputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); - for(int i = 0; i < extern_opr->nr_input; ++i){ + data, data_len); + + extern_opr->mgb_inputs = + tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); + MGBTensorShape* inputs_shape = + tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); + MGBDType* inputs_type = + tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); + for (int i = 0; i < extern_opr->nr_input; ++i) { Tensor2MGBTensor(extern_opr->inputs[i], extern_opr->mgb_inputs + i); inputs_shape[i] = extern_opr->mgb_inputs[i].layout.shape; inputs_type[i] = extern_opr->mgb_inputs[i].layout.dtype; } - extern_opr->mgb_outputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); - MGBTensorShape* outputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); - MGBDType* outputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); - - extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, outputs_shape); - if(extern_opr->desc->infer_dtype){ - extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, outputs_type); - }else{ - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs = + 
tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); + MGBTensorShape* outputs_shape = + tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); + MGBDType* outputs_type = + tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); + + extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, + outputs_shape); + if (extern_opr->desc->infer_dtype) { + extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, + outputs_type); + } else { + for (int i = 0; i < extern_opr->nr_output; ++i) { outputs_type[i] = inputs_type[0]; } } - for(int i = 0; i < extern_opr->nr_output; ++i){ + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->mgb_outputs[i].layout.dtype = outputs_type[i]; extern_opr->mgb_outputs[i].layout.shape.ndim = outputs_shape[i].ndim; - for(int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j){ - extern_opr->mgb_outputs[i].layout.shape.shape[j] = outputs_shape[i].shape[j]; + for (int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j) { + extern_opr->mgb_outputs[i].layout.shape.shape[j] = + outputs_shape[i].shape[j]; } } @@ -236,14 +344,15 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, static TinyNNStatus execute(Instruction* inst, VM* vm) { ExternOpr* extern_opr = &inst->workload.extern_opr; - for(int i = 0; i < extern_opr->nr_input; ++i){ + for (int i = 0; i < extern_opr->nr_input; ++i) { extern_opr->mgb_inputs[i].data = extern_opr->inputs[i]->ptr; } - for(int i = 0; i < extern_opr->nr_output; ++i){ + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->mgb_outputs[i].data = extern_opr->outputs[i]->ptr; } - extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, extern_opr->mgb_outputs); - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, + extern_opr->mgb_outputs); + for (int i = 0; i < extern_opr->nr_output; ++i) { MGBTensor2Tensor(extern_opr->mgb_outputs + i, extern_opr->outputs[i]); } @@ -271,7 +380,7 @@ void register_extern_opr(VM* vm) { #else void register_extern_opr(VM* vm) {} -const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int i) { +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int i) { TINYNN_ASSERT_MSG( 0, "Should NOT execute here!!!\n" diff --git a/runtime/version.ld b/runtime/version.ld index 76275193..bddbdb30 100644 --- a/runtime/version.ld +++ b/runtime/version.ld @@ -4,7 +4,7 @@ global: default_config; default_network_io; register_tinynn_cb; - mgb_get_extern_c_opr_api_versioned; + megcc_get_extern_c_opr_api_versioned; local: diff --git a/script/ppl_build.sh b/script/ppl_build.sh index c6d6f946..815191aa 100755 --- a/script/ppl_build.sh +++ b/script/ppl_build.sh @@ -2,4 +2,4 @@ set -ex PROJECT_PATH="$(dirname $(readlink -f $0))/" KERNEL_DIR="${PROJECT_PATH}/kern/" -${PROJECT_PATH}/script/runtime_build.py --cross_build --kernel_dir ${KERNEL_DIR} --remove_old_build --specify_build_dir ${PROJECT_PATH}/build $@ +${PROJECT_PATH}/runtime/script/runtime_build.py --cross_build --kernel_dir ${KERNEL_DIR} --remove_old_build --specify_build_dir ${PROJECT_PATH}/build $@ diff --git a/script/ppl_gen.sh b/script/ppl_gen.sh index e0661202..dee56295 100755 --- a/script/ppl_gen.sh +++ b/script/ppl_gen.sh @@ -16,23 +16,25 @@ RUNTIME_PATH=${PROJECT_PATH}/runtime mkdir -p ${OUT_DIR} KERN_DIR="${OUT_DIR}/kern/" rm -fr ${OUT_DIR}/* +mkdir -p "${OUT_DIR}/runtime" mkdir -p "${OUT_DIR}/model" mkdir -p "${OUT_DIR}/model_info" -mkdir -p "${OUT_DIR}/script" +mkdir -p "${OUT_DIR}/runtime/script" mkdir 
-p "${KERN_DIR}" ${DUMP_APP} --json="${JSON_PATH}" "${ARCH_SPECIFIC}" --dump="${KERN_DIR}" ${EXTRA_DUMP_CMD} -cp -r "${RUNTIME_PATH}/flatcc" "${OUT_DIR}/flatcc" -cp -r "${RUNTIME_PATH}/include" "${OUT_DIR}/include" -cp -r "${RUNTIME_PATH}/schema" "${OUT_DIR}/schema" -cp -r "${RUNTIME_PATH}/example" "${OUT_DIR}/example" -cp -r "${RUNTIME_PATH}/src" "${OUT_DIR}/src" -cp "${RUNTIME_PATH}/CMakeLists.txt" "${OUT_DIR}/CMakeLists.txt" +cp -r "${RUNTIME_PATH}/flatcc" "${OUT_DIR}/runtime/flatcc" +cp -r "${RUNTIME_PATH}/include" "${OUT_DIR}/runtime/include" +cp -r "${RUNTIME_PATH}/schema" "${OUT_DIR}/runtime/schema" +cp -r "${RUNTIME_PATH}/example" "${OUT_DIR}/runtime/example" +cp -r "${RUNTIME_PATH}/src" "${OUT_DIR}/runtime/src" +cp -r "${PROJECT_PATH}/immigration" "${OUT_DIR}/immigration" +cp "${RUNTIME_PATH}/CMakeLists.txt" "${OUT_DIR}/runtime/CMakeLists.txt" MODEL_FILE=`find ${OUT_DIR}/kern/ -name "*.tiny"` if [ ! -z "${MODEL_FILE}" ];then mv ${OUT_DIR}/kern/*.tiny "${OUT_DIR}/model" mv ${OUT_DIR}/kern/*.tiny.txt "${OUT_DIR}/model_info" fi cp -a "${PROJECT_PATH}"/script/{ppl_build.sh,test_model.py} "${OUT_DIR}/" -cp "${PROJECT_PATH}/runtime/scripts/runtime_build.py" "${OUT_DIR}/script/" +cp "${RUNTIME_PATH}/scripts/runtime_build.py" "${OUT_DIR}/runtime/script/" cp "${JSON_PATH}" "${OUT_DIR}/" tar -czf megcc_ppl_gen.tar.gz "${OUT_DIR}" From 2647e9b69c0a5a939210984286e3868b9cce269b Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Fri, 6 Jan 2023 16:42:06 +0800 Subject: [PATCH 09/17] feat(compiler): support qsi8qsi8qsi32qsi32 for Int8DotConv1x1Mk4M8N12 --- .../lib/KernelGen/Arm/Arm64/Activation.cpp | 2 +- .../Int8/Int8DotConv1x1Mk4M8N12.cpp | 11 ++++++++-- .../InternalKernel/Int8DotM8N12MK4GEMM.cpp | 19 +++++++++++----- .../Arm/ArmCommon/InternalKernel.cpp | 22 +++++++++++++++++-- .../lib/KernelGen/Arm/ArmCommon/Typecvt.cpp | 1 + compiler/lib/KernelGen/Common/ConvKernel.h | 10 +++++++-- compiler/lib/KernelGen/Utils/Utils.h | 3 ++- runtime/src/vm/registry.h | 2 ++ 8 files changed, 57 insertions(+), 13 deletions(-) diff --git a/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp b/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp index 3ef6a2be..6f07e8a4 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp @@ -191,7 +191,7 @@ std::string ActivationGenAsmBase::GenAsmQuantStore( } ss << gener.render(temp_ss.str()); } else { - CC_ASSERT(dst_specifier == "int32_t"); + CC_ASSERT(dst_specifier == "int32_t" || dst_specifier == "int"); if (!with_store) { return ""; } diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp index 04d25398..0e055ab1 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp @@ -40,7 +40,7 @@ bool Conv1x1DotMk4::IsAvailable(TContext* ctx) const { ctx->getAttrStr("nonlineMode") == "RELU" || ctx->getAttrStr("nonlineMode") == "H_SWISH"; - bool type_ok = is_qint8_conv_dtype(ctx); + bool type_ok = is_qint8_conv_dtype(ctx, true); bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && ctx->getAttrOprand("operand:0").shape[4] == 4; @@ -186,6 +186,9 @@ std::shared_ptr Conv1x1DotMk4::GetInnerCtx(TContext* ctx) const { inner_ctx->setAttr("transposeB", false); inner_ctx->setAttr("format", "MK4_DOT"); inner_ctx->setAttr("dtype", ctx->getAttrOprand("operand:0").dtype); + auto last_dtype = 
Utils::get_last_operand(ctx).dtype; + auto last_dtype_str = SymbolHelper::gen_valid_dtype(last_dtype); + inner_ctx->setAttr("last_dtype", last_dtype_str); return inner_ctx; } @@ -202,6 +205,9 @@ std::string Conv1x1DotMk4::GetKernelBody(TContext* ctx) const { gen_temp_dst = "void* temp_dst = (int8_t*) workspace_ptr + pack_b_align;"; } + auto last_dtype = Utils::get_last_operand(ctx).dtype; + auto last_dtype_str = SymbolHelper::gen_valid_dtype(last_dtype); + std::string dst_specifier = Utils::cvt_dtype_specifier(last_dtype_str); writer << StringTemplate::StringTemplateArgs() .add("bias_ptr_str", bias_ptr_str) .add("packb_size_sym", @@ -212,9 +218,10 @@ std::string Conv1x1DotMk4::GetKernelBody(TContext* ctx) const { .add("naked_kern_sym", m_inner_gemm.GetNakedKernelSymbol(inner_ctx.get())) .add("gen_temp_dst", gen_temp_dst) + .add("dst_specifier", dst_specifier) .render(R"({ int8_t* input_data = inputs[0]->ptr; - int8_t* output_data = outputs[0]->ptr; + ${dst_specifier}* output_data = outputs[0]->ptr; Layout in_layout = inputs[0]->layout; Layout out_layout = outputs[0]->layout; diff --git a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp index 7517b0bf..aed24e9f 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp @@ -9,11 +9,11 @@ #include "Arm/Arm64/Activation.h" #include "Arm/ArmCommon/MatmulCommon.h" +#include "Arm/ArmCommon/common_asm_utils.h" #include "InternalKernel.h" #include "Utils/StringTemplate.h" #include "Utils/Utils.h" #include "compiler/Common/Logger.h" -#include "Arm/ArmCommon/common_asm_utils.h" using namespace megcc; using namespace KernelGen; using namespace Arm64; @@ -51,10 +51,10 @@ std::string interleave_1x4_4_b() { } std::string prefetch() { - return R"( + return R"( #define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" - )" + KernelGen::ArmCommon::gen_common_prefetch_2x_f32() - + KernelGen::ArmCommon::gen_common_prefetch_3x_f32(); + )" + KernelGen::ArmCommon::gen_common_prefetch_2x_f32() + + KernelGen::ArmCommon::gen_common_prefetch_3x_f32(); } std::string transpose_1x12() { @@ -1500,6 +1500,11 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelSymbol(TContext* ctx) const { CC_ASSERT(dtype == "8832"); ss << "_" << dtype; } + if (ctx->haveAttr("last_dtype")) { + auto last_dtype = ctx->getAttrStr("last_dtype"); + ss << "_" + << "output_dtype_" << last_dtype; + } return ss.str(); } @@ -1533,6 +1538,10 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelBody(TContext* ctx) const { writer << prefetch(); writer << transpose_1x12(); auto dtype = ctx->getAttrStr("dtype"); + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } std::string dst_specifier = "int32_t"; auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") @@ -1540,7 +1549,7 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelBody(TContext* ctx) const { if (Utils::is_quant_dtype(dtype) && (nonline_mode == "RELU" || nonline_mode == "IDENTITY" || nonline_mode == "H_SWISH")) { - dst_specifier = "int8_t"; + dst_specifier = Utils::cvt_dtype_specifier(last_dtype); } //! 
sigmoid use explicit postprocess bool need_temp_dst = need_post_process(ctx); diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp index ea57df76..66df056c 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp @@ -25,8 +25,17 @@ std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { return R"((const float* pack_a, const float* pack_b, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr))"; } else if (Utils::is_quant_dtype(dtype, 8)) { - return R"((const int8_t* pack_a, const int8_t* pack_b, int8_t* C, + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } + if (Utils::is_int_dtype(last_dtype, 32)) { + return R"((const int8_t* pack_a, const int8_t* pack_b, int* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } else { + return R"((const int8_t* pack_a, const int8_t* pack_b, int8_t* C, + size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } } else if (dtype == "8832") { return R"((const int8_t* pack_a, const int8_t* pack_b, int32_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, float scale))"; @@ -42,8 +51,17 @@ std::string MatmulInternal::GenKernelCall(TContext* ctx) { return R"((const float* A, size_t LDA, const float* B, size_t LDB, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr, void* workspace))"; } else if (Utils::is_quant_dtype(dtype, 8)) { - return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int8_t* C, + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } + if (Utils::is_int_dtype(last_dtype, 32)) { + return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int* C, + size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } else { + return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int8_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } } else if (dtype == "8832") { return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int32_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale))"; diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp index 4abcb519..af9ee0d4 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp @@ -195,6 +195,7 @@ std::string TypecvtKernel::GetKernelBody(TContext* context) const { std::string dst_specifier = Utils::cvt_dtype_specifier(dst_dtype_str); ss << R"( #include + #include )"; ss << gen_neon_intrin_compat(); ss << init_declare(src_dtype_str, dst_dtype_str); diff --git a/compiler/lib/KernelGen/Common/ConvKernel.h b/compiler/lib/KernelGen/Common/ConvKernel.h index 1c946403..6c25b265 100644 --- a/compiler/lib/KernelGen/Common/ConvKernel.h +++ b/compiler/lib/KernelGen/Common/ConvKernel.h @@ -26,14 +26,20 @@ class ConvImpl : public KernelFunc { } std::string GetKernelSymbol(TContext* context) const override; - 
static bool is_qint8_conv_dtype(TContext* ctx) { + static bool is_qint8_conv_dtype(TContext* ctx, + bool is_dst_support_si32 = false) { bool type_ok = ctx->getAttrInt("nr_operands") >= 3; auto dst_dtype = Utils::get_last_operand(ctx).dtype; type_ok = type_ok && Utils::is_quant_dtype( ctx->getAttrOprand("operand:0").dtype, 8); type_ok = type_ok && Utils::is_quant_dtype( ctx->getAttrOprand("operand:1").dtype, 8); - type_ok = type_ok && Utils::is_quant_dtype(dst_dtype, 8); + if (is_dst_support_si32) { + type_ok = type_ok && (Utils::is_quant_dtype(dst_dtype, 8) || + Utils::is_quant_dtype(dst_dtype, 32)); + } else { + type_ok = type_ok && Utils::is_quant_dtype(dst_dtype, 8); + } if (is_bias(ctx)) { type_ok = type_ok && Utils::is_quant_dtype( diff --git a/compiler/lib/KernelGen/Utils/Utils.h b/compiler/lib/KernelGen/Utils/Utils.h index 11f4ae0b..a46b86b7 100644 --- a/compiler/lib/KernelGen/Utils/Utils.h +++ b/compiler/lib/KernelGen/Utils/Utils.h @@ -63,7 +63,8 @@ static inline bool is_float_dtype(const std::string& dtype, static inline bool is_int_dtype(const std::string& dtype, int bit_width = -1) { if (bit_width == 8 && (dtype == "i8" || dtype == "si8" || dtype == "ui8")) { return true; - } else if (bit_width == 32 && (dtype == "i32" || dtype == "si32")) { + } else if (bit_width == 32 && + (dtype == "i32" || dtype == "si32" || dtype == "qsi32")) { return true; } else if (bit_width == 16 && (dtype == "i16" || dtype == "ui16")) { return true; diff --git a/runtime/src/vm/registry.h b/runtime/src/vm/registry.h index 83b11721..5e48e78d 100644 --- a/runtime/src/vm/registry.h +++ b/runtime/src/vm/registry.h @@ -29,6 +29,8 @@ void register_broadcast_shape_of(VM* vm); void register_reshape(VM* vm); +void register_extern_opr(VM* vm); + #endif // VM_REGISTRY_H // vim: syntax=cpp.doxygen From 5a265083f08dcbc80ab6bd25b42722161e78b404 Mon Sep 17 00:00:00 2001 From: limingxin Date: Fri, 6 Jan 2023 19:18:55 +0800 Subject: [PATCH 10/17] ci: add extern c opr loader test --- compiler/include/compiler/Target/MGB/dummy_loader.h | 8 ++++---- runtime/src/vm/extern_opr.c | 6 ++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/compiler/include/compiler/Target/MGB/dummy_loader.h b/compiler/include/compiler/Target/MGB/dummy_loader.h index 49cce414..842da8bd 100644 --- a/compiler/include/compiler/Target/MGB/dummy_loader.h +++ b/compiler/include/compiler/Target/MGB/dummy_loader.h @@ -106,11 +106,11 @@ class MGBOprDescImpl { class MGBOprLoaderImpl { static std::map user_datas; // extra_data format: - // total_len - // nr_env - // ENV_len_1:ENV_1:VALUE_len_1:VALUE_1 + // total_len(size_t) + // nr_env(size_t) + // ENV_len_1(size_t):ENV_1(char[ENV_len_1]):VALUE_len_1(size_t):VALUE_1(char[VALUE_len_1]) // ENV_len_2.... - // loader_path_len:loader_path:interface_len:interface + // loader_path_len(size_t):loader_path(char[loader_path_len]):interface_len(size_t):interface(char[interface_len]) static std::shared_ptr extra_data; static void make_extra_data() { diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c index 4af4aec0..12d7dbdd 100644 --- a/runtime/src/vm/extern_opr.c +++ b/runtime/src/vm/extern_opr.c @@ -217,6 +217,12 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, if (!has_set_env_and_loader) { const void* extra_data = data + data_len; + // extra_data format: + // nr_env(size_t) + // ENV_len_1(size_t):ENV_1(char[ENV_len_1]):VALUE_len_1(size_t):VALUE_1(char[VALUE_len_1]) + // ENV_len_2.... 
+ // loader_path_len(size_t):loader_path(char[loader_path_len]):interface_len(size_t):interface(char[interface_len]) + // parse and set ENV size_t nr_env = *(size_t*)extra_data; extra_data += sizeof(size_t); From 8c4ef921906f880ce35cc4e5e11d20f1769612f2 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 10:39:52 +0800 Subject: [PATCH 11/17] feat(compiler): basic change for more winograd algo --- compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h | 5 +- .../lib/KernelGen/BareMetal/ConvKernel.cpp | 53 ++++--- .../GeneralIntrinsic/ConvKernel/ConvKernel.h | 37 +++++ .../ConvKernel/Fp32Conv1x1Mk4M4N12.cpp | 50 +++--- .../ConvKernel/Fp32ConvNchwNchw44.cpp | 32 ++-- .../ConvKernel/Fp32Im2col.cpp | 53 ++++--- .../ConvKernel/Fp32WinogradNchw44.cpp | 149 ++++++++++++++++++ .../ConvKernel/Winograd/WinogradCommon.cpp | 18 ++- .../ConvKernel/Winograd/WinogradCommon.h | 2 +- .../Winograd/WinogradF23Strategy4x8MK4.cpp | 2 +- .../Winograd/WinogradF23Strategy4x8MK4.h | 2 +- .../InternalKernel/Fp32M4N12K4Matmul.cpp | 18 ++- .../InternalKernel/InternalKernel.cpp | 8 +- .../InternalKernel/InternalKernel.h | 4 +- .../KernelGen/GeneralIntrinsic/KernelPack.cpp | 8 +- compiler/lib/KernelGen/KernelGen.cpp | 74 +++++++-- immigration/include/marm_neon.h | 2 +- 17 files changed, 394 insertions(+), 123 deletions(-) diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h index ba959e35..dee5bd6f 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h @@ -9,9 +9,9 @@ #pragma once #include #include +#include "Arm/ArmCommon/ConvKernel/Fp32/Winograd/WinogradCommon.h" #include "Common/ConvKernel.h" #include "ConvKernel/Fp32/Winograd/WinogradF23Strategy4x16MK4.h" -#include "Arm/ArmCommon/ConvKernel/Fp32/Winograd/WinogradCommon.h" #include "InternalKernel/InternalKernel.h" #include "Utils/StringTemplate.h" #include "Utils/SymbolHelper.h" @@ -138,6 +138,7 @@ class ChannelWiseInt8Mk4K3 : public Arm64ConvImpl { class WinogradFloatF23Nchw44 : public Arm64ConvImpl { mutable ArmCommon::WinogradFrameNchw44 m_framework; mutable WinogradF23Strategy4x16MK4 m_winograd_strategy; + public: bool IsAvailable(TContext* context) const override; //! 
kernel gen @@ -154,6 +155,6 @@ class WinogradFloatF23Nchw44 : public Arm64ConvImpl { } // namespace Arm64 } // namespace KernelGen -} // namespace megcc +} // namespace megcc // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp b/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp index f7e2016f..fd9f3310 100644 --- a/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp +++ b/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp @@ -129,31 +129,36 @@ std::string get_dst_foramt(const std::string& filter_format) { } // namespace std::string ConvImpl::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - extra_ss << "_" << SymbolHelper::gen_io_str(ctx); - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + extra_ss << "_" << SymbolHelper::gen_io_str(ctx); + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + std::string name_temp = + "kernel_conv2d_${kernel_h}x${kernel_w}_${format}_${sparse}_p$" + "{pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${dilate_h}x${" + "dilate_w}" + "${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add("format", get_format(ctx)) + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "kernel_conv2d"; } - std::string name_temp = - "kernel_conv2d_${kernel_h}x${kernel_w}_${format}_${sparse}_p$" - "{pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${dilate_h}x${dilate_w}" - "${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add("format", get_format(ctx)) - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - .add("extra", extra_ss.str()) - .render(name_temp); } std::string ConvGeneral::GetKernelBody(TContext* context) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h index f09484f2..5b27175a 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h @@ -14,6 +14,8 @@ #include "Utils/StringTemplate.h" #include "Utils/SymbolHelper.h" #include "Winograd/WinogradF23Strategy4x8MK4.h" +#include "Winograd/WinogradF43Strategy4x16MK4.h" +#include "Winograd/WinogradF63Strategy4x16MK4.h" #include "compiler/KernelGen/KernelGen.h" namespace megcc { @@ -129,6 +131,41 @@ class Conv1x1FloatMk4 : public GIConvImpl { std::shared_ptr GetInnerCtx(TContext* ctx) const; MatmulM4N12MK4Kernel m_inner_gemm; }; +class WinogradFloatF43Nchw44 : public GIConvImpl { + mutable WinogradFrameNchw44 m_framework; + mutable WinogradF43Strategy4x16MK4 m_winograd_strategy; + +public: + bool IsAvailable(TContext* context) const override; + //! kernel gen + std::string GetKernelBody(TContext* context) const override; + //! 
init gen + std::string GetInitBody(TContext* context) const override; + std::string GetWorkspaceBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; + + std::string GetKernelSymbol(TContext* context) const override; +}; + +class WinogradFloatF63Nchw44 : public GIConvImpl { + mutable WinogradFrameNchw44 m_framework; + mutable WinogradF63Strategy4x16MK4 m_winograd_strategy; + +public: + bool IsAvailable(TContext* context) const override; + //! kernel gen + std::string GetKernelBody(TContext* context) const override; + //! init gen + std::string GetInitBody(TContext* context) const override; + std::string GetWorkspaceBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; + + std::string GetKernelSymbol(TContext* context) const override; +}; } // namespace GeneralIntrinsic } // namespace KernelGen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp index eecacb7b..f5cdc8fc 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp @@ -46,30 +46,34 @@ bool Conv1x1FloatMk4::IsAvailable(TContext* ctx) const { std::string Conv1x1FloatMk4::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + std::string name_temp = + "GI_kernel_conv2d_conv1x1_${format}_${kernel_h}x${kernel_w}_${" + "sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" + "dilate_h}x${dilate_w}${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add_ctx_str("format") + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "GI_kernel_conv2d_conv1x1"; } - std::string name_temp = - "GI_kernel_conv2d_conv1x1_${format}_${kernel_h}x${kernel_w}_${" - "sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" - "dilate_h}x${dilate_w}${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add_ctx_str("format") - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - .add("extra", extra_ss.str()) - .render(name_temp); } std::string Conv1x1FloatMk4::GetInitBody(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp index 86fba8ad..945e35c2 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp @@ -46,19 +46,25 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { return param_value_ok && param_mode_ok 
&& type_ok && noline_ok && layout_ok; } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { - auto src_tensor = ctx->getAttrOprand("operand:0"); - CC_ASSERT(src_tensor.shape.size() > 0) - << "src_tensor size should > 0, now" << src_tensor.shape.size(); - uint32_t ic = src_tensor.shape[1]; - auto dst_tensor = ctx->getAttrOprand( - "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); - uint32_t oc = dst_tensor.shape[1] * 4; - std::string name_temp = "${base_kernel_sym}_nchw_nchw44_oc${oc}_ic${ic}"; - return StringTemplate::StringTemplateArgs(ctx) - .add("base_kernel_sym", GIConvImpl::GetKernelSymbol(ctx)) - .add("oc", oc) - .add("ic", ic) - .render(name_temp); + if (ctx) { + auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); + uint32_t ic = src_tensor.shape[1]; + auto dst_tensor = ctx->getAttrOprand( + "operand:" + + std::to_string(ctx->getAttrInt("nr_operands") - 1)); + uint32_t oc = dst_tensor.shape[1] * 4; + std::string name_temp = + "${base_kernel_sym}_nchw_nchw44_oc${oc}_ic${ic}"; + return StringTemplate::StringTemplateArgs(ctx) + .add("base_kernel_sym", GIConvImpl::GetKernelSymbol(ctx)) + .add("oc", oc) + .add("ic", ic) + .render(name_temp); + } else { + return "GI_kernel_conv2d_nchw_nchw44"; + } } std::string ConvFloatNCHWNCHW44::GetInitBody(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp index 1d82f1df..3fc6391c 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp @@ -66,31 +66,36 @@ std::string gen_im2col(TContext* ctx, TContext* inner_ctx) { } // namespace std::string ConvIm2colFloat::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + extra_ss << ctx->getAttrOprand("operand:0").dtype; + std::string name_temp = + "GI_kernel_conv2d_im2col_${kernel_h}x${kernel_w}_${" + "format}_${sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_" + "d${" + "dilate_h}x${dilate_w}${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add_ctx_str("format") + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "GI_kernel_conv2d_im2col"; } - extra_ss << ctx->getAttrOprand("operand:0").dtype; - std::string name_temp = - "GI_kernel_conv2d_im2col_${kernel_h}x${kernel_w}_${" - "format}_${sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" - "dilate_h}x${dilate_w}${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add_ctx_str("format") - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - 
.add("extra", extra_ss.str()) - .render(name_temp); } bool ConvIm2colFloat::IsAvailable(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp index 961afcfa..f482b13f 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp @@ -8,6 +8,7 @@ */ #include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" #include "ConvKernel.h" #include "GeneralIntrinsic/Activation.h" #include "GeneralIntrinsic/InternalKernel/InternalKernel.h" @@ -90,4 +91,152 @@ std::string WinogradFloatF23NCHW44::GetKernelSymbol(TContext* context) const { auto symbol = GIConvImpl::GetKernelSymbol(context); return symbol + "_winograd_f23"; } + +bool WinogradFloatF43Nchw44::IsAvailable(TContext* ctx) const { + bool param_value_ok = + ctx->getAttrUInt("kernel_h") == 3 && + ctx->getAttrUInt("kernel_w") == 3 && + ctx->getAttrUInt("stride_h") == ctx->getAttrUInt("stride_w") && + ctx->getAttrUInt("stride_h") == 1 && + ctx->getAttrUInt("dilate_h") == 1 && + ctx->getAttrUInt("dilate_w") == 1; + + bool param_mode_ok = ctx->getAttrStr("sparse") == "DENSE" && + ctx->getAttrStr("format") == "NCHW44" && + ctx->getAttrStr("mode") == "CROSS_CORRELATION"; + + bool noline_ok = !ctx->haveAttr("nonlineMode") || + ctx->getAttrStr("nonlineMode") == "IDENTITY" || + ctx->getAttrStr("nonlineMode") == "RELU" || + ctx->getAttrStr("nonlineMode") == "H_SWISH"; + + bool type_ok = ctx->getAttrInt("nr_operands") >= 3 && + ctx->getAttrOprand("operand:0").dtype == "f32" && + ctx->getAttrOprand("operand:1").dtype == "f32" && + ctx->getAttrOprand("operand:2").dtype == "f32"; + + bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && + ctx->getAttrOprand("operand:0").shape[4] == 4; + + return param_value_ok && param_mode_ok && type_ok && noline_ok && layout_ok; +} + +std::string WinogradFloatF43Nchw44::GetInitBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include\"gi_float.h\"\n"; + writer << "#include\n"; + writer << "\n\n"; + writer << GenCommonRet() << " " << GetInitSignature(ctx) << "{\n"; + writer << m_framework.GenInitCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF43Nchw44::GetWorkspaceBody(TContext* ctx) const { + std::stringstream writer; + writer << GenCommonRet() << " " << GetWorkspaceSignature(ctx) << "{\n"; + writer << m_framework.GenGetWorkSpaceCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF43Nchw44::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include \"gi_float.h\""; + writer << "\n\n"; + writer << "extern " + << Arm64::MatmulM4N16MK4Kernel().GetKernelSignature(nullptr) + << ";\n"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + writer << m_framework.GenKernelBodyCode(ctx, &m_winograd_strategy); + writer << "return TinyNN_SUCCESS;\n}"; + return writer.str(); +} + +std::vector WinogradFloatF43Nchw44::GetDependInternalSymbol( + TContext*) const { + auto matmul = Arm64::MatmulM4N16MK4Kernel(); + return {{matmul.GetKernelSymbol(nullptr), matmul.GetKernelBody(nullptr), + matmul.GetBodyGuardBegin(nullptr), matmul.GetBodyGuardEnd(nullptr), + matmul.GetDependInternalSymbol(nullptr)}}; +} + +std::string WinogradFloatF43Nchw44::GetKernelSymbol(TContext* context) const { + auto symbol = 
GIConvImpl::GetKernelSymbol(context); + return symbol + "_winograd_f43"; +} + +bool WinogradFloatF63Nchw44::IsAvailable(TContext* ctx) const { + bool param_value_ok = + ctx->getAttrUInt("kernel_h") == 3 && + ctx->getAttrUInt("kernel_w") == 3 && + ctx->getAttrUInt("stride_h") == ctx->getAttrUInt("stride_w") && + ctx->getAttrUInt("stride_h") == 1 && + ctx->getAttrUInt("dilate_h") == 1 && + ctx->getAttrUInt("dilate_w") == 1; + + bool param_mode_ok = ctx->getAttrStr("sparse") == "DENSE" && + ctx->getAttrStr("format") == "NCHW44" && + ctx->getAttrStr("mode") == "CROSS_CORRELATION"; + + bool noline_ok = !ctx->haveAttr("nonlineMode") || + ctx->getAttrStr("nonlineMode") == "IDENTITY" || + ctx->getAttrStr("nonlineMode") == "RELU" || + ctx->getAttrStr("nonlineMode") == "H_SWISH"; + + bool type_ok = ctx->getAttrInt("nr_operands") >= 3 && + ctx->getAttrOprand("operand:0").dtype == "f32" && + ctx->getAttrOprand("operand:1").dtype == "f32" && + ctx->getAttrOprand("operand:2").dtype == "f32"; + + bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && + ctx->getAttrOprand("operand:0").shape[4] == 4; + + return param_value_ok && param_mode_ok && type_ok && noline_ok && layout_ok; +} + +std::string WinogradFloatF63Nchw44::GetInitBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include\"gi_float.h\"\n"; + writer << "#include\n"; + writer << "\n\n"; + writer << GenCommonRet() << " " << GetInitSignature(ctx) << "{\n"; + writer << m_framework.GenInitCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF63Nchw44::GetWorkspaceBody(TContext* ctx) const { + std::stringstream writer; + writer << GenCommonRet() << " " << GetWorkspaceSignature(ctx) << "{\n"; + writer << m_framework.GenGetWorkSpaceCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF63Nchw44::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include \"gi_float.h\""; + writer << "\n\n"; + writer << "extern " + << Arm64::MatmulM4N16MK4Kernel().GetKernelSignature(nullptr) + << ";\n"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + writer << m_framework.GenKernelBodyCode(ctx, &m_winograd_strategy); + writer << "return TinyNN_SUCCESS;\n}"; + return writer.str(); +} + +std::vector WinogradFloatF63Nchw44::GetDependInternalSymbol( + TContext*) const { + auto matmul = Arm64::MatmulM4N16MK4Kernel(); + return {{matmul.GetKernelSymbol(nullptr), matmul.GetKernelBody(nullptr), + matmul.GetBodyGuardBegin(nullptr), matmul.GetBodyGuardEnd(nullptr), + matmul.GetDependInternalSymbol(nullptr)}}; +} + +std::string WinogradFloatF63Nchw44::GetKernelSymbol(TContext* context) const { + auto symbol = GIConvImpl::GetKernelSymbol(context); + return symbol + "_winograd_f63"; +} // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp index 26f1523d..dae4bd6a 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp @@ -22,7 +22,8 @@ using namespace GeneralIntrinsic; std::string WinogradFrameNchw44::GenGetWorkSpaceCode( TContext* context, WinogradStrategyBase* strategy) { CC_ASSERT(context->getAttrStr("format") == "NCHW44") - << "format mismatch now: "<< context->getAttrStr("format") << ", expect: NCHW44\n"; + << "format mismatch 
now: " << context->getAttrStr("format") + << ", expect: NCHW44\n"; auto WeightShape = context->getAttrOprand("operand:1").shape; std::stringstream ss; std::string workspace_temp = R"({ @@ -55,7 +56,12 @@ std::string WinogradFrameNchw44::GenGetWorkSpaceCode( Alpha * Alpha * OC * ${tile_per_loop} * sizeof(float); output_transform_buf_size = (output_transform_buf_size + Align -1) / Align * Align; - *workspace = input_transform_buf_size + output_transform_buf_size; + + size_t transform_mid_buf_size = 2 * Alpha * Alpha * sizeof(float) * + PACK_C_SIZE; + transform_mid_buf_size = (transform_mid_buf_size + Align -1) / Align * Align; + *workspace = input_transform_buf_size + output_transform_buf_size + + transform_mid_buf_size; return TinyNN_SUCCESS; })"; ss << StringTemplate::StringTemplateArgs() @@ -177,10 +183,18 @@ std::string WinogradFrameNchw44::GenKernelBodyCode( Alpha * Alpha * IC * nr_tiles_per_loop * sizeof(float); input_transform_buf_size = (input_transform_buf_size + Align -1) / Align * Align; + + size_t output_transform_buf_size = + Alpha * Alpha * OC * nr_tiles_per_loop * sizeof(float); + output_transform_buf_size = + (output_transform_buf_size + Align -1) / Align * Align; float* transform_input_ptr = workspace->ptr; float* transform_output_ptr = transform_input_ptr + input_transform_buf_size / sizeof(float); + + float* transform_mid_ptr = transform_output_ptr + + output_transform_buf_size / sizeof(float); const float* input_ptr = input->ptr; const float* weight_ptr = weight->ptr; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h index 58223146..488010d3 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h @@ -63,7 +63,7 @@ class WinogradStrategyBase { }; class WinogradFrameNchw44 { - uint32_t m_tile_per_loop = 24; + uint32_t m_tile_per_loop = 32; public: //! 
gen init code diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp index 0cf02cb8..a2564b21 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp @@ -356,7 +356,7 @@ std::string WinogradF23Strategy4x8MK4::OutputFeatureTrans( MULTI_TWO(1); #undef MULTI_TWO - if (bias_ptr) { + if (bias_ptr_) { GI_FLOAT32_t vbias = GiLoadFloat32(bptr + oc); dst_v[0][0]= GiFloat32Type2FixLenType(GiAddFloat32(GiFixLenType2GiFloat32Type(dst_v[0][0]), vbias)); dst_v[0][1]= GiFloat32Type2FixLenType(GiAddFloat32(GiFixLenType2GiFloat32Type(dst_v[0][1]), vbias)); diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h index 354a816b..2381fa79 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h @@ -1,6 +1,6 @@ /** * \file - * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/Winograd_strategy_4x16_mk4.h + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h * * This file is part of MegCC, a deep learning compiler developed by Megvii. * diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp index fe477774..2716b9cc 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp @@ -20,7 +20,7 @@ static inline std::pair gen_postprocess_inline( TContext* ctx, bool need_postprocess = true) { std::string call_str; std::stringstream declare_ss; - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; if ((nonline_mode == "SIGMOID") && need_postprocess) { @@ -206,11 +206,13 @@ static inline void transpose_1x4_4_s(const float* inptr0, float* outptr) { } static std::string kern_4x12(TContext* ctx) { - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; auto activation_gen = create_activation_gener_instrinsic(nonline_mode); - bool with_bias = ctx->getAttrBool("with_bias"); + bool with_bias = ctx && ctx->getAttrBool("with_bias") + ? ctx->getAttrBool("with_bias") + : false; std::stringstream writer; writer << R"( static inline void kern_4x12_bias_relu(const float* packA, const float* packB, int K, @@ -437,11 +439,13 @@ static inline void kern_4x12_bias_relu(const float* packA, const float* packB, i } static std::string kern_4x4(TContext* ctx) { - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; auto activation_gen = create_activation_gener_instrinsic(nonline_mode); - bool with_bias = ctx->getAttrBool("with_bias"); + bool with_bias = ctx && ctx->getAttrBool("with_bias") + ? 
ctx->getAttrBool("with_bias") + : false; std::stringstream writer; writer << R"( static inline void kern_4x4_bias_relu(const float* packA, const float* packB, int K, @@ -696,10 +700,10 @@ std::string gen_kernel(const std::string& sig, TContext* ctx, std::string MatmulM4N12MK4Kernel::GetKernelSymbol(TContext* ctx) const { std::stringstream ss; ss << "GI_fp32_m4_n12_k4_matmul"; - if (ctx->getAttrBool("with_bias")) { + if (ctx && ctx->getAttrBool("with_bias")) { ss << "_bias"; } - if (ctx->haveAttr("nonlineMode") && + if (ctx && ctx->haveAttr("nonlineMode") && ctx->getAttrStr("nonlineMode") != "IDENTITY") { ss << "_" << ctx->getAttrStr("nonlineMode"); } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp index 0506fdb6..c96b3127 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp @@ -20,7 +20,7 @@ const std::string MatmulInternal::m_workspace_call = "(int y0, int ymax, int x0, int xmax, int k0, int kmax)"; std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return R"((const float* pack_a, const float* pack_b, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr))"; @@ -37,7 +37,7 @@ std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { } std::string MatmulInternal::GenKernelCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return R"((const float* A, size_t LDA, const float* B, size_t LDB, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr, void* workspace))"; @@ -54,7 +54,7 @@ std::string MatmulInternal::GenKernelCall(TContext* ctx) { } std::string MatmulInternal::GenPackACall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return "(float* outptr, const float* inptr, int ldin, int y0, int " "ymax, int k0, int kmax)"; @@ -68,7 +68,7 @@ std::string MatmulInternal::GenPackACall(TContext* ctx) { } std::string MatmulInternal::GenPackBCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return "(float* outptr, const float* inptr, int ldin, int x0, int " "xmax, int k0, int kmax)"; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h index f2df656b..8ce3bca8 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h @@ -22,12 +22,12 @@ class MatmulInternal : public InternalKernelFunc { return "void " + GetKernelSymbol(ctx) + GenKernelCall(ctx); } virtual std::string GetPackASymbol(TContext* ctx) const { - bool trans_a = ctx->getAttrBool("transposeA"); + bool trans_a = ctx && ctx->getAttrBool("transposeA") ? true : false; std::string suffix = trans_a ? "t" : "n"; return GetKernelSymbol(ctx) + "_packa_" + suffix; } virtual std::string GetPackBSymbol(TContext* ctx) const { - bool trans_b = ctx->getAttrBool("transposeB"); + bool trans_b = ctx && ctx->getAttrBool("transposeB") ? 
true : false; std::string suffix = trans_b ? "t" : "n"; return GetKernelSymbol(ctx) + "_packb_" + suffix; } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp index 41856a1c..18c0bdaa 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp @@ -46,11 +46,15 @@ struct AllGICommonKernel { std::make_shared(), std::make_shared()}; inner_map[KernelPack::KernType::ConvKernel] = { + std::make_shared(), std::make_shared(), std::make_shared(), - std::make_shared(), + std::make_shared(), + std::make_shared(), std::make_shared(), - std::make_shared()}; + std::make_shared(), + + }; inner_map[KernelPack::KernType::PoolingKernel] = { std::make_shared(), diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index 20e68428..cd7978b9 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -6,7 +6,7 @@ * * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ - +#include #include "Arm/Arm64/KernelPack.h" #include "Arm/ArmCommon/KernelPack.h" #include "Arm/Armv7/KernelPack.h" @@ -32,29 +32,51 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { //! arm64v7 is used by tinycv, nn opr should be armv64 or armv7, not arm64v7 auto deduce_func = GetDeduceLayout(kernel_type); if (arch == Arch::ARM64 || arch == Arch::ARM64V7) { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); if (kernel_type == KernelPack::KernType::MatrixMulKernel) { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); armcommon_kerns.insert(armcommon_kerns.end(), a64_kerns.begin(), a64_kerns.end()); armcommon_kerns.insert(armcommon_kerns.end(), gi_kerns.begin(), gi_kerns.end()); return {armcommon_kerns, deduce_func}; + } + + std::vector valid_kern; + if (kernel_type == KernelPack::KernType::ConvKernel) { + std::vector sorted_kern(2); + for (auto&& kern : gi_kerns) { + auto kern_sym = kern->GetKernelSymbol(nullptr); + auto is_f63 = std::regex_match( + kern_sym, std::regex("^GI.*_winograd_f63.*")); + auto is_f43 = std::regex_match( + kern_sym, std::regex("^GI.*_winograd_f43.*")); + auto if_match = is_f63 || is_f43; + if (!if_match) { + valid_kern.push_back(kern); + } else { + if (is_f43) { + sorted_kern[0] = kern; + } else { + sorted_kern[1] = kern; + } + } + } + //! 
WARNING: the f63 and f43 must exist in GI kernel + a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), + sorted_kern.end()); } else { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); - a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), - armcommon_kerns.end()); - a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); - return {a64_kerns, deduce_func}; + valid_kern = gi_kerns; } + a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), + armcommon_kerns.end()); + a64_kerns.insert(a64_kerns.end(), valid_kern.begin(), valid_kern.end()); + return {a64_kerns, deduce_func}; + } else if (arch == Arch::ARMV7) { auto a32_kerns = Armv7::ArchKernelPack::GetKernel(kernel_type); @@ -77,10 +99,30 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { #endif else { CC_ASSERT(arch == Arch::BAREMETAL); + //! FIXME: the f43 f63 winograd matmul is using arm64 asm kernel, it is + //! invalid for barmetal auto gi_kerns = GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + std::vector valid_kern; + if (kernel_type == KernelPack::KernType::ConvKernel) { + for (auto&& kern : gi_kerns) { + auto kern_sym = kern->GetKernelSymbol(nullptr); + auto if_match = + std::regex_match(kern_sym, + std::regex("^GI.*_winograd_f63.*")) || + std::regex_match(kern_sym, + std::regex("^GI.*_winograd_f43.*")); + if (!if_match) { + valid_kern.push_back(kern); + } + } + } else { + valid_kern = gi_kerns; + } + auto naive_impl = BareMetal::ArchKernelPack::GetKernel(kernel_type); - naive_impl.insert(naive_impl.begin(), gi_kerns.begin(), gi_kerns.end()); + naive_impl.insert(naive_impl.begin(), valid_kern.begin(), + valid_kern.end()); return {naive_impl, deduce_func}; } } diff --git a/immigration/include/marm_neon.h b/immigration/include/marm_neon.h index 375e9617..f8327103 100644 --- a/immigration/include/marm_neon.h +++ b/immigration/include/marm_neon.h @@ -627,7 +627,7 @@ __ai float32x4_t vfmsq_laneq_f32_impl_3(float32x4_t a, float32x4_t b, #undef vfmaq_laneq_f32 #define vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32_impl_##lane(a, b, v) #undef vfmsq_laneq_f32 -#define vfmsq_laneq_f32(a, b, v, lane) Vfmsq_laneq_f32_impl_##lane(a, b, v) +#define vfmsq_laneq_f32(a, b, v, lane) vfmsq_laneq_f32_impl_##lane(a, b, v) #endif From befea7ec9c9d3d17417949a26cbbc82d4fcbca1a Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 10:45:37 +0800 Subject: [PATCH 12/17] feat(compiler): add f43 winograd for arm64 kernel --- .../Winograd/WinogradF43Strategy4x16MK4.cpp | 1066 +++++++++++++++++ .../Winograd/WinogradF43Strategy4x16MK4.h | 36 + 2 files changed, 1102 insertions(+) create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp new file mode 100644 index 00000000..becbb458 --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp @@ -0,0 +1,1066 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp + * + * This file is part of MegCC, a deep learning 
compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "WinogradF43Strategy4x16MK4.h" +#include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" +#include "GeneralIntrinsic/Activation.h" +#include "GeneralIntrinsic/ConvKernel/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace megcc; +using namespace KernelGen; +using namespace GeneralIntrinsic; + +std::string WinogradF43Strategy4x16MK4::WeightTrans( + const std::vector& strs) { + auto inptr = strs[0]; + auto outptr = strs[1]; + auto OC = strs[2]; + auto IC = strs[3]; + std::string filter_process = R"( + const uint32_t PACK_C_SIZE= 4; + const uint32_t KERNEL_SIZE = 3; + const uint32_t alpha = 4 + 3 - 1; + size_t OCB = ${OC} / PACK_C_SIZE; + size_t ICB = ${IC} / PACK_C_SIZE; + + for (size_t ocb = 0; ocb < OCB; ocb++) { + for (size_t icb = 0; icb < ICB; icb++) { + for (size_t ic_inner = 0; ic_inner < PACK_C_SIZE; ic_inner++) { + const float* fptr = ${filter} + (ocb * ICB + icb) * KERNEL_SIZE * + KERNEL_SIZE * PACK_C_SIZE * PACK_C_SIZE + + ic_inner * PACK_C_SIZE; + //! read 4OC 1IC filter + GI_FLOAT32_t g00 = GiLoadFloat32(fptr + 0* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g01 = GiLoadFloat32(fptr + 1* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g02 = GiLoadFloat32(fptr + 2* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g10 = GiLoadFloat32(fptr + 3* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g11 = GiLoadFloat32(fptr + 4* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g12 = GiLoadFloat32(fptr + 5* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g20 = GiLoadFloat32(fptr + 6* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g21 = GiLoadFloat32(fptr + 7* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g22 = GiLoadFloat32(fptr + 8* PACK_C_SIZE * PACK_C_SIZE); + + //! twice matmul + GI_FLOAT32_t tmp0, tmp1; + ${FilterTransUnroll(3, midle, g, tmp0, tmp1)} + ${FilterTransUnroll(6, ret, midle, tmp0, tmp1)} + + //! 
write to the dst + float* dst = ${outptr}; + ${StoreRet2D(6, 6, ret)}; + } + } + })"; + auto FilterTransUnroll = [](const std::vector& strs) { + int times = std::stoi(strs[0]); + std::string dst = strs[1]; + std::string src = strs[2]; + std::string tmp0 = strs[3]; + std::string tmp1 = strs[4]; + std::stringstream ss; + for (int i = 0; i < times; i++) { + ss << "GI_FLOAT32_t " << dst << i << "0 = GiMultiplyScalerFloat32(" + << src << "0" << i << ", 0.25f);\n"; + ss << tmp0 << " = GiMultiplyScalerFloat32(GiAddFloat32(" << src + << "0" << i << ", " << src << "2" << i << "), (-1.0/6));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", (-1.0/6));\n"; + ss << "GI_FLOAT32_t " << dst << i << "1 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "2 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 1.0/24), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 1.0/6));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 1.0/12);\n"; + ss << "GI_FLOAT32_t " << dst << i << "3 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "4 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "5 = " << src << "2" << i + << ";\n"; + } + return ss.str(); + }; + + auto StoreRet2D = [](const std::vector& strs) { + int times_out = std::stoi(strs[0]); + int times_inner = std::stoi(strs[1]); + std::string src = strs[2]; + std::stringstream ss; + for (int out = 0; out < times_out; out++) { + for (int inner = 0; inner < times_inner; inner++) { + ss << "GiStoreFloat32(dst + (" << out << " * alpha + " << inner + << ") * OCB * ICB * PACK_C_SIZE * PACK_C_SIZE + ocb * ICB * " + "PACK_C_SIZE *PACK_C_SIZE + icb* PACK_C_SIZE * " + "PACK_C_SIZE + " + "ic_inner*PACK_C_SIZE, " + << src << out << inner << ");\n"; + } + } + return ss.str(); + }; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("StoreRet2D", StoreRet2D) + .add("FilterTransUnroll", FilterTransUnroll) + .add("OC", OC) + .add("IC", IC) + .add("filter", inptr) + .add("outptr", outptr) + .render(filter_process); + return ss.str(); +} + +std::string WinogradF43Strategy4x16MK4::InputFeatureTrans( + const std::vector& strs) { + auto InputTransformF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + size_t ICB = IC_ / PACK_C_SIZE; + size_t icb = ic / PACK_C_SIZE; + + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = input_parameters; + #else + GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters); + #endif + int base_offset= ic * IH_ * IW_ + ih_start * IW_ * 4 + iw_start * 4; + const float* input_ptr_ =source; + + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t d00, d01, d02, d03, d04, d05; + GI_FLOAT32_t d10, d11, d12, d13, d14, d15; + GI_FLOAT32_t d20, d21, d22, d23, d24, d25; + GI_FLOAT32_t d30, d31, d32, d33, d34, d35; + +#define cb(i) GI_FLOAT32_t t##i; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 0 -> d00 ... d05 + int offset = base_offset; + const float* line_ptr = input_ptr_+ offset; + if (inner) { +#define cb(i) d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[0] == 1) { +#define cb(i) d0##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d0##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 4 -> d30 ... t35 + offset = base_offset + 4 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d3##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[4] == 1 ) { +#define cb(i) \ + d3##i = iw_valid[i] == 1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d3##i = zero; \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 2 -> d20 ... d25 + offset = base_offset + 2 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d2##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[2] == 1 ) { +#define cb(i) \ + d2##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d2##i = zero; \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 3 -> d10 ... d15 + offset = base_offset + 3 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d1##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[3] ==1 ) { +#define cb(i) d1##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d1##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + float* buf_ptr = dst + icb * nr_tiles_in_loop_ * PACK_C_SIZE + + tile_idx * PACK_C_SIZE; + + d00 = MADD(t4, t0, v0, 0); + d00 = MSUB(d00, t2, v0, 1); + GiStoreFloat32(buf_ptr, d00); + d00 = MSUB(t3, t1, v0, 0); + d01 = MSUB(t4, t2, v0, 0); + d02 = GiAddFloat32(d00, d01); + GiStoreFloat32(buf_ptr + 1 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d02 = GiSubtractFloat32(d01, d00); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d00 = GiSubtractFloat32(t3, t1); + d01 = GiSubtractFloat32(t4, t2); + d02 = MADD(d01, d00, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d02 = MSUB(d01, d00, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d01 = GiSubtractFloat32(t5, t3); + d02 = MSUB(d01, d00, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + +// ln4 - ln2 -> t +#define cb(i) t##i = GiSubtractFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 1 -> d00 ... d05 + offset = base_offset + 1 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[1] ==1) { +#define cb(i) d0##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d0##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + +// ln4 - 4 * ln2 -> ln4 +#define cb(i) d3##i = MSUB(d3##i, d2##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// ln3 - 4 * ln1 -> ln2 +#define cb(i) d2##i = MSUB(d1##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// ln3 - ln1 -> ln3 +#define cb(i) d1##i = GiSubtractFloat32(d1##i, d0##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - 4 * ln2)[ln4] + (ln3 - 4 * ln1)[ln2] -> ln1 +#define cb(i) d0##i = GiAddFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - 4 * ln2)[ln4] - (ln3 - 4 * ln1)[ln2] -> ln2 +#define cb(i) d2##i = GiSubtractFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // ln4(d30 ... d35) is free until now + buf_ptr = dst + 1 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + d30 = MADD(d04, d00, v0, 0); + d30 = MSUB(d30, d02, v0, 1); + GiStoreFloat32(buf_ptr, d30); + d30 = MSUB(d03, d01, v0, 0); + d32 = MSUB(d04, d02, v0, 0); + d31 = GiAddFloat32(d30, d32); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d31); + d31 = GiSubtractFloat32(d32, d30); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d31); + d30 = GiSubtractFloat32(d03, d01); + d31 = GiSubtractFloat32(d04, d02); + d32 = MADD(d31, d30, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + d32 = MSUB(d31, d30, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + d31 = GiSubtractFloat32(d05, d03); + d32 = MSUB(d31, d30, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + + buf_ptr = dst + 2 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + d33 = MADD(d24, d20, v0, 0); + d33 = MSUB(d33, d22, v0, 1); + GiStoreFloat32(buf_ptr, d33); + d33 = MSUB(d23, d21, v0, 0); + d35 = MSUB(d24, d22, v0, 0); + d34 = GiAddFloat32(d33, d35); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d34); + d34 = GiSubtractFloat32(d35, d33); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d34); + d33 = GiSubtractFloat32(d23, d21); + d34 = GiSubtractFloat32(d24, d22); + d35 = MADD(d34, d33, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + d35 = MSUB(d34, d33, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + d34 = GiSubtractFloat32(d25, d23); + d35 = MSUB(d34, d33, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + +// (ln4 - ln2)[t] + (ln3 - ln1)[ln3] * 2 -> ln4 +#define cb(i) d3##i = MADD(t##i, d1##i, v0, 2); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - ln2)[t] - (ln3 - ln1)[ln3] * 2 -> ln3 +#define cb(i) d1##i = MSUB(t##i, d1##i, v0, 2); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + // t is free + buf_ptr = dst + 3 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t0 = MADD(d34, d30, v0, 0); + t0 = MSUB(t0, d32, v0, 1); + GiStoreFloat32(buf_ptr, t0); + t0 = MSUB(d33, d31, v0, 0); + t2 = MSUB(d34, d32, v0, 0); + t1 = GiAddFloat32(t0, t2); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t1 = GiSubtractFloat32(t2, t0); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ 
* PACK_C_SIZE, t1); + t0 = GiSubtractFloat32(d33, d31); + t1 = GiSubtractFloat32(d34, d32); + t2 = MADD(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t2 = MSUB(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t1 = GiSubtractFloat32(d35, d33); + t2 = MSUB(t1, t0, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + + buf_ptr = dst + 4 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t3 = MADD(d14, d10, v0, 0); + t3 = MSUB(t3, d12, v0, 1); + GiStoreFloat32(buf_ptr, t3); + t3 = MSUB(d13, d11, v0, 0); + t5 = MSUB(d14, d12, v0, 0); + t4 = GiAddFloat32(t3, t5); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t4); + t4 = GiSubtractFloat32(t5, t3); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t4); + t3 = GiSubtractFloat32(d13, d11); + t4 = GiSubtractFloat32(d14, d12); + t5 = MADD(t4, t3, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + t5 = MSUB(t4, t3, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + t4 = GiSubtractFloat32(d15, d13); + t5 = MSUB(t4, t3, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + + // load line 5 -> d30 ... d35 + offset = base_offset + 5 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d3##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[5] == 1) { +#define cb(i) d3##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d3##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 1 -> d0 ... d5 + offset = base_offset + 1 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[1] ==1) { +#define cb(i) \ + d0##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d0##i = zero; \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 3 -> d10 ... d15 + offset = base_offset + 3 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d1##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[3] == 1 ) { +#define cb(i) \ + d1##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d1##i = zero; \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + buf_ptr = dst + 5 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t0 = MADD(d34, d30, v0, 0); + t0 = MSUB(t0, d32, v0, 1); + GiStoreFloat32(buf_ptr, t0); + t0 = MSUB(d33, d31, v0, 0); + t2 = MSUB(d34, d32, v0, 0); + t1 = GiAddFloat32(t0, t2); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t1 = GiSubtractFloat32(t2, t0); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t0 = GiSubtractFloat32(d33, d31); + t1 = GiSubtractFloat32(d34, d32); + t2 = MADD(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t2 = MSUB(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t1 = GiSubtractFloat32(d35, d33); + t2 = MSUB(t1, t0, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + + + +)"; + return kernel; + }; + std::string input_process = R"( + const uint32_t OUTPUT_BLOCK_SIZE = 4; + const uint32_t KS = 3; + + float* dst = ${transform_input_ptr}; + const float* source = ${inptr}; + uint32_t IH_ = ${IH}; + uint32_t IW_ = ${IW}; + uint32_t IC_ = ${IC}; + uint32_t PH_ = ${PH}; + uint32_t PW_ = ${PW}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tile_id_ = ${tile_id}; + + const float input_parameters[4] = {4.0f, 5.0f, 2.0f, 0.0f}; + + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use + //! GiMultiplyAddScalarFloat32 + #define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) + #define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) + #else + #define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) + #define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) + #endif + + uint32_t OW = IW_ + 2 * PW_ - KS + 1; + uint32_t tiles_w = (OW + OUTPUT_BLOCK_SIZE -1)/ OUTPUT_BLOCK_SIZE; + int ih_valid[6]={0,0,0,0,0,0}; + int iw_valid[6]={0,0,0,0,0,0}; + + for (uint32_t ic = 0; ic < IC_; ic += 4) { + uint32_t tile_start_id = tile_id_; + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_start_id + tile_idx; + uint32_t nh = index / tiles_w; + uint32_t nw = index % tiles_w; + + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH_; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW_; + int inner = (ih_start >= 0 && iw_start >= 0 && + ih_start + Alpha <= (int)IH_ && + iw_start + Alpha <= (int)IW_)?1:0; + if(!inner){ + for (int iho = 0; iho < Alpha; ++iho) { + ih_valid[iho] = + (iho + ih_start >= 0 && + iho + ih_start < (int)IH_) ? 
1:0; + } + for (int iwo = 0; iwo < Alpha; ++iwo) { + iw_valid[iwo] = + (iwo + iw_start >= 0 && + iwo + iw_start < (int)(IW_))?1:0; + } + } + ${InputTransformF43NCHW44()} + } + })"; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("inptr", strs[0]) + .add("transform_input_ptr", strs[1]) + .add("IH", strs[2]) + .add("IW", strs[3]) + .add("IC", strs[4]) + .add("PH", strs[5]) + .add("PW", strs[6]) + .add("tile_id", strs[7]) + .add("nr_tiles_in_loop", strs[8]) + .add("InputTransformF43NCHW44", InputTransformF43NCHW44) + .render(input_process); + return ss.str(); +} + +std::string WinogradF43Strategy4x16MK4::DependMatmulSymbol() { + return Arm64::MatmulM4N16MK4Kernel().GetKernelSymbol(NULL); +} + +std::string WinogradF43Strategy4x16MK4::BatchedMatMul( + const std::vector& strs) { + std::string matmul_compute = R"( + for(uint32_t i =0; i< Alpha; i++){ + for(uint32_t j=0; j& strs, TContext* ctx) { + std::string ouput_trans = R"( + float* transform_output_ptr_ = ${transform_output_ptr}; + const float output_parameters[4] = {1.0f, 2.0f, 4.0f, 8.0f}; + float* outptr_ = ${outptr}; + const float* bias = ${bias_ptr}; + + uint32_t OH_ = ${OH}; + uint32_t OW_ = ${OW}; + uint32_t OC_ = ${OC}; + + uint32_t tile_id_ = ${tile_id}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = output_parameters; + #else + GI_FLOAT32_t v0 = GiLoadFloat32(output_parameters); + #endif + uint32_t tiles_w_ = (OW_ + OutputBlockSize -1) / OutputBlockSize; + for (uint32_t oc = 0; oc < OC_; oc += 4) { + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_id_ + tile_idx; + uint32_t nh = index / tiles_w_; + uint32_t nw = index % tiles_w_; + uint32_t oh_start = nh * OutputBlockSize; + uint32_t ow_start = nw * OutputBlockSize; + + size_t num_valid_oh =(OH_ - oh_start) < 4 ?(OH_ - oh_start) : 4; + size_t num_valid_ow = (OW_ - ow_start) < 4 ?(OW_ - ow_start) : 4; + + //! AT * m * A + size_t OCB = (OC_) / PACK_C_SIZE; + size_t ocb = oc / PACK_C_SIZE; + size_t col_step = OCB * nr_tiles_in_loop_ * 4; + size_t row_step = Alpha * col_step; + + GI_FLOAT32_t vbias = GiZeroFloat32(); + GI_FLOAT32_t v00, v01, v02, v03, v04, v05; + GI_FLOAT32_t v10, v11, v12, v13, v14, v15; + GI_FLOAT32_t v20, v21, v22, v23, v24, v25; + GI_FLOAT32_t v30, v31, v32, v33, v34, v35; + GI_FLOAT32_t v40, v41, v42, v43, v44, v45; + + if(num_valid_ow == num_valid_oh && num_valid_ow ==4){ + const float* buf_base = + transform_output_ptr_ + ocb * nr_tiles_in_loop_ * 4 + tile_idx * 4; + const float* buf_ptr = NULL; + + // load line 1 -> v10 ... v15 + buf_ptr = buf_base + row_step; + #define cb(i) v1##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 2 -> v20 ... v25 + buf_ptr = buf_base + 2 * row_step; + #define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v0##i = GiAddFloat32(v1##i, v2##i); \ + v1##i = GiSubtractFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 3 -> v30 ... v35 + buf_ptr = buf_base + 3 * row_step; + #define cb(i) v3##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 4 -> v40 ... 
v45 + buf_ptr = buf_base + 4 * row_step; + #define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v2##i = GiAddFloat32(v3##i, v4##i); \ + v3##i = GiSubtractFloat32(v3##i, v4##i); \ + v4##i = MADD(v0##i, v2##i, v0, 2); \ + v2##i = GiAddFloat32(v2##i, v0##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + ${nonline_gen_init()} + float* output_base = outptr_ + oc * OH_ * OW_ + oh_start * OW_ * PACK_C_SIZE + + ow_start * PACK_C_SIZE; + float* output_ptr = output_base + 2 * OW_ * PACK_C_SIZE; + if (bias) { + vbias = GiLoadFloat32(bias + oc); + } + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + buf_ptr = buf_base; + #define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v4##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr = output_base; + + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02,v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05,v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + #define cb(i) v4##i = MADD(v1##i, v3##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr = output_base + OW_ * PACK_C_SIZE; + + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + buf_ptr = buf_base + 5 * row_step; + #define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v1##i = MADD(v1##i, v3##i, v0, 3); \ + v2##i = GiAddFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr 
= output_base + 3 * OW_ * PACK_C_SIZE; + + v00 = GiAddFloat32(v21, v22); + v01 = GiAddFloat32(v23, v24); + v02 = GiAddFloat32(v20, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v21, v22); + v04 = GiSubtractFloat32(v23, v24); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v25); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + }else{ + + const float* buf_base = + transform_output_ptr_ + ocb * nr_tiles_in_loop_ * 4 + tile_idx * 4; + const float* buf_ptr = NULL; + + // load line 1 -> v10 ... v15 + buf_ptr = buf_base + row_step; +#define cb(i) v1##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 2 -> v20 ... v25 + buf_ptr = buf_base + 2 * row_step; +#define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v0##i = GiAddFloat32(v1##i, v2##i); \ + v1##i = GiSubtractFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 3 -> v30 ... v35 + buf_ptr = buf_base + 3 * row_step; +#define cb(i) v3##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 4 -> v40 ... v45 + buf_ptr = buf_base + 4 * row_step; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v2##i = GiAddFloat32(v3##i, v4##i); \ + v3##i = GiSubtractFloat32(v3##i, v4##i); \ + v4##i = MADD(v0##i, v2##i, v0, 2); \ + v2##i = GiAddFloat32(v2##i, v0##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 2, v40 ... v45 -> v02 ... v05 + // v40 ... v45 is free. + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v04 = MADD(v00, v01, v0, 2); + + v00 = GiSubtractFloat32(v41, v42); + v01 = GiSubtractFloat32(v43, v44); + v03 = MADD(v00, v01, v0, 1); + + v05 = MADD(v00, v01, v0, 3); + v05 = GiAddFloat32(v05, v45); + + buf_ptr = buf_base; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v4##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 0 + // v40 ... v45 -> v22 ... v25 + v20 = GiAddFloat32(v41, v42); + v21 = GiAddFloat32(v43, v44); + v22 = GiAddFloat32(v40, v20); + v22 = GiAddFloat32(v22, v21); + + v24 = MADD(v20, v21, v0, 2); + + v20 = GiSubtractFloat32(v41, v42); + v21 = GiSubtractFloat32(v43, v44); + v23 = MADD(v20, v21, v0, 1); + + v25 = MADD(v20, v21, v0, 3); + v25 = GiAddFloat32(v25, v45); + +#define cb(i) \ + v4##i = MADD(v1##i, v3##i, v0, 1); \ + v3##i = MADD(v1##i, v3##i, v0, 3); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 1 + // v40 ... v45 -> v12 ... 
v15 + v10 = GiAddFloat32(v41, v42); + v11 = GiAddFloat32(v43, v44); + v12 = GiAddFloat32(v40, v10); + v12 = GiAddFloat32(v12, v11); + + v14 = MADD(v10, v11, v0, 2); + + v10 = GiSubtractFloat32(v41, v42); + v11 = GiSubtractFloat32(v43, v44); + v13 = MADD(v10, v11, v0, 1); + + v15 = MADD(v10, v11, v0, 3); + v15 = GiAddFloat32(v15, v45); + + buf_ptr = buf_base + 5 * row_step; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v3##i, v4##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 3 + // v40 ... v45 -> v32 ... v35 + v30 = GiAddFloat32(v41, v42); + v31 = GiAddFloat32(v43, v44); + v32 = GiAddFloat32(v40, v30); + v32 = GiAddFloat32(v32, v31); + + v34 = MADD(v30, v31, v0, 2); + + v30 = GiSubtractFloat32(v41, v42); + v31 = GiSubtractFloat32(v43, v44); + v33 = MADD(v30, v31, v0, 1); + + v35 = MADD(v30, v31, v0, 3); + v35 = GiAddFloat32(v35, v45); + + float* output_base = outptr_ + oc * OH_ * OW_ + oh_start * OW_ * PACK_C_SIZE + + ow_start * PACK_C_SIZE; + float* output_ptr = NULL; + + + ${nonline_gen_init()} + if (bias) { + vbias = GiLoadFloat32(bias + oc); + } +# define BIAS_LINE(i, j, k) \ + v##j##k = GiAddFloat32(v##j##k, vbias); + +#define BIAS(m) \ + BIAS_LINE(3, m, 5) \ + BIAS_LINE(2, m, 4) \ + BIAS_LINE(1, m, 3) \ + BIAS_LINE(0, m, 2) + + +// add_bias +if(bias){ + BIAS(0) + BIAS(1) + BIAS(2) + BIAS(3) +} +#undef BIAS_LINE +#undef BIAS + +// activate + +${nonline_gen_func(v35, vbias)};v35=vbias; +${nonline_gen_func(v34, vbias)};v34=vbias; +${nonline_gen_func(v33, vbias)};v33=vbias; +${nonline_gen_func(v32, vbias)};v32=vbias; + +${nonline_gen_func(v25, vbias)};v25=vbias; +${nonline_gen_func(v24, vbias)};v24=vbias; +${nonline_gen_func(v23, vbias)};v23=vbias; +${nonline_gen_func(v22, vbias)};v22=vbias; + +${nonline_gen_func(v15, vbias)};v15=vbias; +${nonline_gen_func(v14, vbias)};v14=vbias; +${nonline_gen_func(v13, vbias)};v13=vbias; +${nonline_gen_func(v12, vbias)};v12=vbias; + +${nonline_gen_func(v05, vbias)};v05=vbias; +${nonline_gen_func(v04, vbias)};v04=vbias; +${nonline_gen_func(v03, vbias)};v03=vbias; +${nonline_gen_func(v02, vbias)};v02=vbias; + + +// store +# define STORE_LINE(i, j, k) \ +if(num_valid_ow >i){ \ + GiStoreFloat32(output_ptr + i * PACK_C_SIZE, v##j##k); \ +} +#define STORE(m, l) \ +if(num_valid_oh >m){ \ + output_ptr = output_base + m * OW_ * PACK_C_SIZE; \ + STORE_LINE(3, l, 5) \ + STORE_LINE(2, l, 4) \ + STORE_LINE(1, l, 3) \ + STORE_LINE(0, l, 2) \ +} + STORE(3, 3) + STORE(2, 0) + STORE(1, 1) + STORE(0, 2) + } + +#undef MSUB +#undef MADD + } + })"; + std::string nonline_mode = ctx->haveAttr("nonlineMode") + ? 
ctx->getAttrStr("nonlineMode") + : "IDENTITY"; + auto nonline_gen = create_activation_gener_instrinsic(nonline_mode); + auto nonline_gen_func = [&](std::vector str) -> std::string { + return nonline_gen->GenIntrinsicFloat(str[0], str[1]); + }; + auto nonline_gen_init = [&]() -> std::string { + return nonline_gen->GenIntrinsicInitFloat(); + }; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("nonline_gen_func", nonline_gen_func) + .add("nonline_gen_init", nonline_gen_init) + .add("transform_output_ptr", strs[0]) + .add("outptr", strs[1]) + .add("bias_ptr", strs[2]) + .add("OH", strs[3]) + .add("OW", strs[4]) + .add("OC", strs[5]) + .add("tile_id", strs[6]) + .add("nr_tiles_in_loop", strs[7]) + .render(ouput_trans); + return ss.str(); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h new file mode 100644 index 00000000..13dbccec --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h @@ -0,0 +1,36 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include "Common/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "WinogradCommon.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace GeneralIntrinsic { + +class WinogradF43Strategy4x16MK4 : public WinogradStrategyBase { +public: + uint32_t GetKernelSize() override { return 3; } + uint32_t GetOutputBlockSize() override { return 4; } + std::string DependMatmulSymbol() override; + std::string WeightTrans(const std::vector& strs) override; + std::string InputFeatureTrans( + const std::vector& strs) override; + std::string BatchedMatMul(const std::vector& strs) override; + std::string OutputFeatureTrans(const std::vector& strs, + TContext*) override; +}; + +} // namespace GeneralIntrinsic +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen From 4f3b5ddb15cc5610315d02b8dd56a23b3cf0a3dd Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 11:11:38 +0800 Subject: [PATCH 13/17] feat(compiler): add f63 winograd for arm64 kernel --- .../Winograd/WinogradF63Strategy4x16MK4.cpp | 667 ++++++++++++++++++ .../Winograd/WinogradF63Strategy4x16MK4.h | 36 + 2 files changed, 703 insertions(+) create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp new file mode 100644 index 00000000..c63bb9f9 --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp @@ -0,0 +1,667 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "WinogradF63Strategy4x16MK4.h" +#include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" +#include "GeneralIntrinsic/Activation.h" +#include "GeneralIntrinsic/ConvKernel/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace megcc; +using namespace KernelGen; +using namespace GeneralIntrinsic; + +std::string WinogradF63Strategy4x16MK4::WeightTrans( + const std::vector& strs) { + auto inptr = strs[0]; + auto outptr = strs[1]; + auto OC = strs[2]; + auto IC = strs[3]; + std::string filter_process = R"( + const uint32_t PACK_C_SIZE= 4; + const uint32_t KERNEL_SIZE = 3; + size_t OCB = ${OC} / PACK_C_SIZE; + size_t ICB = ${IC} / PACK_C_SIZE; + + for (size_t ocb = 0; ocb < OCB; ocb++) { + for (size_t icb = 0; icb < ICB; icb++) { + for (size_t ic_inner = 0; ic_inner < PACK_C_SIZE; ic_inner++) { + const float* fptr = ${filter} + (ocb * ICB + icb) * KERNEL_SIZE * + KERNEL_SIZE * PACK_C_SIZE * PACK_C_SIZE + + ic_inner * PACK_C_SIZE; + //! read 4OC 1IC filter + GI_FLOAT32_t g00 = GiLoadFloat32(fptr + 0* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g01 = GiLoadFloat32(fptr + 1* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g02 = GiLoadFloat32(fptr + 2* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g10 = GiLoadFloat32(fptr + 3* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g11 = GiLoadFloat32(fptr + 4* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g12 = GiLoadFloat32(fptr + 5* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g20 = GiLoadFloat32(fptr + 6* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g21 = GiLoadFloat32(fptr + 7* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g22 = GiLoadFloat32(fptr + 8* PACK_C_SIZE * PACK_C_SIZE); + + //! twice matmul + GI_FLOAT32_t tmp0, tmp1; + ${FilterTransUnroll(3, midle, g, tmp0, tmp1)} + ${FilterTransUnroll(8, ret, midle, tmp0, tmp1)} + + //! 
write to the dst + float* dst = ${outptr}; + ${StoreRet2D(8, 8, ret)}; + } + } + })"; + auto FilterTransUnroll = [](const std::vector& strs) { + int times = std::stoi(strs[0]); + std::string dst = strs[1]; + std::string src = strs[2]; + std::string tmp0 = strs[3]; + std::string tmp1 = strs[4]; + std::stringstream ss; + for (int i = 0; i < times; i++) { + ss << "GI_FLOAT32_t " << dst << i << "0 = " << src << "0" << i + << ";\n"; + ss << tmp0 << " = GiMultiplyScalerFloat32(GiAddFloat32(" << src + << "0" << i << ", " << src << "2" << i << "), (-2.0/9));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", (-2.0/9));\n"; + ss << "GI_FLOAT32_t " << dst << i << "1 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "2 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 1.0/90), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 2.0/45));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 2.0/90);\n"; + ss << "GI_FLOAT32_t " << dst << i << "3 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "4 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 32.0/45), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 8.0/45));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 16.0/45);\n"; + ss << "GI_FLOAT32_t " << dst << i << "5 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "6 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "7 = " << src << "2" << i + << ";\n"; + } + return ss.str(); + }; + + auto StoreRet2D = [](const std::vector& strs) { + int times_out = std::stoi(strs[0]); + int times_inner = std::stoi(strs[1]); + std::string src = strs[2]; + std::stringstream ss; + for (int out = 0; out < times_out; out++) { + for (int inner = 0; inner < times_inner; inner++) { + ss << "GiStoreFloat32(dst + (" << out << " * Alpha + " << inner + << ") * OCB * ICB * PACK_C_SIZE * PACK_C_SIZE + ocb * ICB * " + "PACK_C_SIZE *PACK_C_SIZE + icb* PACK_C_SIZE * " + "PACK_C_SIZE + " + "ic_inner*PACK_C_SIZE, " + << src << out << inner << ");\n"; + } + } + return ss.str(); + }; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("StoreRet2D", StoreRet2D) + .add("FilterTransUnroll", FilterTransUnroll) + .add("OC", OC) + .add("IC", IC) + .add("filter", inptr) + .add("outptr", outptr) + .render(filter_process); + return ss.str(); +} + +std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( + const std::vector& strs) { + auto InputPrepareF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + size_t IW4 = IW_ * PACK_C_SIZE; + size_t iw4_start = iw_start * PACK_C_SIZE; + size_t icb = ic / PACK_C_SIZE; + memset(patchT, 0, sizeof(float) * PACK_C_SIZE * Alpha * Alpha); + if (inner) { + const float* input_ptr = + source + icb * IH_ * IW4 + ih_start * IW4 + iw4_start; + for (size_t ih = 0; ih < Alpha; ih++) { +#define cb(i) GI_FLOAT32_t v##i = GiLoadFloat32(input_ptr + PACK_C_SIZE * i); + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + +#define cb(i) GiStoreFloat32(patchT + ih * PACK_C_SIZE * Alpha + i * PACK_C_SIZE, v##i); + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + input_ptr += IW4; + } + } else { + int ih0_act = ih_start >0 ? 
ih_start:0, + ih1_act = (ih_start + Alpha)< IH_?(ih_start + Alpha):IH_, + iw0_act = iw_start > 0 ? iw_start : 0, + iw1_act =(iw_start + Alpha)< IW_?(iw_start + Alpha):IW_; + const float* input_ptr = source + icb * IH_ * IW4; + // partial copy + for (int ih = ih0_act; ih < ih1_act; ++ih) { + for (int iw = iw0_act; iw < iw1_act; ++iw) { + size_t iho = ih - ih_start, iwo = iw - iw_start; + GI_FLOAT32_t src = GiLoadFloat32(input_ptr + ih * IW4 + iw * PACK_C_SIZE); + GiStoreFloat32( + patchT + iho * PACK_C_SIZE * Alpha + iwo * PACK_C_SIZE, src); + } + } + } + + +)"; + return kernel; + }; + auto InputTransformF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + // BT * d * B + + size_t ICB = IC_ / PACK_C_SIZE; + + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7; +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = input_parameters + 0; + const float* v1 = input_parameters + 4; + const float* v2 = input_parameters + 8; +#else + GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); + GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); + GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); +#endif + + //! B + //! 1 0 0 0 0 0 0 0 + //! 0 1 -1 0.5 -0.5 2 -2 -1 + //! -5.25 1 1 0.25 0.25 4 4 0 + //! 0 -4.25 4.25 -2.5 2.5 -2.5 2.5 5.25 + //! 5.25 -4.25 -4.25 -1.25 -1.25 -5 -5 0 + //! 0 1 -1 2 -2 0.5 -0.5 -5.25 + //! -1 1 1 1 1 1 1 0 + //! 0 0 0 0 0 0 0 1 + +#define cb(i) \ + d1 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 1 * PACK_C_SIZE); \ + d2 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 2 * PACK_C_SIZE); \ + d3 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 3 * PACK_C_SIZE); \ + d4 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 4 * PACK_C_SIZE); \ + d5 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 5 * PACK_C_SIZE); \ + d6 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 6 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##0 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 0 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##7 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 7 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##1 = d6; \ + GI_FLOAT32_t t##i##2 = d6; \ + GI_FLOAT32_t t##i##3 = d6; \ + GI_FLOAT32_t t##i##4 = d6; \ + GI_FLOAT32_t t##i##5 = d6; \ + GI_FLOAT32_t t##i##6 = d6; \ + t##i##0 = GiSubtractFloat32(t##i##0, d6); \ + t##i##1 = GiAddFloat32(t##i##1, d1); \ + t##i##2 = GiSubtractFloat32(t##i##2, d1); \ + t##i##3 = MADD(t##i##3, d1, v0, 2); \ + t##i##4 = MSUB(t##i##4, d1, v0, 2); \ + t##i##5 = MADD(t##i##5, d1, v1, 2); \ + t##i##6 = MSUB(t##i##6, d1, v1, 2); \ + t##i##7 = GiSubtractFloat32(t##i##7, d1); \ + t##i##0 = MSUB(t##i##0, d2, v0, 0); \ + t##i##1 = GiAddFloat32(t##i##1, d2); \ + t##i##2 = GiAddFloat32(t##i##2, d2); \ + t##i##3 = MADD(t##i##3, d2, v0, 3); \ + t##i##4 = MADD(t##i##4, d2, v0, 3); \ + t##i##5 = MADD(t##i##5, d2, v1, 3); \ + t##i##6 = MADD(t##i##6, d2, v1, 3); \ + t##i##1 = MSUB(t##i##1, d3, v0, 1); \ + t##i##2 = MADD(t##i##2, d3, v0, 1); \ + t##i##3 = MSUB(t##i##3, d3, v1, 0); \ + t##i##4 = MADD(t##i##4, d3, v1, 0); \ + t##i##5 = MSUB(t##i##5, d3, v1, 0); \ + t##i##6 = MADD(t##i##6, d3, v1, 0); \ + t##i##7 = MADD(t##i##7, d3, v0, 0); \ + t##i##0 = MADD(t##i##0, d4, v0, 0); \ + t##i##1 = MSUB(t##i##1, d4, v0, 1); \ + t##i##2 = MSUB(t##i##2, d4, v0, 1); \ + t##i##3 = MSUB(t##i##3, d4, v1, 1); \ + t##i##4 = MSUB(t##i##4, d4, v1, 1); \ + t##i##5 = MSUB(t##i##5, d4, v2, 0); \ + t##i##6 = MSUB(t##i##6, d4, v2, 0); \ + t##i##1 = GiAddFloat32(t##i##1, d5); \ + t##i##2 = GiSubtractFloat32(t##i##2, d5); \ + t##i##3 = 
MADD(t##i##3, d5, v1, 2); \ + t##i##4 = MSUB(t##i##4, d5, v1, 2); \ + t##i##5 = MADD(t##i##5, d5, v0, 2); \ + t##i##6 = MSUB(t##i##6, d5, v0, 2); \ + t##i##7 = MSUB(t##i##7, d5, v0, 0); + UNROLL_CALL_RAW(8, cb); +#undef cb + +#define cb(i) \ + d0 = t0##i; \ + d1 = t6##i; \ + d2 = t6##i; \ + d3 = t6##i; \ + d4 = t6##i; \ + d5 = t6##i; \ + d6 = t6##i; \ + d7 = t7##i; \ + d0 = GiSubtractFloat32(d0, t6##i); \ + d1 = GiAddFloat32(d1, t1##i); \ + d2 = GiSubtractFloat32(d2, t1##i); \ + d3 = MADD(d3, t1##i, v0, 2); \ + d4 = MSUB(d4, t1##i, v0, 2); \ + d5 = MADD(d5, t1##i, v1, 2); \ + d6 = MSUB(d6, t1##i, v1, 2); \ + d7 = GiSubtractFloat32(d7, t1##i); \ + d0 = MSUB(d0, t2##i, v0, 0); \ + d1 = GiAddFloat32(d1, t2##i); \ + d2 = GiAddFloat32(d2, t2##i); \ + d3 = MADD(d3, t2##i, v0, 3); \ + d4 = MADD(d4, t2##i, v0, 3); \ + d5 = MADD(d5, t2##i, v1, 3); \ + d6 = MADD(d6, t2##i, v1, 3); \ + d1 = MSUB(d1, t3##i, v0, 1); \ + d2 = MADD(d2, t3##i, v0, 1); \ + d3 = MSUB(d3, t3##i, v1, 0); \ + d4 = MADD(d4, t3##i, v1, 0); \ + d5 = MSUB(d5, t3##i, v1, 0); \ + d6 = MADD(d6, t3##i, v1, 0); \ + d7 = MADD(d7, t3##i, v0, 0); \ + d0 = MADD(d0, t4##i, v0, 0); \ + d1 = MSUB(d1, t4##i, v0, 1); \ + d2 = MSUB(d2, t4##i, v0, 1); \ + d3 = MSUB(d3, t4##i, v1, 1); \ + d4 = MSUB(d4, t4##i, v1, 1); \ + d5 = MSUB(d5, t4##i, v2, 0); \ + d6 = MSUB(d6, t4##i, v2, 0); \ + d1 = GiAddFloat32(d1, t5##i); \ + d2 = GiSubtractFloat32(d2, t5##i); \ + d3 = MADD(d3, t5##i, v1, 2); \ + d4 = MSUB(d4, t5##i, v1, 2); \ + d5 = MADD(d5, t5##i, v0, 2); \ + d6 = MSUB(d6, t5##i, v0, 2); \ + d7 = MSUB(d7, t5##i, v0, 0); \ + GiStoreFloat32( \ + dst + \ + (0 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d0); \ + GiStoreFloat32( \ + dst + \ + (1 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d1); \ + GiStoreFloat32( \ + dst + \ + (2 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d2); \ + GiStoreFloat32( \ + dst + \ + (3 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d3); \ + GiStoreFloat32( \ + dst + \ + (4 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d4); \ + GiStoreFloat32( \ + dst + \ + (5 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d5); \ + GiStoreFloat32( \ + dst + \ + (6 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d6); \ + GiStoreFloat32( \ + dst + \ + (7 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d7); + UNROLL_CALL_RAW(8, cb); +#undef cb + + +)"; + return kernel; + }; + + std::string input_process = R"( + const uint32_t OUTPUT_BLOCK_SIZE = 6; + const uint32_t KS = 3; + + float* dst = ${transform_input_ptr}; + const float* source = ${inptr}; + uint32_t IH_ = ${IH}; + uint32_t IW_ = ${IW}; + uint32_t IC_ = ${IC}; + uint32_t PH_ = ${PH}; + uint32_t PW_ = ${PW}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tile_id_ = ${tile_id}; + + + const float input_parameters[12] = {5.25f, 4.25f, 0.5f, 0.25f, 2.5f, 1.25f, + 2.0f, 4.0f, 5.0f, 0.0f, 0.0f, 0.0f}; + + #if defined(GI_TARGET_X86) || 
defined(GI_RVV_INTRINSICS) + //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use + //! GiMultiplyAddScalarFloat32 + #define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) + #define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) + #else + #define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) + #define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) + #endif + + uint32_t OW = IW_ + 2 * PW_ - KS + 1; + uint32_t tiles_w = (OW + OUTPUT_BLOCK_SIZE -1)/ OUTPUT_BLOCK_SIZE; + float* patch = transform_mid_ptr; + float* patchT = transform_mid_ptr + PACK_C_SIZE * Alpha * Alpha; + + for (uint32_t ic = 0; ic < IC_; ic += 4) { + uint32_t tile_start_id = tile_id_; + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_start_id + tile_idx; + uint32_t nh = index / tiles_w; + uint32_t nw = index % tiles_w; + + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH_; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW_; + int inner = (ih_start >= 0 && iw_start >= 0 && + ih_start + Alpha <= (int)IH_ && + iw_start + Alpha <= (int)IW_)?1:0; + + + ${InputPrepareF43NCHW44()} + ${InputTransformF43NCHW44()} + } + })"; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("inptr", strs[0]) + .add("transform_input_ptr", strs[1]) + .add("IH", strs[2]) + .add("IW", strs[3]) + .add("IC", strs[4]) + .add("PH", strs[5]) + .add("PW", strs[6]) + .add("tile_id", strs[7]) + .add("nr_tiles_in_loop", strs[8]) + .add("InputTransformF43NCHW44", InputTransformF43NCHW44) + .add("InputPrepareF43NCHW44", InputPrepareF43NCHW44) + .render(input_process); + return ss.str(); +} + +std::string WinogradF63Strategy4x16MK4::DependMatmulSymbol() { + return Arm64::MatmulM4N16MK4Kernel().GetKernelSymbol(NULL); +} + +std::string WinogradF63Strategy4x16MK4::BatchedMatMul( + const std::vector& strs) { + std::string matmul_compute = R"( + for(uint32_t i =0; i< Alpha; i++){ + for(uint32_t j=0; j& strs, TContext* ctx) { + std::string ouput_trans = R"( + float* transform_output_ptr_ = ${transform_output_ptr}; + float* outptr_ = ${outptr}; + const float* bias = ${bias_ptr}; + + uint32_t OH_ = ${OH}; + uint32_t OW_ = ${OW}; + uint32_t OC_ = ${OC}; + uint32_t tile_id_ = ${tile_id}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tiles_w_ = (OW_ + OutputBlockSize -1) / OutputBlockSize; + for (uint32_t oc = 0; oc < OC_; oc += 4) { + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_id_ + tile_idx; + uint32_t nh = index / tiles_w_; + uint32_t nw = index % tiles_w_; + uint32_t oh_start = nh * OutputBlockSize; + uint32_t ow_start = nw * OutputBlockSize; + //! 
AT * m * A + + size_t OCB = OC_ / PACK_C_SIZE; + size_t ocb = oc / PACK_C_SIZE; + +#define cb(m, n) \ + GI_FLOAT32_t v##m##n = GiLoadFloat32( \ + transform_output_ptr_ + \ + (m * Alpha + n) * OCB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + ocb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER_D2(8, 8, cb); +#undef cb + + /** + * A + * + * 1 0 0 0 0 0 + * 1 1 1 1 1 1 + * 1 -1 1 -1 1 -1 + * 1 2 4 8 16 32 + * 1 -2 4 -8 16 -32 + * 1 0.5 0.25 0.125 0.0625 0.03125 + * 1 -0.5 0.25 -0.125 0.0625 -0.03125 + * 0 0 0 0 0 1 + */ + + /* + * v1addv2 = v1##m + v2##m; + * v1subv2 = v1##m - v2##m; + * v3addv4 = v3##m + v4##m; + * v3subv4 = v3##m - v4##m; + * v5addv6 = v5##m + v6##m; + * v5subv6 = v5##m - v6##m; + * t0##m = v0##m + v1addv2 + v3addv4 + v5addv6; + * t1##m = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f; + * t2##m = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f; + * t3##m = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f; + * t4##m = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f; + * t5##m = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + v7##m; + */ + GI_FLOAT32_t v1addv2, v1subv2, v3addv4, v3subv4, v5addv6, v5subv6; +#define cb(m) \ + v1addv2 = GiAddFloat32(v1##m, v2##m); \ + v1subv2 = GiSubtractFloat32(v1##m, v2##m); \ + v3addv4 = GiAddFloat32(v3##m, v4##m); \ + v3subv4 = GiSubtractFloat32(v3##m, v4##m); \ + v5addv6 = GiAddFloat32(v5##m, v6##m); \ + v5subv6 = GiSubtractFloat32(v5##m, v6##m); \ + GI_FLOAT32_t t0##m = GiAddFloat32(GiAddFloat32(GiAddFloat32(v0##m, v1addv2), v3addv4), v5addv6); \ + GI_FLOAT32_t t1##m = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 2.f)), GiMultiplyScalerFloat32(v5subv6, 0.5f)); \ + GI_FLOAT32_t t2##m = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 4.f)), GiMultiplyScalerFloat32(v5addv6, 0.25f)); \ + GI_FLOAT32_t t3##m = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 8.f)), GiMultiplyScalerFloat32(v5subv6, 0.125f)); \ + GI_FLOAT32_t t4##m = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 16.f)), GiMultiplyScalerFloat32(v5addv6, 0.0625f)); \ + GI_FLOAT32_t t5##m = \ + GiAddFloat32(GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 32.f)), GiMultiplyScalerFloat32(v5subv6, 0.03125f)), \ + v7##m); + + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + + /* + * v1addv2 = t##m##1 + t##m##2; + * v1subv2 = t##m##1 - t##m##2; + * v3addv4 = t##m##3 + t##m##4; + * v3subv4 = t##m##3 - t##m##4; + * v5addv6 = t##m##5 + t##m##6; + * v5subv6 = t##m##5 - t##m##6; + * v##m##0 = t##m##0 + v1addv2 + v3addv4 + v5addv6; + * v##m##1 = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f; + * v##m##2 = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f; + * v##m##3 = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f; + * v##m##4 = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f; + * v##m##5 = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + t##m##7; + */ +#define cb(m) \ + v1addv2 = GiAddFloat32(t##m##1, t##m##2); \ + v1subv2 = GiSubtractFloat32(t##m##1, t##m##2); \ + v3addv4 = GiAddFloat32(t##m##3, t##m##4); \ + v3subv4 = GiSubtractFloat32(t##m##3, t##m##4); \ + v5addv6 = GiAddFloat32(t##m##5, t##m##6); \ + v5subv6 = GiSubtractFloat32(t##m##5, t##m##6); \ + v##m##0 = GiAddFloat32(GiAddFloat32(GiAddFloat32(t##m##0, v1addv2), v3addv4), v5addv6); \ + v##m##1 = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 2.f)), GiMultiplyScalerFloat32(v5subv6, 0.5f)); \ + v##m##2 = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 4.f)), GiMultiplyScalerFloat32(v5addv6, 0.25f)); \ + 
v##m##3 = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 8.f)), GiMultiplyScalerFloat32(v5subv6, 0.125f)); \ + v##m##4 = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 16.f)), GiMultiplyScalerFloat32(v5addv6, 0.0625f)); \ + v##m##5 = \ + GiAddFloat32(GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 32.f)), GiMultiplyScalerFloat32(v5subv6, 0.03125f)), \ + t##m##7); + + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + GI_FLOAT32_t vbias; + if (bias) { + vbias = GiLoadFloat32(bias + oc); + +#define cb(m, n) v##m##n = GiAddFloat32(v##m##n, vbias); + UNROLL_CALL_RAW_D2(6, 6, cb); +#undef cb + } +${nonline_gen_init()} +${nonline_gen_func(v00, vbias)};v00=vbias; +${nonline_gen_func(v01, vbias)};v01=vbias; +${nonline_gen_func(v02, vbias)};v02=vbias; +${nonline_gen_func(v03, vbias)};v03=vbias; +${nonline_gen_func(v04, vbias)};v04=vbias; +${nonline_gen_func(v05, vbias)};v05=vbias; + +${nonline_gen_func(v10, vbias)};v10=vbias; +${nonline_gen_func(v11, vbias)};v11=vbias; +${nonline_gen_func(v12, vbias)};v12=vbias; +${nonline_gen_func(v13, vbias)};v13=vbias; +${nonline_gen_func(v14, vbias)};v14=vbias; +${nonline_gen_func(v15, vbias)};v15=vbias; + +${nonline_gen_func(v20, vbias)};v20=vbias; +${nonline_gen_func(v21, vbias)};v21=vbias; +${nonline_gen_func(v22, vbias)};v22=vbias; +${nonline_gen_func(v23, vbias)};v23=vbias; +${nonline_gen_func(v24, vbias)};v24=vbias; +${nonline_gen_func(v25, vbias)};v25=vbias; + +${nonline_gen_func(v30, vbias)};v30=vbias; +${nonline_gen_func(v31, vbias)};v31=vbias; +${nonline_gen_func(v32, vbias)};v32=vbias; +${nonline_gen_func(v33, vbias)};v33=vbias; +${nonline_gen_func(v34, vbias)};v34=vbias; +${nonline_gen_func(v35, vbias)};v35=vbias; + +${nonline_gen_func(v40, vbias)};v40=vbias; +${nonline_gen_func(v41, vbias)};v41=vbias; +${nonline_gen_func(v42, vbias)};v42=vbias; +${nonline_gen_func(v43, vbias)};v43=vbias; +${nonline_gen_func(v44, vbias)};v44=vbias; +${nonline_gen_func(v45, vbias)};v45=vbias; + +${nonline_gen_func(v50, vbias)};v50=vbias; +${nonline_gen_func(v51, vbias)};v51=vbias; +${nonline_gen_func(v52, vbias)};v52=vbias; +${nonline_gen_func(v53, vbias)};v53=vbias; +${nonline_gen_func(v54, vbias)};v54=vbias; +${nonline_gen_func(v55, vbias)};v55=vbias; + + +#define out_save(oho, owo) \ + do { \ + size_t oh = oh_start + oho; \ + size_t ow = ow_start + owo; \ + if (oh < OH && ow < OW) { \ + GiStoreFloat32( \ + outptr_ + oc * OH * OW + oh * OW * PACK_C_SIZE + ow * PACK_C_SIZE, \ + v##oho##owo); \ + } \ + } while (0); + UNROLL_CALL_RAW_D2(6, 6, out_save); + +#undef out_save + +#undef MSUB +#undef MADD + } + })"; + std::string nonline_mode = ctx->haveAttr("nonlineMode") + ? 
ctx->getAttrStr("nonlineMode") + : "IDENTITY"; + auto nonline_gen = create_activation_gener_instrinsic(nonline_mode); + auto nonline_gen_func = [&](std::vector str) -> std::string { + return nonline_gen->GenIntrinsicFloat(str[0], str[1]); + }; + auto nonline_gen_init = [&]() -> std::string { + return nonline_gen->GenIntrinsicInitFloat(); + }; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("nonline_gen_func", nonline_gen_func) + .add("nonline_gen_init", nonline_gen_init) + .add("transform_output_ptr", strs[0]) + .add("outptr", strs[1]) + .add("bias_ptr", strs[2]) + .add("OH", strs[3]) + .add("OW", strs[4]) + .add("OC", strs[5]) + .add("tile_id", strs[6]) + .add("nr_tiles_in_loop", strs[7]) + .render(ouput_trans); + return ss.str(); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h new file mode 100644 index 00000000..dfd83b9b --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h @@ -0,0 +1,36 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include "Common/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "WinogradCommon.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace GeneralIntrinsic { + +class WinogradF63Strategy4x16MK4 : public WinogradStrategyBase { +public: + uint32_t GetKernelSize() override { return 3; } + uint32_t GetOutputBlockSize() override { return 6; } + std::string DependMatmulSymbol() override; + std::string WeightTrans(const std::vector& strs) override; + std::string InputFeatureTrans( + const std::vector& strs) override; + std::string BatchedMatMul(const std::vector& strs) override; + std::string OutputFeatureTrans(const std::vector& strs, + TContext*) override; +}; + +} // namespace GeneralIntrinsic +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen From 5d345c3ddf5878f0bdcdfbbe678dff9840fa4a36 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 12:17:41 +0800 Subject: [PATCH 14/17] feat(compiler): add f63 f43 kernel test and benchmark --- .../Transforms/KernelMaterialization.cpp | 2 +- .../ConvKernel/Winograd/WinogradCommon.cpp | 4 +- .../ConvKernel/Winograd/WinogradCommon.h | 3 +- .../Winograd/WinogradF43Strategy4x16MK4.cpp | 11 ++- .../Winograd/WinogradF43Strategy4x16MK4.h | 1 + .../Winograd/WinogradF63Strategy4x16MK4.cpp | 12 +-- .../Winograd/WinogradF63Strategy4x16MK4.h | 1 + compiler/lib/KernelGen/KernelGen.cpp | 6 +- .../test/kernel/opr/arm/benchmark_conv.cpp | 48 ++++++----- compiler/test/kernel/opr/arm/conv.cpp | 80 ++++++++++--------- 10 files changed, 88 insertions(+), 80 deletions(-) diff --git a/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp b/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp index 574590bb..9ad16439 100644 --- a/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp +++ b/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp @@ -291,7 +291,7 @@ class KernelMaterialization final void populateKernelMaterializationPatterns(RewritePatternSet& patterns) { if (target_arch == 
megcc::KernelGen::ARM64V7) { auto a64_registry = std::make_unique(); - Kernel::addBuiltinTemplates(*a64_registry, megcc::KernelGen::ARM64); + Kernel::addBuiltinTemplates(*a64_registry, megcc::KernelGen::ARM64V7); //! a32_registry and a64_registry shared the same map to avoid //! generating redundant armcommon kernel auto a32_registry = std::make_unique( diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp index dae4bd6a..8adec61f 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp @@ -65,7 +65,7 @@ std::string WinogradFrameNchw44::GenGetWorkSpaceCode( return TinyNN_SUCCESS; })"; ss << StringTemplate::StringTemplateArgs() - .add("tile_per_loop", m_tile_per_loop) + .add("tile_per_loop", strategy->GetTileSize()) .add("KernelSize", strategy->GetKernelSize()) .add("OutputBlockSize", strategy->GetOutputBlockSize()) .render(workspace_temp); @@ -246,7 +246,7 @@ std::string WinogradFrameNchw44::GenKernelBodyCode( writer << StringTemplate::StringTemplateArgs(ctx) .add("KernelSize", strategy->GetKernelSize()) .add("OutputBlockSize", strategy->GetOutputBlockSize()) - .add("nr_tiles_per_loop", m_tile_per_loop) + .add("nr_tiles_per_loop", strategy->GetTileSize()) .add("BiasPtr", bias_ptr) .add_ctx_int("pad_h") .add_ctx_int("pad_w") diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h index 488010d3..00807dc0 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h @@ -20,6 +20,7 @@ class WinogradStrategyBase { public: virtual uint32_t GetKernelSize() = 0; virtual uint32_t GetOutputBlockSize() = 0; + virtual uint32_t GetTileSize() { return 32; }; //! transform the weight to winograd space, input strings are: //! 0: inptr, the start pointer of the convolution weight @@ -63,8 +64,6 @@ class WinogradStrategyBase { }; class WinogradFrameNchw44 { - uint32_t m_tile_per_loop = 32; - public: //! 
gen init code std::string GenInitCode(TContext*, WinogradStrategyBase*); diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp index becbb458..d20335e5 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp @@ -970,15 +970,14 @@ std::string WinogradF43Strategy4x16MK4::OutputFeatureTrans( if (bias) { vbias = GiLoadFloat32(bias + oc); } -# define BIAS_LINE(i, j, k) \ +# define BIAS_LINE(j, k) \ v##j##k = GiAddFloat32(v##j##k, vbias); #define BIAS(m) \ - BIAS_LINE(3, m, 5) \ - BIAS_LINE(2, m, 4) \ - BIAS_LINE(1, m, 3) \ - BIAS_LINE(0, m, 2) - + BIAS_LINE(m, 5) \ + BIAS_LINE(m, 4) \ + BIAS_LINE(m, 3) \ + BIAS_LINE(m, 2) // add_bias if(bias){ diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h index 13dbccec..6df9fb57 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h @@ -20,6 +20,7 @@ class WinogradF43Strategy4x16MK4 : public WinogradStrategyBase { public: uint32_t GetKernelSize() override { return 3; } uint32_t GetOutputBlockSize() override { return 4; } + uint32_t GetTileSize() override { return 68; }; std::string DependMatmulSymbol() override; std::string WeightTrans(const std::vector& strs) override; std::string InputFeatureTrans( diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp index c63bb9f9..7e212a42 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp @@ -132,7 +132,7 @@ std::string WinogradF63Strategy4x16MK4::WeightTrans( std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( const std::vector& strs) { - auto InputPrepareF43NCHW44 = [](std::vector) { + auto InputPrepareF63NCHW44 = [](std::vector) { std::stringstream ss; std::string kernel = R"( size_t IW4 = IW_ * PACK_C_SIZE; @@ -173,7 +173,7 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( )"; return kernel; }; - auto InputTransformF43NCHW44 = [](std::vector) { + auto InputTransformF63NCHW44 = [](std::vector) { std::stringstream ss; std::string kernel = R"( // BT * d * B @@ -395,8 +395,8 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( iw_start + Alpha <= (int)IW_)?1:0; - ${InputPrepareF43NCHW44()} - ${InputTransformF43NCHW44()} + ${InputPrepareF63NCHW44()} + ${InputTransformF63NCHW44()} } })"; @@ -411,8 +411,8 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( .add("PW", strs[6]) .add("tile_id", strs[7]) .add("nr_tiles_in_loop", strs[8]) - .add("InputTransformF43NCHW44", InputTransformF43NCHW44) - .add("InputPrepareF43NCHW44", InputPrepareF43NCHW44) + .add("InputTransformF63NCHW44", InputTransformF63NCHW44) + .add("InputPrepareF63NCHW44", InputPrepareF63NCHW44) .render(input_process); return ss.str(); } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h 
b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h index dfd83b9b..9187bfaa 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h @@ -20,6 +20,7 @@ class WinogradF63Strategy4x16MK4 : public WinogradStrategyBase { public: uint32_t GetKernelSize() override { return 3; } uint32_t GetOutputBlockSize() override { return 6; } + uint32_t GetTileSize() override { return 16; }; std::string DependMatmulSymbol() override; std::string WeightTrans(const std::vector& strs) override; std::string InputFeatureTrans( diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index cd7978b9..221831f0 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -66,8 +66,10 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { } } //! WARNING: the f63 and f43 must exist in GI kernel - a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), - sorted_kern.end()); + if (arch == Arch::ARM64) { + a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), + sorted_kern.end()); + } } else { valid_kern = gi_kerns; } diff --git a/compiler/test/kernel/opr/arm/benchmark_conv.cpp b/compiler/test/kernel/opr/arm/benchmark_conv.cpp index 29c5a63e..560d229c 100644 --- a/compiler/test/kernel/opr/arm/benchmark_conv.cpp +++ b/compiler/test/kernel/opr/arm/benchmark_conv.cpp @@ -124,13 +124,13 @@ TEST(AARCH64, BenchmarkConvNCHWNCHW44) { param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; benchmarker.set_param(param); - benchmarker.execs( - {{1, 3, 224, 224}, {8, 3, 3, 3, 4}, {1, 8, 1, 1, 4}, {}, {}}).print(); + benchmarker + .execs({{1, 3, 224, 224}, {8, 3, 3, 3, 4}, {1, 8, 1, 1, 4}, {}, {}}) + .print(); } TEST(AARCH64, BenchmarkConvF32Winograd) { Benchmarker benchmarker(Arch::ARM64); - benchmarker.set_kernel_symbol(".*_winograd_f23"); ConvBiasForward::Param param; param.pad_h = 1; @@ -140,25 +140,29 @@ TEST(AARCH64, BenchmarkConvF32Winograd) { param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; benchmarker.set_param(param); - benchmarker.set_before_exec_callback( - megdnn::test::AlgoChecker( - "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:24")); - for (size_t Channel : {32, 256}) { - for (size_t HW : {56, 28, 14}) { - auto result = - benchmarker.execs({{1, Channel / 4, HW, HW, 4}, - {Channel / 4, Channel / 4, 3, 3, 4, 4}, - {1, Channel / 4, 1, 1, 4}, - {}, - {}}); - printf("megcc result time = %f, throughput %f Gops, %f mbps\n", - result.megcc_performance.kernel_time_ms, - result.megcc_performance.compute_throughput_gops, - result.megcc_performance.memory_throughput_mbps); - printf("dnn result time = %f, throughput %f Gops, %f mbps\n", - result.dnn_performance.kernel_time_ms, - result.dnn_performance.compute_throughput_gops, - result.dnn_performance.memory_throughput_mbps); + std::vector> algo_pairs = { + {".*_winograd_f23", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:24"}, + {".*_winograd_f43", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:4:68"}, + {".*_winograd_f63", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:6:16"}}; + + for (auto algo : algo_pairs) { + printf("megcc algo: %s VS megdnn algo: %s\n", algo[0].c_str(), + algo[1].c_str()); + for (size_t Channel : {32, 256}) { + for (size_t HW : {56, 28, 14}) { + 
benchmarker.set_kernel_symbol(algo[0]); + benchmarker.set_before_exec_callback( + megdnn::test::AlgoChecker( + algo[1].c_str())); + + auto result = benchmarker.execs( + {{1, Channel / 4, HW, HW, 4}, + {Channel / 4, Channel / 4, 3, 3, 4, 4}, + {1, Channel / 4, 1, 1, 4}, + {}, + {}}); + result.print(); + } } } } diff --git a/compiler/test/kernel/opr/arm/conv.cpp b/compiler/test/kernel/opr/arm/conv.cpp index ad1d4c9e..fad7aefd 100644 --- a/compiler/test/kernel/opr/arm/conv.cpp +++ b/compiler/test/kernel/opr/arm/conv.cpp @@ -343,53 +343,55 @@ TEST(AARCH64, ConvBiasNCHWNCHW44) { TEST(AARCH64, ConvWinogradNCHW44) { Checker checker(Arch::ARM64); - checker.set_kernel_symbol(".*_winograd_f23"); - checker.set_epsilon(1e-3); + checker.set_epsilon(1e-2); ConvBiasForward::Param param; param.stride_h = 1; param.stride_w = 1; param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; param.sparse = ConvBiasForward::Param::Sparse::DENSE; - - for (size_t Channel : {32, 64, 256}) { - for (size_t HW : {28, 14}) { - param.pad_h = 1; - param.pad_w = 1; - checker.set_param(param); - checker.execs({{1, Channel / 4, HW, HW, 4}, - {Channel / 4, Channel / 4, 3, 3, 4, 4}, - {1, Channel / 4, 1, 1, 4}, - {}, - {}}); + for (auto name : + {".*_winograd_f23", "^GI.*_winograd_f43.*", "^GI.*_winograd_f63.*"}) { + checker.set_kernel_symbol(name); + for (size_t Channel : {32, 64, 256}) { + for (size_t HW : {28, 14}) { + param.pad_h = 1; + param.pad_w = 1; + checker.set_param(param); + checker.execs({{1, Channel / 4, HW, HW, 4}, + {Channel / 4, Channel / 4, 3, 3, 4, 4}, + {1, Channel / 4, 1, 1, 4}, + {}, + {}}); + } } + // clang-format off + for(size_t P:{0, 1}) + for(size_t IC : {1, 3, 8}) + for(size_t OC : {1, 4}) + for(size_t IH: {3, 5, 22, 32}) + for(size_t IW : {22, 56}) + for(auto mode : {ConvBiasForward::Param::NonlineMode::IDENTITY, + ConvBiasForward::Param::NonlineMode::RELU, + ConvBiasForward::Param::NonlineMode::H_SWISH}) + // clang-format on + { + param.pad_h = P; + param.pad_w = P; + param.nonlineMode = mode; + checker.set_param(param); + checker.execs({{1, IC, IH, IW, 4}, + {OC, IC, 3, 3, 4, 4}, + {}, + {}, + {}}); + checker.execs({{2, IC, IH, IW, 4}, + {OC, IC, 3, 3, 4, 4}, + {1, OC, 1, 1, 4}, + {}, + {}}); + } } - - // clang-format off - for(size_t P:{0, 1}) - for(size_t IC : {1, 3, 8}) - for(size_t OC : {1, 4}) - for(size_t IH: {3, 5, 22, 32}) - for(size_t IW : {22, 56}) - for(auto mode : {ConvBiasForward::Param::NonlineMode::IDENTITY, - ConvBiasForward::Param::NonlineMode::RELU}) - // clang-format on - { - param.pad_h = P; - param.pad_w = P; - param.nonlineMode = mode; - checker.set_param(param); - checker.execs({{1, IC, IH, IW, 4}, - {OC, IC, 3, 3, 4, 4}, - {}, - {}, - {}}); - checker.execs({{2, IC, IH, IW, 4}, - {OC, IC, 3, 3, 4, 4}, - {1, OC, 1, 1, 4}, - {}, - {}}); - } } TEST(AARCH64, ConvBiasIm2col) { From 8b66bd4aab65700da6caff9860c147377cc2085a Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 19:05:26 +0800 Subject: [PATCH 15/17] feat(compiler): add gi max and min kernel --- .../GeneralIntrinsic/Elemwise/Elemwise.cpp | 3 +- .../ElemwiseHelper/BinaryHelper.cpp | 70 +++++++++++++++++++ .../ElemwiseHelper/ElemwiseHelper.cpp | 4 ++ .../ElemwiseHelper/ElemwiseHelper.h | 2 + .../generalIntrinsic/benchmark_elemwise.cpp | 7 +- .../opr/generalIntrinsic/benchmark_reduce.cpp | 3 +- .../test/kernel/opr/generalIntrinsic/cv.cpp | 5 +- .../opr/generalIntrinsic/elementwise.cpp | 2 +- runtime/include/lite-c/common_enum_c.h | 2 +- 
runtime/src/lite/network.c | 10 +-- 10 files changed, 94 insertions(+), 14 deletions(-) diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp index 5ffdd2c2..4f1390c1 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp @@ -28,7 +28,8 @@ bool ElemwiseKernel::IsAvailable(TContext* ctx) const { bool mode_ok = mode == "RELU" || mode == "EXP" || mode == "SIGMOID" || mode == "H_SWISH" || mode == "ADD" || mode == "SUB" || mode == "MUL" || mode == "TRUE_DIV" || - mode == "FUSE_ADD_RELU" || mode == "FUSE_MUL_ADD3"; + mode == "FUSE_ADD_RELU" || mode == "FUSE_MUL_ADD3" || + mode == "MAX" || mode == "MIN"; if (mode == "FUSE_MUL_ADD3") { auto bcast_type = ElemwiseGenTernary::GetBcastType( ctx->getAttrOprand("operand:0"), diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp index 891474b6..547a1340 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp @@ -573,6 +573,76 @@ std::string ElemwiseGenBinaryFuseAddRelu::GenKernelNaiveUnroll( return writer.str(); } +std::string ElemwiseGenBinaryMax::GenKernelSimdInit( + std::vector) const { + return ""; +} + +std::string ElemwiseGenBinaryMax::GenKernelSimdUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n GiStoreFloat32((" << dst << ") + 4 * " << i + << ", GiMaximumFloat32(" << strs[str_id] << "," << strs[str_id + 1] + << "));"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMax::GenKernelNaiveUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n(" << dst << ")[" << i << "] = (" << strs[str_id] << ")[" + << i << "] > (" << strs[str_id + 1] << ")[" << i << "] ?(" << strs[str_id] << ")[" << i <<"]:(" << strs[str_id + 1] << ")[" + << i << "] ;"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMin::GenKernelSimdInit( + std::vector) const { + return ""; +} + +std::string ElemwiseGenBinaryMin::GenKernelSimdUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n GiStoreFloat32((" << dst << ") + 4 * " << i + << ", GiMinimumFloat32(" << strs[str_id] << "," << strs[str_id + 1] + << "));"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMin::GenKernelNaiveUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n(" << dst << ")[" << i << "] = (" << strs[str_id] << ")[" + << i << "] < (" << strs[str_id + 1] << ")[" << i << "] ?(" << strs[str_id] << ")[" << i <<"]:(" << strs[str_id + 1] << ")[" + << i << "] ;"; + str_id += 2; + } + return writer.str(); +} + std::string ElemwiseGenBinary::GenCodeBody( std::vector strs) const { auto input0 = strs[0]; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp 
b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp index 27dff585..32216d16 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp @@ -44,6 +44,10 @@ std::shared_ptr ElemwiseHelperFunc::CreateGenHelper( operands[1]); CASE_DISPATCH_ARG("FUSE_ADD_RELU", ElemwiseGenBinaryFuseAddRelu, operands[0], operands[1]); + CASE_DISPATCH_ARG("MAX", ElemwiseGenBinaryMax, + operands[0], operands[1]); + CASE_DISPATCH_ARG("MIN", ElemwiseGenBinaryMin, + operands[0], operands[1]); CC_ABORT << "Binary mode: " << mode << " not Implement now\n"; } else if (nr_operands == 4) { CASE_DISPATCH_ARG("FUSE_MUL_ADD3", ElemwiseGenTernaryFuseMulAdd3, diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h index ee65ad94..8fe654d8 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h @@ -157,6 +157,8 @@ DEFINE_BINARY_OP(ElemwiseGenBinarySub) DEFINE_BINARY_OP(ElemwiseGenBinaryMul) DEFINE_BINARY_OP(ElemwiseGenBinaryTrueDiv) DEFINE_BINARY_OP(ElemwiseGenBinaryFuseAddRelu) +DEFINE_BINARY_OP(ElemwiseGenBinaryMax) +DEFINE_BINARY_OP(ElemwiseGenBinaryMin) #undef DEFINE_BINARY_OP //! TODO: add more binary elemwise here /************************************Ternary***********************************/ diff --git a/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp b/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp index db2b6c16..2f57bf1e 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp @@ -8,6 +8,7 @@ */ #include "test/kernel/common/benchmark.h" +#include "megbrain/reflection.h" using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; @@ -18,7 +19,7 @@ TEST(GI, ElementwiseUnique_BMK) { benchmarker.set_kernel_symbol("GI_kernel_elementwise.+"); ElemwiseForward::Param param; for (auto mode : {MODE::RELU, MODE::SIGMOID, MODE::EXP, MODE::H_SWISH}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); benchmarker.execs({{10000}, {}}).print(); @@ -36,7 +37,7 @@ TEST(GI, ElementwiseBinary_BMK) { ElemwiseForward::Param param; for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU, MODE::TRUE_DIV}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); benchmarker.execs({{10000}, {10000}, {}}).print(); @@ -53,7 +54,7 @@ TEST(GI, ElementwiseTernary_BMK) { ElemwiseForward::Param param; benchmarker.set_kernel_symbol("GI_kernel_elementwise.+"); for (auto mode : {MODE::FUSE_MUL_ADD3}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); //! 
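The MAX and MIN helpers added above are code generators: GenKernelSimdUnroll emits GiMaximumFloat32/GiMinimumFloat32 stores, while GenKernelNaiveUnroll emits a plain compare-and-select per element. As a reading aid, here is a small sketch, not MegCC code, that mirrors what the naive MAX path writes out; the function and argument names are invented.

```python
# Hypothetical mirror of GenKernelNaiveUnroll for MAX: given an unroll factor, a
# destination name and two source names, emit one C statement per element.
def gen_naive_max_unroll(unroll, dst, src0, src1):
    lines = []
    for i in range(unroll):
        lines.append(
            "({d})[{i}] = ({a})[{i}] > ({b})[{i}] ? ({a})[{i}] : ({b})[{i}];".format(
                d=dst, a=src0, b=src1, i=i))
    return "\n".join(lines)

print(gen_naive_max_unroll(4, "dst", "val0", "val1"))
```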
vec_vec diff --git a/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp b/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp index 760d18a1..28f8b2ea 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp @@ -8,6 +8,7 @@ */ #include "test/kernel/common/benchmark.h" +#include "megbrain/reflection.h" using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; @@ -17,7 +18,7 @@ TEST(GI, BENCHMARK_Reduce) { Benchmarker benchmarker(Arch::BAREMETAL); benchmarker.set_kernel_symbol("GI_kernel_reduce.*"); for (auto mode : {Mode::MIN, Mode::MAX, Mode::SUM, Mode::SUM_SQR, Mode::MEAN, Mode::PRODUCT}){ - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); for (auto src : {TensorShape{200, 300}, TensorShape{3, 200, 300}, TensorShape{1, 3, 200, 300}}){ for (size_t axis = 0; axis < 4; ++axis) { if (axis < src.ndim) { diff --git a/compiler/test/kernel/opr/generalIntrinsic/cv.cpp b/compiler/test/kernel/opr/generalIntrinsic/cv.cpp index 935cf9e7..7e354ecf 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/cv.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/cv.cpp @@ -9,6 +9,7 @@ #include "test/kernel/common/checker.h" #include "test/kernel/common/cv_opr.h" +#include "megbrain/reflection.h" using namespace megcc::test; using namespace megdnn; using namespace megcc::KernelGen; @@ -134,13 +135,13 @@ TEST(GI, CVcvtcolor) { checker.set_dtype(1, dtype::Uint8()); for (auto mode : {CvtMode::RGB2YUV, CvtMode::RGB2BGR}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; checker.set_param(param); checker.exec({{1, 17, 31, 3}, {}}); } for (auto mode : {CvtMode::YUV2BGR_NV21}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; checker.set_param(param); checker.exec({{1, 3, 18, 1}, {}}); diff --git a/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp b/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp index 5bc3f59d..c9337640 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp @@ -33,7 +33,7 @@ TEST(GI, ElementwiseBinary) { checker.set_kernel_symbol("GI_kernel_elementwise.+"); ElemwiseForward::Param param; - for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU}) { + for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU, MODE::MAX, MODE::MIN}) { param.mode = mode; checker.set_param(param); checker.execs({{1}, {1}, {}}); diff --git a/runtime/include/lite-c/common_enum_c.h b/runtime/include/lite-c/common_enum_c.h index 361b52d2..9178aee3 100644 --- a/runtime/include/lite-c/common_enum_c.h +++ b/runtime/include/lite-c/common_enum_c.h @@ -17,7 +17,7 @@ */ typedef enum LiteLogLevel { DEBUG = 0, /*!< The lowest level and most verbose */ - INFO = 1, /*!< The lowest level and most verbose */ + INFO = 1, /*!< print infos, warns and errors message */ WARN = 2, /*!< Print only warning and errors */ ERROR = 3, /*!< Print only errors */ } LiteLogLevel; diff --git a/runtime/src/lite/network.c b/runtime/src/lite/network.c index 35b6f6e0..b4523a49 100644 --- a/runtime/src/lite/network.c +++ b/runtime/src/lite/network.c @@ -200,7 +200,7 @@ int LITE_forward(const LiteNetwork network) { Layout in_layout = opr->inputs[0]->layout; Layout out_layout = opr->outputs[0]->layout; - 
LOG_ERROR( + LOG_INFO( " instruction: %s \nuse %fms \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)] \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)]\n", @@ -216,9 +216,9 @@ int LITE_forward(const LiteNetwork network) { out_layout.stride[4]); } else { - LOG_ERROR("execute used time %f ms of instruction %s.\n", - inst->time_ms / inst->time_count, - instruction_type_name(inst->tag)); + LOG_INFO("execute used time %f ms of instruction %s.\n", + inst->time_ms / inst->time_count, + instruction_type_name(inst->tag)); } #endif } @@ -361,7 +361,7 @@ int LITE_destroy_network(LiteNetwork network) { } FREE(cb_model->device_models); - //! free combine model struce + //! free combine model struct FREE(cb_model); return TinyNN_SUCCESS; } From b43b6e1f5483c982d3ac73e20a311c3b3955caa2 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 19:17:43 +0800 Subject: [PATCH 16/17] feat(benchmark): add megcc benchmark --- README.md | 1 + benchmark/.gitignore | 4 + benchmark/CMakeLists.txt | 97 +++++++++++++++++ benchmark/README.md | 80 ++++++++++++++ benchmark/clean.sh | 3 + benchmark/main.cpp | 64 +++++++++++ benchmark/model/model_arm.json | 47 ++++++++ benchmark/model/model_riscv.json | 40 +++++++ benchmark/model/model_x86.json | 40 +++++++ benchmark/python/example.py | 157 +++++++++++++++++++++++++++ benchmark/python/format.sh | 22 ++++ benchmark/python/src/benchmark.py | 102 +++++++++++++++++ benchmark/python/src/models.py | 163 ++++++++++++++++++++++++++++ benchmark/src/CCbenchmark.cpp | 97 +++++++++++++++++ benchmark/src/CCbenchmark.h | 33 ++++++ benchmark/src/MGEbenchmark.cpp | 100 +++++++++++++++++ benchmark/src/MGEbenchmark.h | 38 +++++++ benchmark/src/benchmark.h | 25 +++++ benchmark/src/build_config.h.in | 10 ++ benchmark/tools/cc_analysis.py | 89 +++++++++++++++ benchmark/tools/inference_visual.py | 91 ++++++++++++++++ 21 files changed, 1303 insertions(+) create mode 100644 benchmark/.gitignore create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/README.md create mode 100755 benchmark/clean.sh create mode 100644 benchmark/main.cpp create mode 100644 benchmark/model/model_arm.json create mode 100644 benchmark/model/model_riscv.json create mode 100644 benchmark/model/model_x86.json create mode 100644 benchmark/python/example.py create mode 100755 benchmark/python/format.sh create mode 100644 benchmark/python/src/benchmark.py create mode 100644 benchmark/python/src/models.py create mode 100644 benchmark/src/CCbenchmark.cpp create mode 100644 benchmark/src/CCbenchmark.h create mode 100644 benchmark/src/MGEbenchmark.cpp create mode 100644 benchmark/src/MGEbenchmark.h create mode 100644 benchmark/src/benchmark.h create mode 100644 benchmark/src/build_config.h.in create mode 100644 benchmark/tools/cc_analysis.py create mode 100644 benchmark/tools/inference_visual.py diff --git a/README.md b/README.md index 5e055dd6..ce22036f 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ MegCC supports Arm64/ArmV7/X86/BareMatal backend. 
You may want to check [support * Download release compiler suit from [release page](https://github.com/MegEngine/MegCC/releases) * Compiler from source, please fellow the [compiler doc](compiler/README.md) * Build the release tar, please fellow the [release doc](doc/how-to-release.md) +* Get benchmark of different model please reference [benchmark](benchmark/README.md) #### How to use MegCC diff --git a/benchmark/.gitignore b/benchmark/.gitignore new file mode 100644 index 00000000..17e80af5 --- /dev/null +++ b/benchmark/.gitignore @@ -0,0 +1,4 @@ +model/benchmark_* +model/generated_models +config +output \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 00000000..797bb5b6 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,97 @@ +cmake_minimum_required(VERSION 3.15.2) +set(CMAKE_EXPORT_COMPILE_COMMANDS + ON + CACHE INTERNAL "") + +project(Benchmarker) + +option(ENABLE_MEGENGINE_FRAMEWORK "build benchmark for megengine" OFF) +configure_file(src/build_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/genfiles/build_config.h) +# set megcc lib +if(NOT DEFINED RUNTIME_KERNEL_DIR) + message(FATAL_ERROR "build MegCC runtime kernel dir RUNTIME_KERNEL_DIR is empty, use -DRUNTIME_KERNEL_DIR=your_model_kernel_dir to set") +else() + message(STATUS "build MegCC runtime with kernel dir ${RUNTIME_KERNEL_DIR}") +endif() + +add_library(TinyNN STATIC IMPORTED) +set_target_properties( + TinyNN PROPERTIES IMPORTED_LOCATION + "${RUNTIME_KERNEL_DIR}/runtime/install/lib/libTinyNN.a") +if(ENABLE_MEGENGINE_FRAMEWORK) + message(STATUS "build benchmark with megengine ${ENABLE_MEGENGINE_FRAMEWORK}") + option(X86_BACKEND "Build bechmarker with X86 megengine lib" ON) + # set megengine lib + if(NOT DEFINED MEGENGINE_INSTALL_DIR) + message(FATAL_ERROR "MEGENGINE_INSTALL_DIR is empty use -DMEGENGINE_INSTALL_DIR=your_megengine_install_dir to set") + else() + message(STATUS "MEGENGINE_INSTALL_DIR is ${MEGENGINE_INSTALL_DIR}") + endif() + add_library(mgb_imported INTERFACE IMPORTED) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/aarch64/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/armv7/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/riscv64/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + else() + if(X86_BACKEND) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") + set(MKL_LIBS + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_core.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_sequential.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_intel_ilp64.a + ) + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/x86_64/liblite_static_all_in_one.a) + target_compile_definitions(mgb_imported INTERFACE -DMKL_ILP64) + # WARNING: i386 is not test locally + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") + set(MKL_LIBS + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/i386/lib/libmkl_core.a + 
${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_sequential.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_intel_32.a + ) + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/i386/liblite_static_all_in_one.a) + endif() + set(MKL_DNN_LIBS + ${MEGENGINE_INSTALL_DIR}/lib/libdnnl.a + ${MEGENGINE_INSTALL_DIR}/lib/libmkldnn.a + ) + + if(UNIX AND NOT APPLE) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS} ${MKL_DNN_LIBS} -Wl,--start-group -ldl ${MKL_LIBS} -Wl,--end-group) + else() + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS} ${MKL_DNN_LIBS} ${MKL_LIBS}) + endif() + else() + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lib/libmegengine.a ${MEGENGINE_INSTALL_DIR}/lib/libflatbuffers.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + endif() + + endif() + + target_include_directories(mgb_imported INTERFACE ${MEGENGINE_INSTALL_DIR}/include) +endif() +# benchmarker config +file(GLOB_RECURSE SOURCES main.cpp src/*.cpp src/*.h) +add_executable(benchmarker ${SOURCES}) +target_include_directories( + benchmarker PUBLIC $ $) +if(ENABLE_MEGENGINE_FRAMEWORK) + target_link_libraries(benchmarker -pthread TinyNN mgb_imported) +else() +target_link_libraries(benchmarker -pthread TinyNN) +endif() +message(STATUS "${CMAKE_TOOLCHAIN_FILE}") +if(CMAKE_TOOLCHAIN_FILE) + if(ANDROID) + target_link_libraries(benchmarker log) + endif() +endif() + +install(TARGETS benchmarker LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..192569de --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,80 @@ +# How to use megcc benchmark + +## introduction +megcc benchmark is a easy tool to get the benchmark result of different model in megcc +the file struction is shown bellow: +``` +├── clean.sh +├── CMakeLists.txt +├── main.cpp +├── model +│ ├── model_arm.json +│ ├── model_riscv.json +│ ├── model_x86.json +│ └── request.txt +├── python +│ ├── example.py +│ ├── format.sh +│ └── src +│ ├── benchmark.py +│ └── models.py +├── README.md +├── src +│ ├── benchmark.h +│ ├── build_config.h.in +│ ├── CCbenchmark.cpp +│ ├── CCbenchmark.h +│ ├── MGEbenchmark.cpp +│ └── MGEbenchmark.h +└── tools + ├── cc_analysis.py + └── inference_visual.py +``` + +in src, it is a c++ application to run benchmark result on different platform. +in python, the model convertion, other related preparing work and the benchmarker example is given +the tools contains some usable scripts to analysis benchmark results +## supported model +mobilenetv2, resnet18, efficientnetb0 shufflenetv2 vgg16 +## request +```bash +mgeconvert > v.1.0.2 +onnx==1.11.0 +torch==1.10.0 +# or +git clone https://github.com/MegEngine/mgeconvert.git +cd mgeconvert +git checkout master +python3 -m pip install . --user --install-option="--targets=onnx" + +``` +the mgeconvert can be install by following command: +```bash +git clone https://github.com/MegEngine/mgeconvert.git +cd mgeconvert +git checkout master +python3 -m pip install . 
--user --install-option="--targets=onnx" + +``` +## get model and run benchmark example +``` bash +cd megcc/benchmark +export MEGCC_MGB_TO_TINYNN_PATH= +python3 python/example.py +``` +if you want to run in other platform, please reference the example add your new run_platform_xxx function in BenchmarkRunner, +the example given a ssh remote device test template + +## analysis megcc log + +the `output` directory is generated by `example.py` + +### visualize the inference result of different model +```bash +python3 benchmark/tools/inference_visual.py benchmark/output -o figure_dir +``` + +### visualize the profile result of different kernel in different model +```bash +python3 benchmark/tools/cc_analysis.py benchmark/output -o figure_dir +``` \ No newline at end of file diff --git a/benchmark/clean.sh b/benchmark/clean.sh new file mode 100755 index 00000000..61110f8f --- /dev/null +++ b/benchmark/clean.sh @@ -0,0 +1,3 @@ +# /bin/bash -e +set -x +rm -rf ./build* ./output ./config ./model/benchmark* ./model/generate* diff --git a/benchmark/main.cpp b/benchmark/main.cpp new file mode 100644 index 00000000..e124fa2f --- /dev/null +++ b/benchmark/main.cpp @@ -0,0 +1,64 @@ +/** + * \file benchmark/main.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include +#include +#include +#include "src/CCbenchmark.h" +#include "src/MGEbenchmark.h" + +using namespace megcc; +using namespace Benchmark; +int main(int argc, char** argv) { + if (argc < 2 && argc > 4) { + fprintf(stderr, "cmdline error, please run with:\n"); + fprintf(stderr, "benchmarker [options] ... \n"); + fprintf(stderr, + "tips:\n\t you can use --profile and --mge to profile model " + "and enable megengine framework (\"megcc\" is default)\n"); + return -1; + } + int log_level = 3; + std::string framework = "megcc"; + std::string model_path = argv[1]; + int idx = 2; + while (idx < argc) { + std::string args = argv[idx]; + if (args == "--profile") { + log_level = 0; + } else if (args == "--mge") { + framework = "mge"; + } else { + fprintf(stderr, "invalid option: %s\n", argv[idx]); + } + ++idx; + } + std::vector> benchmarkers; + if (framework == "megcc") { + benchmarkers.push_back( + std::make_shared(model_path, log_level)); + } +#if ENABLE_MEGENGINE_FRAMEWORK + else if (framework == "mge") { + benchmarkers.push_back( + std::make_shared(model_path, log_level)); + } +#endif + else { + fprintf(stderr, + "unsupport framework: %s, megcc, mge(export " + "ENABLE_MEGENGINE_FRAMEWORK=ON) is supported\n", + framework.c_str()); + } + + for (size_t i = 0; i < benchmarkers.size(); ++i) { + benchmarkers[i]->load_model(); + benchmarkers[i]->profile(); + } + + return 0; +} \ No newline at end of file diff --git a/benchmark/model/model_arm.json b/benchmark/model/model_arm.json new file mode 100644 index 00000000..34103980 --- /dev/null +++ b/benchmark/model/model_arm.json @@ -0,0 +1,47 @@ +{ + "dump_dir": "./benchmark_kernel_arm/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": 
"efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)", + "enable_nchw44": true + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + } + ] +} \ No newline at end of file diff --git a/benchmark/model/model_riscv.json b/benchmark/model/model_riscv.json new file mode 100644 index 00000000..6d00992f --- /dev/null +++ b/benchmark/model/model_riscv.json @@ -0,0 +1,40 @@ +{ + "dump_dir": "./benchmark_kernel_riscv/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)" + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)" + } + ] +} \ No newline at end of file diff --git a/benchmark/model/model_x86.json b/benchmark/model/model_x86.json new file mode 100644 index 00000000..a93077e3 --- /dev/null +++ b/benchmark/model/model_x86.json @@ -0,0 +1,40 @@ +{ + "dump_dir": "./benchmark_kernel_x86/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)" + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)" + } + ] +} \ No newline at end of file diff --git a/benchmark/python/example.py b/benchmark/python/example.py new file mode 100644 index 00000000..bed2fcd6 --- /dev/null +++ b/benchmark/python/example.py @@ -0,0 +1,157 @@ +#! 
/usr/bin/env python3 +import os + +import numpy as np +import yaml +from src.benchmark import BenchMarkRunnerBase, ValidModel, ValidOutputDir +from src.models import * + +all_models = AllModel() +arch_str=["x86", "arm64", "armv7"] +arch_str=["x86"] +framework_str = ["megcc"] +models_dir = "{}/benchmark/model/generated_models".format(megcc_path) +bechmarkers = {} +kernel_build_dirs = {} +# set as your own ssh device host and workdir(make sure install sshd and rsync on your device) +ssh_device_info=[ +{"name":"","host": "", "workdir": ""} +] + + +class BenchmarkRunner(BenchMarkRunnerBase): + remote_config = None + remote_config_file = "{}/benchmark/config/cofnig.yaml".format(megcc_path) + + def __init__(self, benchmark_build_dir="", benchmark_arch="x86"): + super().__init__(benchmark_build_dir, benchmark_arch) + + def run_ssh_device(self, ssh_name, ssh_host, ssh_workdir): + if not os.path.exists(self.output_dir.local_path) or os.path.isfile( + self.output_dir.local_path + ): + os.makedirs(self.output_dir.local_path) + logfile = open( + "{}/{}-{}-{}-{}-log-{}.txt".format( + self.output_dir.local_path, + self.benchmark_framework, + self.benchmark_arch, + self.model.name, + self.log_level, + ssh_name, + ), + "w", + ) + run_options = "" + if self.log_level == 0: + run_options += " --profile" + if self.benchmark_framework == "mge": + run_options += " --mge" + config_name = "benchmark-{}-{}-{}".format( + self.benchmark_framework, self.benchmark_arch, self.model.name + ) + for file_ in [self.benchmark_exec_func, self.model.path]: + cmd = "rsync -aP -zz {} {}:{}/".format( + file_, ssh_host, ssh_workdir + ) + subprocess.check_call(cmd, shell=True) + cmd = ' ssh -t {} "unset LD_PRELOAD && cd {} && LD_LIBRARY_PATH=./ && chmod +x ./benchmarker && ./benchmarker {}.{} {}" '.format( + ssh_host, ssh_workdir, self.model.name, self.model.exten, run_options + ) + subprocess.check_call(cmd, shell=True, stdout=logfile, stderr=subprocess.STDOUT) + + +def build_model_and_megcc_lib(): + #! dump all models from onnx to megengine + all_models.make(models_dir) + #! prepare megcc compiler + prepare_megcc() + #! build megcc model lib + for arch_desc in arch_str: + build_megcc_lib(arch_desc, model_config_json="", kernel_build_dir="") + + +#! 
build benchmarker +def gen_benchmarker(): + for arch_desc in arch_str: + benchmark_build_dir = "{}/benchmark/build/{}".format(megcc_path, arch_desc) + kernel_build_dirs[arch_desc] = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + benchmarker = BenchmarkRunner( + benchmark_build_dir=benchmark_build_dir, benchmark_arch=arch_desc + ) + bechmarkers[arch_desc] = benchmarker + + +def build_benchmarker(x86_target="fallback"): + for arch_desc in arch_str: + benchmark_build_dir = "{}/benchmark/build/{}".format(megcc_path, arch_desc) + if arch_desc == "x86": + build_option = "-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PWD/install -DRUNTIME_KERNEL_DIR={}".format( + kernel_build_dirs[arch_desc] + ) + else: + if arch_desc == "arm64": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="$NDK_ROOT/build/cmake/android.toolchain.cmake" -DANDROID_NDK="$NDK_ROOT" -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=21' + elif arch_desc == "armv7": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="$NDK_ROOT/build/cmake/android.toolchain.cmake" -DANDROID_NDK="$NDK_ROOT" -DANDROID_ABI=armeabi-v7a -DANDROID_NATIVE_API_LEVEL=21' + elif arch_desc == "riscv": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="{}/runtime/toolchains/riscv64-linux-gnu.toolchain.cmake"'.format( + megcc_path + ) + build_option = "{} -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PWD/install -DRUNTIME_KERNEL_DIR={}".format( + TOOLCHAIN_OPTION, kernel_build_dirs[arch_desc] + ) + + bechmarkers[arch_desc].build(build_options=build_option) + + +# !set test config and run +def set_config_and_run(): + for arch_desc in arch_str: + kernel_build_dir = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + for model in all_models.models: + for framework in framework_str: + for log_level in [False, True]: + if framework == "megcc": + exten = "tiny" + model_path = "{}/{}.tiny".format(kernel_build_dir, model.name) + elif framework == "mge": + model_path = "{}/{}.mge".format(models_dir, model.name) + exten = "mge" + model_ = ValidModel(model_path, model.name, exten) + output_dir_ = ValidOutputDir( + "{}/benchmark/output".format(megcc_path), "output" + ) + bechmarkers[arch_desc].set_config( + profile_kernel=log_level, + benchmark_framework=framework, + model=model_, + output_dir=output_dir_, + ) + if arch_desc == "x86": + bechmarkers[arch_desc].run_local() + elif arch_desc != "riscv": + # run for different device may avoid the effection of device heat radiation + for ssh_device in ssh_device_info: + ssh_name=ssh_device["name"] + ssh_host=ssh_device["host"] + ssh_workdir=ssh_device["workdir"] + bechmarkers[arch_desc].run_ssh_device(ssh_name, ssh_host, ssh_workdir) + else: + print("unsupported arch type in megcc") + return + + +def main(): + build_model_and_megcc_lib() + gen_benchmarker() + build_benchmarker() + set_config_and_run() + + +if __name__ == "__main__": + main() diff --git a/benchmark/python/format.sh b/benchmark/python/format.sh new file mode 100755 index 00000000..49e1b340 --- /dev/null +++ b/benchmark/python/format.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -e +cd $(dirname $0) + +ISORT_ARG="" +BLACK_ARG="" + +while getopts 'd' OPT; do + case $OPT in + d) + ISORT_ARG="--diff --check-only" + BLACK_ARG="--diff --check" + ;; + ?) + echo "Usage: `basename $0` [-d]" + esac +done + +isort $ISORT_ARG -j $(nproc) . +black $BLACK_ARG --target-version=py35 . 
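The benchmark README suggests adding a run_platform_xxx method alongside run_ssh_device for devices that are not reachable over ssh/rsync. Below is a hedged sketch of an adb-based variant; the benchmarker command line (model path plus optional --profile/--mge) follows main.cpp in this patch, while the function name, serial handling and on-device working directory are assumptions.

```python
# Standalone sketch, assuming an Android device reachable via adb.
import subprocess

def run_adb_device(serial, benchmarker_path, model_path,
                   workdir="/data/local/tmp/megcc_bench",
                   profile=False, use_mge=False):
    opts = (" --profile" if profile else "") + (" --mge" if use_mge else "")
    subprocess.check_call(["adb", "-s", serial, "shell", "mkdir", "-p", workdir])
    for f in (benchmarker_path, model_path):
        subprocess.check_call(["adb", "-s", serial, "push", f, workdir + "/"])
    model_name = model_path.rsplit("/", 1)[-1]
    cmd = "cd {} && chmod +x ./benchmarker && ./benchmarker ./{}{}".format(
        workdir, model_name, opts)
    subprocess.check_call(["adb", "-s", serial, "shell", cmd])
```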
+isort $ISORT_ARG -j $(nproc) ../tools +black $BLACK_ARG --target-version=py35 ../tools diff --git a/benchmark/python/src/benchmark.py b/benchmark/python/src/benchmark.py new file mode 100644 index 00000000..66b52a97 --- /dev/null +++ b/benchmark/python/src/benchmark.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +import os +import subprocess +from pathlib import Path + +import numpy as np + +megcc_path = Path( + os.path.split(os.path.realpath(__file__))[0] +).parent.parent.parent.absolute() + + +class ValidModel: + path = "" + name = "" + exten = "" + + def __init__(self, model_path="", model_name="", exten=""): + self.path = model_path + self.name = model_name + self.exten = exten + + +class ValidOutputDir: + local_path = "" + remote_path = "" + tag = "" + + def __init__(self, local_path="", remote_path=""): + self.local_path = local_path + self.remote_path = remote_path + + +class BenchMarkRunnerBase: + model = None + benchmark_build_dir = "" + benchmark_arch = "" + benchmark_framework = "" + output_dir = None + log_level = -1 + benchmark_exec_func = "" + + def __init__(self, benchmark_build_dir="", benchmark_arch="x86"): + if benchmark_build_dir == "": + benchmark_build_dir = "{}/benchmark/build_{}".format( + megcc_path, benchmark_arch + ) + self.benchmark_build_dir = benchmark_build_dir + self.benchmark_arch = benchmark_arch + + def build(self, x86_target="fallback", build_options=""): + # build prepare + if not os.path.exists(self.benchmark_build_dir) or os.path.isfile( + self.benchmark_build_dir + ): + os.makedirs(self.benchmark_build_dir) + # build megengine lib and set cmake build options + cmd = "cd {} && cmake {}/benchmark {} -G Ninja && ninja install/strip".format( + self.benchmark_build_dir, megcc_path, build_options + ) + subprocess.check_call(cmd, shell=True) + + def set_config( + self, + profile_kernel=False, + benchmark_framework="megcc", + model=None, + output_dir=None, + ): + if profile_kernel: + self.log_level = 0 + else: + self.log_level = 3 + self.benchmark_framework = benchmark_framework + self.output_dir = output_dir + self.model = model + self.benchmark_exec_func = "{}/install/bin/benchmarker".format( + self.benchmark_build_dir + ) + + def run_local(self): + if not os.path.exists(self.output_dir.local_path) or os.path.isfile( + self.output_dir.local_path + ): + os.makedirs(self.output_dir.local_path) + logfile = open( + "{}/{}-{}-{}-{}-log-local.txt".format( + self.output_dir.local_path, + self.benchmark_framework, + self.benchmark_arch, + self.model.name, + self.log_level, + ), + "w", + ) + run_options = "" + if self.log_level == 0: + run_options += " --profile" + if self.benchmark_framework == "mge": + run_options += " --mge" + cmd = "{} {} {}".format(self.benchmark_exec_func, self.model.path, run_options) + subprocess.check_call(cmd, shell=True, stdout=logfile, stderr=subprocess.STDOUT) diff --git a/benchmark/python/src/models.py b/benchmark/python/src/models.py new file mode 100644 index 00000000..6547ad01 --- /dev/null +++ b/benchmark/python/src/models.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +import logging +import os +import subprocess +from pathlib import Path + +import numpy as np +import torch.onnx +import torchvision +from mgeconvert.converters.onnx_to_mge import * + +megcc_path = Path( + os.path.split(os.path.realpath(__file__))[0] +).parent.parent.parent.absolute() +default_gen_path = "{}/benchmark/model/generated_models".format(megcc_path) + + +class Model: + name = None + torch_model = None + input_shape = [] + + def __init__(self, name, 
torch_model, input_shape): + self.name = name + self.torch_model = torch_model + self.input_shape = input_shape + + +class AllModel: + models = [] + # model src from onnx + def __init__(self): + # pytorch model + self.models.append( + Model( + "mobilenetv2", + torchvision.models.mobilenetv2.mobilenet_v2(), + [1, 3, 224, 224], + ) + ) + self.models.append( + Model( + "efficientnetb0", + torchvision.models.efficientnet.efficientnet_b0(), + [1, 3, 256, 256], + ) + ) + self.models.append( + Model( + "shufflenetv2", + torchvision.models.shufflenetv2.shufflenet_v2_x0_5(), + [1, 3, 224, 224], + ) + ) + self.models.append( + Model("resnet18", torchvision.models.resnet.resnet18(), [1, 3, 224, 224]) + ) + self.models.append( + Model("resnet50", torchvision.models.resnet.resnet50(), [1, 3, 224, 224]) + ) + self.models.append( + Model("vgg11", torchvision.models.vgg.vgg11(), [1, 3, 224, 224]) + ) + self.models.append( + Model("vgg16", torchvision.models.vgg.vgg16(), [1, 3, 224, 224]) + ) + + def get_all_onnx_models(self, output_dir=default_gen_path): + if not os.path.exists(output_dir) or os.path.isfile(output_dir): + os.makedirs(output_dir) + for model in self.models: + output = "{}/{}.onnx".format(output_dir, model.name) + logging.debug("get model file from torchvision to: {}".format(output)) + net = model.torch_model + net.eval() + input_data = torch.randn(model.input_shape) + torch.onnx.export( + net, + input_data, + output, + export_params=True, + opset_version=12, + input_names=["data"], + output_names=["ret"], + ) + + def convert_to_mge(self, output_dir=default_gen_path): + for model in self.models: + input = "{}/{}.onnx".format(output_dir, model.name) + output = "{}/{}.mge".format(output_dir, model.name) + onnx_to_mge(input, output) + + def make(self, model_dir=""): + if model_dir != "": + self.get_all_onnx_models(model_dir) + self.convert_to_mge(model_dir) + else: + self.get_all_onnx_models() + self.convert_to_mge() + + +def prepare_megcc(): + # build prepare + MEGCC_MGB_TO_TINYNN_PATH = os.environ.get("MEGCC_MGB_TO_TINYNN_PATH") + assert ( + len(MEGCC_MGB_TO_TINYNN_PATH) != 0 + ), "MEGCC_MGB_TO_TINYNN_PATH is not valid, please export MEGCC_MGB_TO_TINYNN_PATH to your path of mgb_to_tinynn" + + +def build_megcc_lib(arch_desc="x86", model_config_json="", kernel_build_dir=""): + MEGCC_MGB_TO_TINYNN_PATH = os.environ.get("MEGCC_MGB_TO_TINYNN_PATH") + # build prepare + change_dir = "" + if model_config_json == "": + arch_ = arch_desc + if arch_desc == "arm64" or arch_desc == "armv7": + arch_ = "arm" + model_config_json = "{}/benchmark/model/model_{}.json".format(megcc_path, arch_) + if kernel_build_dir == "": + # WARNING: the dir path should be the same with path set in model_config_json file + kernel_build_dir = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + change_dir = "cd {}/benchmark/model".format(megcc_path) + if not os.path.exists(kernel_build_dir) or os.path.isfile(kernel_build_dir): + os.makedirs(kernel_build_dir) + # set runtime build options + if arch_desc == "x86": + arch = "--baremetal" + runtime_flag = "" + elif arch_desc == "arm64": + arch = "--arm64" + runtime_flag = "--cross_build --cross_build_target_arch aarch64 --cross_build_target_os ANDROID" + elif arch_desc == "armv7": + arch = "--armv7" + runtime_flag = "--cross_build --cross_build_target_arch armv7-a --cross_build_target_os ANDROID " + elif arch_desc == "riscv": + arch = "--baremetal" + runtime_flag = "--cross_build --cross_build_target_arch rv64gcv0p7 --cross_build_target_os LINUX" + + # 
convert model + if len(change_dir) != 0: + cmd = "{} && {}/mgb-to-tinynn -json={} {} --dump {}".format( + change_dir, + MEGCC_MGB_TO_TINYNN_PATH, + model_config_json, + arch, + kernel_build_dir, + ) + else: + cmd = "{}/mgb-to-tinynn -json={} {} --dump {}".format( + change_dir, + MEGCC_MGB_TO_TINYNN_PATH, + model_config_json, + arch, + kernel_build_dir, + ) + subprocess.check_call(cmd, shell=True) + # build runtime + cmd = "python3 {}/runtime/scripts/runtime_build.py --build_with_profile --kernel_dir {}/ --remove_old_build {}".format( + megcc_path, kernel_build_dir, runtime_flag + ) + subprocess.check_call(cmd, shell=True) diff --git a/benchmark/src/CCbenchmark.cpp b/benchmark/src/CCbenchmark.cpp new file mode 100644 index 00000000..23b8e4c1 --- /dev/null +++ b/benchmark/src/CCbenchmark.cpp @@ -0,0 +1,97 @@ +/** + * \file benchmark/src/CCbenchmark.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "CCbenchmark.h" +#include +#include +#include +#include +#include +#include "lite-c/common_enum_c.h" +#include "lite-c/global_c.h" +#include "lite-c/tensor_c.h" +const int number = 50; +const int warmup = 10; + +#define LITE_CAPI_CHECK(error_, msg_) \ + if (error_) { \ + printf(msg_); \ + LITE_destroy_network(m_model); \ + __builtin_trap(); \ + } + +#define EXAMPLE_ASSERT(exp_, ...) \ + if (!(exp_)) { \ + printf("" __VA_ARGS__); \ + __builtin_trap(); \ + } + +using namespace megcc; +using namespace Benchmark; + +#if TINYNN_CALLBACK_ENABLE +#include +#include "tinynn_callback.h" +static void test_timeimp(int32_t* sec, int32_t* usec) { + struct timeval t; + gettimeofday(&t, NULL); + *sec = t.tv_sec; + *usec = t.tv_usec; +} +static TinyNnCallBack g_cb = { + .tinynn_log_cb = printf, + .tinynn_gettime_cb = test_timeimp, + .tinynn_malloc_cb = malloc, + .tinynn_free_cb = free, + .tinynn_fopen_cb = fopen, + .tinynn_ftell_cb = ftell, + .tinynn_fseek_cb = fseek, + .tinynn_fclose_cb = fclose, + .tinynn_fwrite_cb = fwrite, + .tinynn_fread_cb = fread, +}; +#endif + +/////////////////// CCBenchmarker //////////////// +void CCBenchmarker::load_model() { +#if TINYNN_CALLBACK_ENABLE + register_tinynn_cb(TINYNN_CB_VERSION, g_cb); +#endif + LITE_CAPI_CHECK(LITE_make_network(&m_model, *default_config(), + *default_network_io()), + "create model error. \n"); + + LITE_CAPI_CHECK(LITE_load_model_from_path(m_model, m_model_path.c_str()), + "load model error. 
\n"); +} + +void CCBenchmarker::profile() { + for (int i = 0; i < warmup; i++) { + LITE_CAPI_CHECK(LITE_forward(m_model), "run model failed\n"); + LITE_CAPI_CHECK(LITE_wait(m_model), "wait model failed\n"); + } + + struct timeval start; + struct timeval end; + gettimeofday(&start, NULL); + for (int i = 0; i < number; i++) { + LITE_CAPI_CHECK(LITE_forward(m_model), "run model failed\n"); + LITE_CAPI_CHECK(LITE_wait(m_model), "wait model failed\n"); + } + gettimeofday(&end, NULL); + + unsigned long diff = + 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + float average_time = ((float)diff) / number / 1000; + if (m_log_level == 3) { + printf("the inference average time=%.3f ms\n", average_time); + } +} + +CCBenchmarker::~CCBenchmarker() { + LITE_CAPI_CHECK(LITE_destroy_network(m_model), "delete model failed\n"); +} \ No newline at end of file diff --git a/benchmark/src/CCbenchmark.h b/benchmark/src/CCbenchmark.h new file mode 100644 index 00000000..a0e81792 --- /dev/null +++ b/benchmark/src/CCbenchmark.h @@ -0,0 +1,33 @@ +/** + * \file benchmark/src/CCbenchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include "benchmark.h" +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +namespace megcc { +namespace Benchmark { +class CCBenchmarker final : public Benchmarker { +public: + CCBenchmarker(std::string model, int log_level) + : m_model_path(model), m_log_level(log_level) { + LITE_set_log_level(static_cast(log_level)); + }; + virtual void load_model() override; + virtual void profile() override; + ~CCBenchmarker(); + +private: + int m_log_level; + std::string m_model_path; + LiteNetwork m_model; +}; +} // namespace Benchmark + +} // namespace megcc diff --git a/benchmark/src/MGEbenchmark.cpp b/benchmark/src/MGEbenchmark.cpp new file mode 100644 index 00000000..6ba26b35 --- /dev/null +++ b/benchmark/src/MGEbenchmark.cpp @@ -0,0 +1,100 @@ +/** + * \file benchmark/src/MGEbenchmark.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "MGEbenchmark.h" +#if ENABLE_MEGENGINE_FRAMEWORK +#include +#include +#include +#include +#include +#include "megbrain/gopt/inference.h" +using namespace mgb; +using namespace megcc; +using namespace Benchmark; +const int number = 50; +const int warmup = 10; + +void MGEBenchmarker::load_model() { + std::unique_ptr inp_file = + serialization::InputFile::make_fs(m_model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), "invalid model: unknown model format"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + if (m_log_level == 0) { + m_profiler = std::move(std::make_unique( + m_load_config.comp_graph.get())); + } else { + m_load_config.comp_graph->options().comp_node_seq_record_level = 1; + } + m_load_config.comp_graph->options().var_sanity_check_first_run = false; + m_load_config.comp_graph->options() + .graph_opt.enable_fuse_conv_bias_nonlinearity(); + m_load_config.comp_graph->options().graph_opt.enable_weight_preprocess(); + + m_model = loader->load(m_load_config, false); +} + +void MGEBenchmarker::profile() { + //! 
optimize for inference + auto& output_vars = m_model.output_var_list; + + using Strategy = + mgb::opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + auto strategy = static_cast(0); + strategy = Strategy::PROFILE | Strategy::OPTIMIZED | strategy; + mgb::gopt::modify_opr_algo_strategy_inplace(output_vars, strategy); + mgb::gopt::OptimizeForInferenceOptions opt_for_inference; +#ifdef __ANDROID__ +#if __ARM_FEATURE_DOTPROD + opt_for_inference.enable_nchw44_dot(); +#else + opt_for_inference.enable_nchw44(); +#endif +#else + output_vars = mgb::gopt::layout_transform( + output_vars, mgb::gopt::GraphTuningOptions::Target::CPU); +#endif + size_t nr_output = output_vars.size(); + + output_vars = + mgb::gopt::optimize_for_inference(output_vars, opt_for_inference); + std::vector> input_map_vec; + auto cg = m_model.output_var_list[0].node()->owner_graph(); + for (auto&& i : output_vars) { + mgb::ComputingGraph::Callback cb; + m_output_spec.emplace_back(i, std::move(cb)); + } + m_func = cg->compile(m_output_spec); + struct timeval start; + struct timeval end; + gettimeofday(&start, NULL); + for (int i = 0; i < warmup; ++i) { + m_func->execute().wait(); + } + gettimeofday(&end, NULL); + unsigned long diff = + 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + + gettimeofday(&start, NULL); + for (int i = 0; i < number; ++i) { + m_func->execute().wait(); + } + gettimeofday(&end, NULL); + diff = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + float average_time = ((float)diff) / number / 1000; + if (m_log_level == 0) { + std::string profile_ret; + m_profiler->to_json_full(m_func.get())->writeto(profile_ret, 4); + printf("%s\n", profile_ret.c_str()); + } else { + printf("the inference average time=%.3f ms\n", average_time); + } +} +#endif \ No newline at end of file diff --git a/benchmark/src/MGEbenchmark.h b/benchmark/src/MGEbenchmark.h new file mode 100644 index 00000000..c4caf24d --- /dev/null +++ b/benchmark/src/MGEbenchmark.h @@ -0,0 +1,38 @@ +/** + * \file benchmark/src/MGEbenchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include "build_config.h" +#if ENABLE_MEGENGINE_FRAMEWORK +#include +#include +#include "benchmark.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/serializer.h" +namespace megcc { +namespace Benchmark { +class MGEBenchmarker final : public Benchmarker { +public: + MGEBenchmarker(std::string model, int log_level) + : m_model_path(model), m_log_level(log_level) { + m_load_config.comp_graph = mgb::ComputingGraph::make(); + }; + virtual void load_model() override; + virtual void profile() override; + +private: + int m_log_level; + std::string m_model_path; + mgb::serialization::GraphLoadConfig m_load_config; + mgb::serialization::GraphLoader::LoadResult m_model; + std::unique_ptr m_func; + std::unique_ptr m_profiler; + mgb::cg::ComputingGraph::OutputSpec m_output_spec; +}; +} // namespace Benchmark +} // namespace megcc +#endif \ No newline at end of file diff --git a/benchmark/src/benchmark.h b/benchmark/src/benchmark.h new file mode 100644 index 00000000..1a6172fe --- /dev/null +++ b/benchmark/src/benchmark.h @@ -0,0 +1,25 @@ +/** + * \file benchmark/src/benchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#pragma once +#include +#include +namespace megcc { +namespace Benchmark { +/** + * Benchmarker interface + * + */ +class Benchmarker { +public: + virtual void load_model() = 0; + virtual void profile() = 0; + virtual ~Benchmarker() = default; +}; +} // namespace Benchmark + +} // namespace megcc diff --git a/benchmark/src/build_config.h.in b/benchmark/src/build_config.h.in new file mode 100644 index 00000000..da9180e9 --- /dev/null +++ b/benchmark/src/build_config.h.in @@ -0,0 +1,10 @@ +#ifndef _HEADER_BUILD_CONFIG +#define _HEADER_BUILD_CONFIG + +#cmakedefine01 ENABLE_MEGENGINE_FRAMEWORK + +#ifndef ENABLE_MEGENGINE_FRAMEWORK +#define ENABLE_MEGENGINE_FRAMEWORK 0 +#endif + +#endif // _HEADER_BUILD_CONFIG diff --git a/benchmark/tools/cc_analysis.py b/benchmark/tools/cc_analysis.py new file mode 100644 index 00000000..b07248a8 --- /dev/null +++ b/benchmark/tools/cc_analysis.py @@ -0,0 +1,89 @@ +#! /usr/bin/env python3 +import argparse +import os +import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def main(passed_args=None): + parser = argparse.ArgumentParser( + description="analyze profile result", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("data") + parser.add_argument("--output", "-o", default=".", type=str) + args = parser.parse_args(passed_args) + if not os.path.exists(args.output) or os.path.isfile(args.output): + os.makedirs(args.output) + files0 = set() + if os.path.isdir(args.data): + for i in os.listdir(args.data): + files0.add(str(Path(args.data) / i)) + else: + files0.add(args.data) + data_map = {} + data_info = [] + model_set = set() + for i in files0: + path = i.split("/") + file_name = path[len(path) - 1].split(".") + info = file_name[0].split("-") + if info[0] == "megcc" and info[3] == "0": + text_file = open(i, "r") + data = text_file.read() + text_file.close() + pattern = re.compile(r"\s\w+\s[\r\n]+use\s\d*\.\d+") + results = pattern.findall(data) + analyze_data = [] + op_totoal_nums = len(results) + op_per_test = int(op_totoal_nums / 60) + iter_num = 0 + total = 0.0 + for i in results: + kernel_name_pattern = re.compile(r"\s\w+\s") + kernel_time_pattern = re.compile(r"\d*\.\d+") + kernel_name = kernel_name_pattern.search(i).group() + kernel_time = float(kernel_time_pattern.search(i).group()) + if iter_num < op_per_test: + total = total + kernel_time + analyze_data.append([kernel_name, kernel_time]) + else: + total = total + kernel_time + analyze_data[iter_num % op_per_test][1] += kernel_time + + iter_num = iter_num + 1 + diff_kernel_data = {} + for i in analyze_data: + if not i[0] in diff_kernel_data: + diff_kernel_data[i[0]] = [i[1], i[1] / total] + else: + diff_kernel_data[i[0]][0] += i[1] + diff_kernel_data[i[0]][1] += i[1] / total + kernel_name = [] + kernel_rate = [] + for k, v in sorted( + diff_kernel_data.items(), key=lambda item: item[1][1], reverse=True + ): + kernel_name.append(k) + kernel_rate.append(v[1] * 100) + + barWidth = 0.5 + topK = 10 + kernel_name = kernel_name[0:topK] + kernel_rate = kernel_rate[0:topK] + br1 = np.arange(len(kernel_name)) + plt.figure(figsize=(25, 6)) + plt.title("{}-{}-{}".format(info[1], info[5], info[2]), fontsize=30) + plt.pie(kernel_rate, labels=kernel_name, autopct="%0.1f%%") + plt.savefig( + "{}/{}-{}-{}-profile-top{}.png".format( + args.output, info[1], info[5], info[2], topK + ) + ) + + +if __name__ == "__main__": + main() diff --git a/benchmark/tools/inference_visual.py b/benchmark/tools/inference_visual.py new file mode 100644 index 
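cc_analysis.py above keys off the per-instruction timing lines that LITE_forward prints when profiling is enabled (" instruction: %s \nuse %fms ..." in runtime/src/lite/network.c). A tiny self-contained illustration of that parsing contract, using the same regular expressions on a made-up log excerpt; the kernel name and numbers are invented.

```python
import re

# Invented two-line excerpt shaped like the profile output LITE_forward logs.
sample = (" instruction: conv2d_nchw44_kernel \n"
          "use 1.234ms \t[1(150528), 3(50176), 224(224), 224(1), 1(1)]\n")

for hit in re.findall(r"\s\w+\s[\r\n]+use\s\d*\.\d+", sample):
    name = re.search(r"\s\w+\s", hit).group().strip()
    ms = float(re.search(r"\d*\.\d+", hit).group())
    print(name, ms)  # -> conv2d_nchw44_kernel 1.234
```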
00000000..ca023c66 --- /dev/null +++ b/benchmark/tools/inference_visual.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python3 +import argparse +import os +import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def main(passed_args=None): + parser = argparse.ArgumentParser( + description="visualize inference result", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("data") + parser.add_argument("--output", "-o", default=".", type=str) + args = parser.parse_args(passed_args) + files0 = set() + if not os.path.exists(args.output) or os.path.isfile(args.output): + os.makedirs(args.output) + if os.path.isdir(args.data): + for i in os.listdir(args.data): + files0.add(str(Path(args.data) / i)) + else: + files0.add(args.data) + data_map = {} + data_info = [] + model_set = set() + for i in files0: + path = i.split("/") + file_name = path[len(path) - 1].split(".") + info = file_name[0].split("-") + + if "{}-{}".format(info[1], info[5]) not in data_map: + data_map["{}-{}".format(info[1], info[5])] = {} + if info[0] not in data_map["{}-{}".format(info[1], info[5])]: + data_map["{}-{}".format(info[1], info[5])][info[0]] = [] + if info[3] == "3": + text_file = open(i, "r") + data = text_file.read() + text_file.close() + pattern = re.compile(r"\d*\.\d+") + result = float(pattern.search(data).group()) + # for excel + data_info.append([info[1], info[5], info[0], info[2], result]) + data_map["{}-{}".format(info[1], info[5])][info[0]].append( + [info[2], result] + ) + model_set.add(info[2]) + + model_list = [] + for model in model_set: + model_list.append(model) + model_list = sorted(model_list) + + for k, v in data_map.items(): + for k0, v0 in v.items(): + v1 = sorted(v0, key=lambda item: item[0]) + v1_val = [] + for i in v1: + v1_val.append(i[1]) + data_map[k][k0] = v1_val + for i in data_info: + print(i[0], i[1], i[2], i[3], i[4]) + print(model_list) + print(data_map) + # generate figure + barWidth = 0.5 + br1 = np.arange(len(model_list)) + br2 = [x + barWidth for x in br1] + for k, v in data_map.items(): + plt.figure(figsize=(10, 6)) + plt.title(k) + # Make the plot + plt.bar(br1, v["megcc"], width=barWidth, edgecolor="grey", label="megcc") + + # Adding Xticks + plt.xlabel("model", fontweight="bold", fontsize=15) + plt.ylabel("inference(ms)", fontweight="bold", fontsize=15) + plt.xticks([r + barWidth for r in range(len(model_list))], model_list) + plt.grid(axis="y") + for a, b in zip(br1, v["megcc"]): + plt.text(a, b + 0.05, "%.2f" % b, ha="center", va="bottom") + + plt.legend() + plt.savefig("{}/{}.png".format(args.output, k)) + + +if __name__ == "__main__": + main() From 16fa9ee784351bcdce9f92142ccd7199c80f9bd7 Mon Sep 17 00:00:00 2001 From: yeasoon <1695924908@qq.com> Date: Tue, 10 Jan 2023 16:25:48 +0800 Subject: [PATCH 17/17] fix(third_party): update megengine to e77cea141387fc8095b8c842547fcd6510f5c41f --- compiler/include/compiler/Common/Version.h.in | 2 +- script/release_megcc.sh | 6 ------ third_party/MegEngine | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/compiler/include/compiler/Common/Version.h.in b/compiler/include/compiler/Common/Version.h.in index ce1f2c8c..7b2a0301 100644 --- a/compiler/include/compiler/Common/Version.h.in +++ b/compiler/include/compiler/Common/Version.h.in @@ -11,7 +11,7 @@ #include #define MEGCC_MAJOR 0 #define MEGCC_MINOR 1 -#define MEGCC_PATCH 2 +#define MEGCC_PATCH 3 namespace megcc { namespace { const std::string git_branch = "@GIT_BRANCH@"; diff --git a/script/release_megcc.sh 
b/script/release_megcc.sh
index eef6c540..6e5ff688 100755
--- a/script/release_megcc.sh
+++ b/script/release_megcc.sh
@@ -32,17 +32,11 @@ pushd ${OUT_DIR}/build_host
cmake ${COMPILER_PATH} -G Ninja
ninja
cp tools/mgb-to-tinynn/mgb-to-tinynn ${OUT_DIR}/bin/
- strip mgb-to-tinynn
cp tools/mgb-runner/mgb-runner ${OUT_DIR}/bin/
- strip mgb-runner
cp tools/mgb-importer/mgb-importer ${OUT_DIR}/bin/
- strip mgb-importer
cp tools/kernel_exporter/kernel_exporter ${OUT_DIR}/bin/
- strip kernel_exporter
cp tools/hako-to-mgb/hako-to-mgb ${OUT_DIR}/bin/
- strip hako-to-mgb
cp tools/megcc-opt/megcc-opt ${OUT_DIR}/bin/
- strip megcc-opt
popd
pushd ${PROJECT_PATH}/compiler
GIT_ID=`git rev-parse --short HEAD`
diff --git a/third_party/MegEngine b/third_party/MegEngine
index 31218a18..e77cea14 160000
--- a/third_party/MegEngine
+++ b/third_party/MegEngine
@@ -1 +1 @@
-Subproject commit 31218a1863edf07be0feed947fad0dc38740fee3
+Subproject commit e77cea141387fc8095b8c842547fcd6510f5c41f
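
The Benchmarker interface and the MGEBenchmarker added in the benchmark patch above are meant to be driven from a small standalone binary: construct a benchmarker for a model file, call load_model(), then profile(). The driver below is a minimal sketch only and is not part of this patch series; it assumes the benchmark/src headers above, a build with ENABLE_MEGENGINE_FRAMEWORK=1, and a placeholder default model path.

// Hypothetical driver sketch; not part of this patch series.
// Assumes benchmark/src/benchmark.h, benchmark/src/MGEbenchmark.h and a
// build_config.h generated with ENABLE_MEGENGINE_FRAMEWORK=1.
#include <memory>
#include <string>
#include "build_config.h"
#include "benchmark.h"
#if ENABLE_MEGENGINE_FRAMEWORK
#include "MGEbenchmark.h"
#endif

int main(int argc, char** argv) {
    std::string model_path = argc > 1 ? argv[1] : "model.mge";  // placeholder default
    // log_level 0 dumps the full JSON profile; any other value prints only the
    // averaged inference time (see MGEBenchmarker::profile above).
    int log_level = 3;
#if ENABLE_MEGENGINE_FRAMEWORK
    std::unique_ptr<megcc::Benchmark::Benchmarker> bench =
            std::make_unique<megcc::Benchmark::MGEBenchmarker>(model_path, log_level);
    bench->load_model();
    bench->profile();
#endif
    return 0;
}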