From 29fa4cf1c5a86515941bb5f69465d5657511ff18 Mon Sep 17 00:00:00 2001 From: limingxin Date: Wed, 23 Nov 2022 18:01:20 +0800 Subject: [PATCH 01/17] feat(compiler & runtime): support extern opr loader --- ci/test_tools.sh | 4 +- .../Dialect/Kernel/IR/KernelDialect.td | 13 + .../compiler/Target/MGB/dummy_loader.h | 134 +++++++++ compiler/include/megbrain/IR/ops.td | 13 + .../Conversion/MGBToKernel/MGBToKernel.cpp | 25 +- .../MGBToKernel/MGBToKernelHelper.h | 11 + .../lib/Dialect/Kernel/IR/KernelDialect.cpp | 5 + compiler/lib/Target/MGB/importer.cpp | 195 +++++++++++- compiler/lib/Target/TinyNN/exporter.cpp | 35 +++ immigration/include/extern_c_opr.h | 222 ++++++++++++++ runtime/CMakeLists.txt | 2 +- runtime/example/standard_OS/lite_main.c | 114 ++++++- runtime/schema/model.fbs | 15 +- runtime/src/cheader/model_reader.h | 32 +- runtime/src/vm.c | 1 + runtime/src/vm/extern_opr.c | 283 ++++++++++++++++++ runtime/src/vm/instruction.h | 15 +- runtime/test/CMakeLists.txt | 3 +- runtime/version.ld | 1 + script/build_and_test_not_standard_os.sh | 2 +- 20 files changed, 1099 insertions(+), 26 deletions(-) create mode 100644 compiler/include/compiler/Target/MGB/dummy_loader.h create mode 100644 immigration/include/extern_c_opr.h create mode 100644 runtime/src/vm/extern_opr.c diff --git a/ci/test_tools.sh b/ci/test_tools.sh index e3f5dfc1..3405de38 100755 --- a/ci/test_tools.sh +++ b/ci/test_tools.sh @@ -40,7 +40,7 @@ function compare_output_with_mgb(){ mkdir -p "${TINYNN_OUTPUT_DIR}" TINYMODEL_PATH=`find ${OUTPUT_DIR} -name "*.tiny"` TINYNN_SHAPE_STR=`echo $INPUT_DATA_SHAPE_STR | sed 's/[()]//g'` - $RUNTIME_BUILD_DIR/tinynn_test_lite ${TINYMODEL_PATH} "$TINYNN_OUTPUT_DIR" 0 $INPUT_DATA_STR ${TINYNN_SHAPE_STR} + $RUNTIME_BUILD_DIR/tinynn_test_lite -m ${TINYMODEL_PATH} -o "$TINYNN_OUTPUT_DIR" -l 0 -d $INPUT_DATA_STR -s ${TINYNN_SHAPE_STR} MGB_OUTPUT_DIR="$OUTPUT_DIR/mgb_out/" mkdir -p "${MGB_OUTPUT_DIR}" if [[ "$MODEL_PATH" == *".emod" ]];then @@ -69,7 +69,7 @@ function check_mem_leak_with_asan(){ cmake --build "$RUNTIME_BUILD_DIR_ASAN" --target tinynn_test_lite TINYNN_OUTPUT_ASAN_DIR="$OUTPUT_DIR/tinynn_out_asan" mkdir -p ${TINYNN_OUTPUT_ASAN_DIR} - $RUNTIME_BUILD_DIR_ASAN/tinynn_test_lite ${TINYMODEL_PATH} "$TINYNN_OUTPUT_ASAN_DIR" 0 $INPUT_DATA_STR $TINYNN_SHAPE_STR + $RUNTIME_BUILD_DIR_ASAN/tinynn_test_lite -m ${TINYMODEL_PATH} -o "$TINYNN_OUTPUT_ASAN_DIR" -l 0 -d $INPUT_DATA_STR -s $TINYNN_SHAPE_STR python3 $PROJECT_PATH/ci/compare_output_bin.py $TINYNN_OUTPUT_ASAN_DIR $MGB_OUTPUT_DIR --eps="$EPS" } diff --git a/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td b/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td index 815a5330..a93dbaba 100644 --- a/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td +++ b/compiler/include/compiler/Dialect/Kernel/IR/KernelDialect.td @@ -90,6 +90,19 @@ def KernelCall: KernelBase<"KernelCall", [ ); } +def ExternOpr: KernelBase<"ExternOpr", [ + DeclareOpInterfaceMethods, + AttrSizedOperandSegments + ]> { + let arguments = (ins + Arg, "", [MemRead]>:$operands, + Arg, "", [MemWrite]>:$results, + StrAttr:$name, + StrAttr:$data, + UI32Attr:$data_len + ); +} + class InstructBase traits=[]>: KernelBase +#include +#include +#include +#include +#include +#include "megbrain/serialization/extern_c_opr.h" + +namespace { +std::map>, std::vector>> + name2outputinfo; +class MGBOprDescImpl { + static std::string loader_name; + + static inline const std::pair>, + std::vector>& + get_output_info(const std::string& loader_name) { + auto&& iter = 
name2outputinfo.find(loader_name); + if (iter != name2outputinfo.end()) + return iter->second; + else if (name2outputinfo.size() == 1) + return name2outputinfo.begin()->second; + else { + CC_ASSERT(0) + << "Please check loader name in command line args whether " + "consistent with loader name in dumped model.\n"; + return {}; + } + } + + static void release(MGBOprDesc* self) { + free(self->user_data); + delete self; + } + + static size_t hash(const MGBOprDesc* self) { return 1; } + + static int is_same(const MGBOprDesc* self, const MGBOprDesc* rhs) { + CC_ABORT << "The function 'is_same' is part of the dummy loader, just " + "for " + "compile but should NOT be called.\n"; + return 1; + } + + static void execute(const MGBOprDesc* self, const MGBTensor* input, + const MGBTensor* output) { + CC_ABORT << "The function 'execute' is part of the dummy loader, just " + "for " + "compile but should NOT be called.\n"; + } + + static void infer_shape(const MGBOprDesc* self, const MGBTensorShape* input, + MGBTensorShape* output) { + auto&& output_shapes = + get_output_info(reinterpret_cast(self->user_data)).first; + for (size_t i = 0; i < self->nr_output; ++i) { + output[i].ndim = output_shapes[i].size(); + for (size_t j = 0; j < output[i].ndim; ++j) + output[i].shape[j] = output_shapes[i][j]; + } + } + + static void infer_dtype(const struct MGBOprDesc* self, + const MGBDType* input, MGBDType* output) { + auto&& output_dtypes = + get_output_info(reinterpret_cast(self->user_data)) + .second; + for (size_t i = 0; i < self->nr_output; ++i) + output[i] = static_cast(output_dtypes[i]); + } + +public: + static MGBOprDesc* make(const std::string& loader_name) { + auto desc = std::make_unique(); + + uint32_t nr_output = get_output_info(loader_name).first.size(); + mgb_init_opr_desc(desc.get(), nr_output, "dummy"); +#define cb(func) desc->func = func; + MGB_OPR_DESC_FOREACH_MEM_FN(cb) +#undef cb + desc->infer_dtype = infer_dtype; + // copy loader name into desc->user_data + desc->user_data = malloc(loader_name.size() + 1); + memcpy(desc->user_data, loader_name.c_str(), loader_name.size()); + reinterpret_cast(desc->user_data)[loader_name.size()] = '\0'; + + return desc.release(); + } +}; + +class MGBOprLoaderImpl { + static std::map user_datas; + + static MGBOprDesc* create_desc(size_t nr_input, const void* buf, + size_t buf_len) { + std::string name((char*)buf + sizeof(size_t), *(size_t*)buf); + size_t data_len = buf_len - sizeof(size_t) - *(size_t*)buf; + void* user_data = malloc(sizeof(size_t) + data_len); + *(size_t*)(user_data) = data_len; + memmove(user_data + sizeof(size_t), + buf + sizeof(size_t) + *(size_t*)buf, data_len); + + user_datas[name] = user_data; + + return MGBOprDescImpl::make(name); + } + +public: + static std::map& get_user_datas() { return user_datas; } + static MGBOprLoader make() { return {"extern_opr_dummy", &create_desc}; } +}; +std::map MGBOprLoaderImpl::user_datas = {}; + +void mgb_c_opr_init_output_info( + const MGBExternCOprApi* (*get_api)(int), + const std::map>, + std::vector>>& output_info) { + name2outputinfo = std::move(output_info); + const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION); + assert(api); + MGBOprLoader loader = MGBOprLoaderImpl::make(); + api->register_loader(&loader); +} +} // namespace \ No newline at end of file diff --git a/compiler/include/megbrain/IR/ops.td b/compiler/include/megbrain/IR/ops.td index bce360a4..79c22e86 100644 --- a/compiler/include/megbrain/IR/ops.td +++ b/compiler/include/megbrain/IR/ops.td @@ -429,5 +429,18 @@ def 
FusedElemwise: MgbHashableOp<"FusedElemwise"> { MgbArrayAttr:$modes ); } + +def ExternOpr: MgbHashableOp<"ExternOpr"> { + let inputs = (ins Variadic:$input); + let extraArguments = (ins + MgbStringAttr:$name, + MgbStringAttr:$data, + MgbUI32Attr:$data_len, + MgbUI32Attr:$nr_input, + MgbUI32Attr:$nr_output + ); + + let results = (outs Variadic:$results); +} #endif // MGB_OPS diff --git a/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp b/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp index f435a1fe..a3aef624 100644 --- a/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp +++ b/compiler/lib/Conversion/MGBToKernel/MGBToKernel.cpp @@ -233,7 +233,7 @@ class ConvertElemwise final : public OpConversionPattern { case Mode::EQ: return createOp(op, operands, rewriter); case Mode::SILU: - return createOp(op, operands, rewriter); + return createOp(op, operands, rewriter); default: CC_ABORT << "Unsupport Elemwise mode :" << static_cast(op.mode()) << "\n"; @@ -550,6 +550,27 @@ class GenericConverter : public OpConversionPattern { } }; +class ExternOprConverter : public OpConversionPattern { +public: + using OpAdaptor = typename MGB::ExternOpr::Adaptor; + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite( + MGB::ExternOpr op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const override { + LOG_DEBUG << "Convert ExternOpr MGB dialect to Abstract kernel of " + "opr name: " + << op.getOperationName().str() << "\n"; + auto operands = adaptor.getOperands(); + CC_ASSERT(!isDynamicShape(operands)) + << "ExternOpr operands shape should not be dynamic.\n"; + auto attrs = ConvertAttr(op->getAttrDictionary(), + op->getContext()); + setOperandSegmentAttr(op->getContext(), attrs, + {op.nr_input(), op.nr_output()}); + return createOp(op, operands, rewriter, attrs); + } +}; + } // namespace void populateMGBToKernelConversionPatterns(TypeConverter& typeConverter, @@ -558,7 +579,7 @@ void populateMGBToKernelConversionPatterns(TypeConverter& typeConverter, ConvertParamStorage, ConvertParamProvider, ConvertElemwise, ConvertFusedElemwise, ConvertConvLike, ConvertReduce, ConvertReshape, ConvertSubtensor, ConvertSetSubtensor, - ConvertConcat, + ConvertConcat, ExternOprConverter, GenericConverter, GenericConverter ConvertAttr( return attrs; } +template <> +SmallVector ConvertAttr( + DictionaryAttr direct_attr, MLIRContext* context) { + SmallVector attrs; + GetParam("name"); + GetParam("data"); + GetParam("data_len"); + + return attrs; +} + template <> SmallVector ConvertAttr( DictionaryAttr direct_attr, MLIRContext* context) { diff --git a/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp b/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp index 691f1547..adfe3e9f 100644 --- a/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp +++ b/compiler/lib/Dialect/Kernel/IR/KernelDialect.cpp @@ -60,6 +60,11 @@ LogicalResult KernelCall::verifySymbolUses(SymbolTableCollection& symbolTable) { return success(); } +LogicalResult ExternOpr::verifySymbolUses(SymbolTableCollection& symbolTable) { + // TODO + return success(); +} + MemRefType Reshape::memoryForward(MemRefType inpType) { auto oupType = getResult().getType().dyn_cast(); if (!oupType) { diff --git a/compiler/lib/Target/MGB/importer.cpp b/compiler/lib/Target/MGB/importer.cpp index f0479a22..e8867f0b 100644 --- a/compiler/lib/Target/MGB/importer.cpp +++ b/compiler/lib/Target/MGB/importer.cpp @@ -19,6 +19,7 @@ #include "compiler/Common/MemoryStatus.h" #include "compiler/Dialect/MGB/IR/MGBDialect.h" #include 
"compiler/Target/Hako/hako_parse.h" +#include "compiler/Target/MGB/dummy_loader.h" #include "compiler/Target/MGB/helper.h" #include "compiler/Target/MGB/import.h" @@ -35,11 +36,31 @@ #include "megbrain/opr/misc.h" #include "megbrain/opr/nn_int.h" #include "megbrain/opr/tensor_manip.h" +#include "megbrain/serialization/extern_c_opr.h" +#include "megbrain/serialization/extern_c_opr_io.h" #include "megbrain/serialization/serializer.h" llvm::cl::opt hako_version( "hako", llvm::cl::desc("specific version used for encrypt"), llvm::cl::init(2)); +llvm::cl::opt ExternOprOutputShape( + "extern-opr-output-shapes", llvm::cl::Optional, + llvm::cl::desc("specific extern opr output shapes"), + llvm::cl::value_desc( + "loader_name_1=output_shape_1;output_shape_2;...:" + "loader_name_2=output_shape_1;output_shape_2;... " + "If only one loader, \"loader_name=\" can be omitted." + "e.g., " + "\"loader_1=(1,3,5,5);(1,1);(3,3):loader_2=(2,2);(1,1,3,3)\"")); +llvm::cl::opt ExternOprOutputDType( + "extern-opr-output-dtypes", llvm::cl::Optional, + llvm::cl::desc("specific extern opr output dtypes"), + llvm::cl::value_desc( + "Similar to --extern-opr-output-shapes but without " + "\"loader_name\"." + "The available values are float32, int32, uint8, float16, " + "int16. e.g., \"float32;int32;uint8:float16;int16\". Default " + "value is float32.")); using namespace mgb; using namespace llvm; @@ -139,6 +160,142 @@ std::vector read_file(std::string path) { return res; } +inline std::vector split(std::string str, + const std::string& delimiter) { + std::vector res; + size_t pos = 0; + while ((pos = str.find(delimiter)) != std::string::npos) { + res.emplace_back(std::move(str.substr(0, pos))); + str.erase(0, pos + delimiter.size()); + } + res.emplace_back(std::move(str)); + return res; +} + +inline void parse_extern_output_info() { + std::map>, + std::vector>> + name2outputinfo; + + std::string extern_opr_output_shapes = ExternOprOutputShape; + if (extern_opr_output_shapes.size()) { + auto&& output_shapes_loaders = split(extern_opr_output_shapes, ":"); + size_t nr_loader = output_shapes_loaders.size(); + + std::string extern_opr_output_dtypes = ExternOprOutputDType; + bool specify_dtype = (extern_opr_output_dtypes.size() != 0); + auto&& output_dtypes_loaders = split(extern_opr_output_dtypes, ":"); + if (specify_dtype) + CC_ASSERT(nr_loader == output_dtypes_loaders.size()); + + auto skip_whitespace = [](const std::string& str) { + int left = 0, right = str.size() - 1; + while (str[left] == ' ' || str[left] == '\t') + ++left; + while (str[right] == ' ' || str[right] == '\t') + --right; + return str.substr(left, right - left + 1); + }; + + auto parse_output_info = [=, &name2outputinfo]( + const std::string& output_shapes_str, + const std::string& output_dtypes_str, + const std::string& loader_name) { + auto&& output_shapes = split(output_shapes_str, ";"); + + std::vector uint_output_dtypes(output_shapes.size(), 0); + if (specify_dtype) { + auto&& output_dtypes = split(output_dtypes_str, ";"); + CC_ASSERT((output_shapes.size() == output_dtypes.size())) + << "Number of extern opr output shapes(" + << output_shapes.size() + << ") should equal to " + "number " + "of extern opr output dtypes(" + << output_dtypes.size() << ").\n"; + std::unordered_map dtype_str2uint{ + {"float32", 0}, + {"int32", 1}, + {"uint8", 2}, + {"float16", 3}, + {"int16", 4}}; + for (size_t i = 0; i < output_dtypes.size(); ++i) { + auto&& tmp_str = skip_whitespace(output_dtypes[i]); + if (dtype_str2uint.find(tmp_str) != dtype_str2uint.end()) + 
uint_output_dtypes[i] = dtype_str2uint.at(tmp_str); + else + CC_ASSERT(0) + << tmp_str + << " is invalid extern opr output dtype! Dtype " + "should be float32, int32, uint8, float16 " + "or " + "int16.\n"; + } + } + + std::vector> uint_output_shapes( + output_shapes.size()); + for (size_t i = 0; i < output_shapes.size(); ++i) { + auto&& tmp_str = skip_whitespace(output_shapes[i]); + CC_ASSERT((tmp_str[0] == '(' && + tmp_str[tmp_str.size() - 1] == ')')) + << "The output shape needs to be surrounded by " + "parentheses.\n"; + tmp_str = tmp_str.substr(1, tmp_str.size() - 2); + auto&& tmp_shape = split(tmp_str, ","); + CC_ASSERT((tmp_shape.size() <= MGB_TENSOR_MAX_NDIM)) + << "Maximum dimension of single output shape of extern " + "opr " + "is " + << MGB_TENSOR_MAX_NDIM << ".\n"; + uint_output_shapes[i].resize(tmp_shape.size()); + std::transform(tmp_shape.begin(), tmp_shape.end(), + uint_output_shapes[i].begin(), + [](const std::string& s) { + return static_cast(std::stoul(s)); + }); + } + + name2outputinfo[loader_name] = + std::make_pair(std::move(uint_output_shapes), + std::move(uint_output_dtypes)); + }; + + if (nr_loader == 1) { + auto&& name_and_shapes = split(output_shapes_loaders[0], "="); + bool specify_name = (name_and_shapes.size() == 2); + std::string&& loader_name = + (specify_name ? skip_whitespace(name_and_shapes[0]) : "_"); + const std::string& shapes = + specify_name ? name_and_shapes[1] : name_and_shapes[0]; + if (specify_dtype) { + parse_output_info(shapes, output_dtypes_loaders[0], + loader_name); + } else { + parse_output_info(shapes, "", loader_name); + } + } else if (nr_loader > 1) { + for (size_t i = 0; i < nr_loader; ++i) { + auto&& name_and_shapes = split(output_shapes_loaders[i], "="); + CC_ASSERT((name_and_shapes.size() == 2)) + << "When there are more than one loader, loader name " + "must be specified.\n"; + std::string&& loader_name = skip_whitespace(name_and_shapes[0]); + const std::string& shapes = name_and_shapes[1]; + if (specify_dtype) { + parse_output_info(shapes, output_dtypes_loaders[i], + loader_name); + } else { + parse_output_info(shapes, "", loader_name); + } + } + } + + mgb_c_opr_init_output_info(mgb_get_extern_c_opr_api_versioned, + name2outputinfo); + } +} + class Importer { using LoadResult = serialization::GraphLoader::LoadResult; using Options = MGBImporterOptions; @@ -152,8 +309,8 @@ class Importer { m_context->loadDialect(); } - mlir::LogicalResult import_mgb(std::string model_path, Options options - , int hako_ver = 0) { + mlir::LogicalResult import_mgb(std::string model_path, Options options, + int hako_ver = 0) { std::vector mdl_model_buffer; std::unique_ptr inp_file; hako_ver = hako_ver == 0 ? 
hako_version.getValue() : hako_ver; @@ -174,6 +331,9 @@ class Importer { CC_ASSERT(format.valid()) << "invalid model: unknown model format.\n"; m_loader = serialization::GraphLoader::make(std::move(inp_file), format.val()); + + parse_extern_output_info(); + LOG_DEBUG << "Process mgb graph\n"; process_graph(options); return mlir::verify(m_module); @@ -814,6 +974,35 @@ class Importer { std::vector{elemwise_exp, reduce_sum}, opr::Elemwise::Mode::TRUE_DIV); m_var2value.emplace(out, out_value); + } else if (auto extern_opr = + opr->try_cast_final()) { + auto user_datas = MGBOprLoaderImpl::get_user_datas(); + + void* _data = nullptr; + if (user_datas.find(opr->name()) != user_datas.end()) { + _data = user_datas[opr->name()]; + } + CC_ASSERT(_data) << "No data related to " << opr->name() << ".\n"; + std::string data( + reinterpret_cast(_data + sizeof(size_t)), + *(size_t*)(_data)); + free(_data); + + std::vector v_resultTypes(opr->output().size()); + for (int i = 0; i < opr->output().size(); ++i) { + v_resultTypes[i] = var_to_shaped_type(opr->output(i)); + } + + uint32_t nr_input = static_cast(opr->input().size()); + uint32_t nr_output = static_cast(opr->output().size()); + + auto values = m_builder.create( + m_builder.getUnknownLoc(), v_resultTypes, + var_array_to_value_array(opr->input()), opr->name(), data, + static_cast(data.size()), nr_input, nr_output); + for (int i = 0; i < opr->output().size(); ++i) { + m_var2value.emplace(opr->output(i), values.getResult(i)); + } } else { CC_ABORT << "unsupported mgb operator type " << opr->dyn_typeinfo()->name << "\n"; @@ -1200,7 +1389,7 @@ mlir::LogicalResult import_mgb(mlir::ModuleOp module, std::string model_path, LOG_DEBUG << "\n\t\t\t Begin Import MBG \t\t\t\n"; LOG_DEBUG << "load model from " << model_path << " with Options:\n\tuse_static_memory_plan=" - << options.use_naive_memory_plan + << options.use_static_memory_plan << "\n\toptimize_for_inference=" << options.optimize_for_inference << "\n\tuse_naive_memory_plan=" << options.use_naive_memory_plan << "\n\tgraph_opt_level=" diff --git a/compiler/lib/Target/TinyNN/exporter.cpp b/compiler/lib/Target/TinyNN/exporter.cpp index 274c5774..41dca537 100644 --- a/compiler/lib/Target/TinyNN/exporter.cpp +++ b/compiler/lib/Target/TinyNN/exporter.cpp @@ -294,6 +294,41 @@ class Exporter { MegCC::TensorType_WEIGHT, symbol2weight_id[op.name().str()])); }) + .Case([&](Kernel::ExternOpr op) { + kernel_exporter.addInst("EXTERN_OPR"); + + std::vector input_tensors, output_tensors; + for (auto&& i : op.operands()) { + auto&& tensor = value2typed_tensor.at( + i.getAsOpaquePointer()); + input_tensors.push_back(tensor.second); + } + + for (auto&& i : op.results()) { + auto&& tensor = value2typed_tensor.at( + i.getAsOpaquePointer()); + output_tensors.push_back(tensor.second); + } + + std::string name(op.name().data(), op.name().size()); + std::string data(op.data().data(), op.data().size()); + uint32_t data_len = data.size(); + + LOG_DEBUG << "Add ExternOpr instruction.\n"; + instructions_type.push_back( + MegCC::Instruction_ExternOpr); + instructions.push_back( + MegCC::CreateExternOpr( + m_fbs_builder, + m_fbs_builder.CreateVector( + input_tensors), + m_fbs_builder.CreateString(name), + m_fbs_builder.CreateString(data), + data_len, + m_fbs_builder.CreateVector( + output_tensors)) + .Union()); + }) .Case([&](Kernel::MemPlan op) { createTensor(op->getResult(0)); }) diff --git a/immigration/include/extern_c_opr.h b/immigration/include/extern_c_opr.h new file mode 100644 index 00000000..7d46af5f --- /dev/null +++ 
b/immigration/include/extern_c_opr.h @@ -0,0 +1,222 @@ +#ifndef MEGBRAIN_EXTERN_C_OPR_H +#define MEGBRAIN_EXTERN_C_OPR_H + +#include +#include +#include + +#ifdef MGE_DLL_EXPORT +#define MGB_PUBLIC __declspec(dllexport) +#else +#define MGB_PUBLIC __attribute__((visibility("default"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MGB_C_OPR_INIT_FUNC +#define MGB_C_OPR_INIT_FUNC mgb_c_opr_init +#endif + +#define INIT_FUNCS(s) #s +#define INIT_FUNC(s) INIT_FUNCS(s) +#define MGB_C_OPR_INIT_FUNC_STR INIT_FUNC(MGB_C_OPR_INIT_FUNC) + +#define MGB_EXTERN_C_OPR_VERSION 0x24 +#define MGB_TENSOR_MAX_NDIM 8 + +//! data types +typedef enum MGBDType { + MGB_DTYPE_FLOAT32, + MGB_DTYPE_INT32, + MGB_DTYPE_UINT8, + //! IEEE 754-based half-precision floating + MGB_DTYPE_FLOAT16, + MGB_DTYPE_INT16, +} MGBDType; + +typedef struct MGBTensorShape { + uint32_t ndim, shape[MGB_TENSOR_MAX_NDIM]; +} MGBTensorShape; + +typedef struct MGBTensorLayout { + uint32_t dtype; + MGBTensorShape shape; +} MGBTensorLayout; + +//! tensor representation +typedef struct MGBTensor { + MGBTensorLayout layout; + void* data; //!< the tensor value, accessible by caller CPU thread +} MGBTensor; + +//! extern device tenosr struct +typedef struct ExternDeviceTensor { + //! layout of device extern tensor, use to validity check with MGBTensor + MGBTensorLayout layout; + //! different NPU API has different type define so just define a void * to + //! compat all, need loader and SDK implement reinterpret_cast it + //! exampe for NNIE, device_ptr may define as + //! struct MemoryInfo { + //! HI_U64 phy_addr; + //! void* vir_addr; + //! size_t size = 0; + //! } + void* device_ptr; +} ExternDeviceTensor; + +//! for dynamic extern c opr param +typedef struct ExternCOprParam { + //! dump name of extern c opr in graph + //! example graph: + //! ExternCOpr1(3516:preprocess)->opr->ExternCOpr2(3559)->opr->ExternCOpr3(3516:det_face)... + //! extern_c_opr_dump_name config case: + //! when set 3516:preprocess, ExternCOpr1 will be config. + //! when set 3559, ExternCOpr2 will be config. + //! when set 3516:det_face, ExternCOpr3 will be config. + //! when set nullptr, will auto config the first ExternCOpr. + const char* extern_c_opr_dump_name; + + //! number of input/output, use to index and check + //! if set nr_input = 0, means do not provide input ExternDeviceTensor + //! if set nr_output = 0, means do not provide nr_output ExternDeviceTensor + size_t nr_input, nr_output; + + //! ptr of input/output ExternDeviceTensor + ExternDeviceTensor* input; + ExternDeviceTensor* output; + + //! device id + size_t device_id; + + //! extra info for misc dynamic config + uint8_t* extra_info; + //! size of extra_info + size_t extra_info_size; +} ExternCOprParam; + +/*! + * \brief operator descriptor + * + * Note: all the methods (except release) should be purely functional, so a + * descriptor can be shared by multiple operators + */ +typedef struct MGBOprDesc { + //! size of this MGBOprDesc object + uint32_t size; + + //! number of input/output vars + uint32_t nr_output; + + //! operator type name + const char* type_name; + + //! release this descriptor + void (*release)(struct MGBOprDesc* self); + + //! compute hash + size_t (*hash)(const struct MGBOprDesc* self); + + //! equality check + int (*is_same)(const struct MGBOprDesc* self, const struct MGBOprDesc* rhs); + + //! perform the computation + void (*execute)( + const struct MGBOprDesc* self, const MGBTensor* input, + const MGBTensor* output); + + //! 
infer output shapes from input shapes + void (*infer_shape)( + const struct MGBOprDesc* self, const MGBTensorShape* input, + MGBTensorShape* output); + + //! optional: infer output dtypes from input dtypes + void (*infer_dtype)( + const struct MGBOprDesc* self, const MGBDType* input, MGBDType* output); + + //! custom user data to be associated with this descriptor + void* user_data; + + //! dynamic extern c opr param + ExternCOprParam* dynamic_param; +} MGBOprDesc; + +//! foreach member function of MGBOprDesc to help initialization +#define MGB_OPR_DESC_FOREACH_MEM_FN(cb) \ + cb(release) cb(hash) cb(is_same) cb(execute) cb(infer_shape) + +//! operator loader +typedef struct MGBOprLoader { + //! name of the loader; must match the name given in + //! ExternCOprRunner::make_placeholder and would be written to graph dump + //! file + const char* name; + + /*! + * \brief create a new descriptor from saved buffer + * + * Note: there is no guarantee on the alignment of \p buf. + */ + MGBOprDesc* (*create_desc)(size_t nr_input, const void* buf, size_t buf_len); +} MGBOprLoader; + +//! APIs provided by megbrain +typedef struct MGBExternCOprApi { + /*! + * \brief register an operator loader + * + * content of the loader would be copied + * + * \return true if registration succeeds; false if duplicated name + */ + int (*register_loader)(const MGBOprLoader* loader); + + /*! + * \brief unregister a MGBOprLoader + * \return whether any loader is removed (i.e. whether the name exists) + */ + int (*unregister_loader)(const char* name); +} MGBExternCOprApi; + +//! get API ptr for specific version; return nullptr if version mismatch +MGB_PUBLIC const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version); + +#ifdef __cplusplus +} +#endif + +//! get the API ptr for current header version; return nullptr on mismatch +static inline const MGBExternCOprApi* mgb_get_extern_c_opr_api() { + return mgb_get_extern_c_opr_api_versioned(MGB_EXTERN_C_OPR_VERSION); +} + +static inline size_t mgb_get_dtype_size(MGBDType dtype) { + switch (dtype) { + case MGB_DTYPE_INT32: + return 4; + case MGB_DTYPE_FLOAT32: + return 4; + case MGB_DTYPE_UINT8: + return 1; + case MGB_DTYPE_FLOAT16: + case MGB_DTYPE_INT16: + return 2; + default: + __builtin_trap(); + return -1; + } +} + +static inline void mgb_init_opr_desc( + MGBOprDesc* desc, uint32_t nr_output, const char* type_name) { + memset(desc, 0, sizeof(MGBOprDesc)); + desc->size = sizeof(MGBOprDesc); + desc->nr_output = nr_output; + desc->type_name = type_name; +} + +#undef MGB_PUBLIC +#endif // MEGBRAIN_EXTERN_C_OPR_H + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index af9ad2e3..30930510 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -183,7 +183,7 @@ target_include_directories( if(NOT TINYNN_BUILD_FOR_NOT_STANDARD_OS) add_executable(tinynn_test_lite example/standard_OS/lite_main.c) - target_link_libraries(tinynn_test_lite TinyNN m) + target_link_libraries(tinynn_test_lite TinyNN m dl) endif() if(TINYNN_ACHIEVE_ALL AND NOT TINYNN_BUILD_FOR_NOT_STANDARD_OS) diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index 77ec8975..4035a68c 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -7,7 +7,9 @@ */ #include +#include #include +#include "extern_c_opr.h" #include "lite-c/common_enum_c.h" #include "lite-c/global_c.h" #include "lite-c/network_c.h" @@ -156,30 +158,115 @@ static 
TinyNnCallBack g_cb = { .tinynn_fread_cb = fread, }; #endif + +void usage(){ + fprintf(stderr, + "Usage:\n" + "\t--input-model/-m: input model path\n" + "\t--output-dir/-o: output file path\n" + "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" + "\t--input-data/-d: var=path/to/data_file\n" + "\t--data-shape/-s: data shape\n" + "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" + "\t--c-opr-init-interface/-i: the init API of your loader\n" + ); +} + +#if defined(_WIN32) +#include +#include +#define RTLD_LAZY 0 + +static void* dlopen(const char* file, int) { + return (void*)(LoadLibrary(file)); +} + +static void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return (void*)symbol; +} + +#else +#include +#endif + int main(int argc, char** argv) { LITE_set_log_level(WARN); #if TINYNN_CALLBACK_ENABLE register_tinynn_cb(TINYNN_CB_VERSION, g_cb); #endif - if (argc < 2) { - fprintf(stderr, "input error, please run with:\n"); - fprintf(stderr, - "tinynn_test " - " " - "..." - "\n"); - return -1; + char* model_path = NULL; + char* output_dir = NULL; + int print_out = 0; + char* data_str = NULL; + char* data_shape_str = NULL; + char* extern_so = NULL; + const char* c_opr_lib_interface = "mgb_c_opr_init"; + + const struct option long_options[] = { + {"input-model", required_argument, 0, 'm'}, + {"output-dir", required_argument, 0, 'o'}, + {"log-level", required_argument, 0, 'l'}, + {"input-data", required_argument, 0, 'd'}, + {"data-shape", required_argument, 0, 's'}, + {"c-opr-lib", required_argument, 0, 'c'}, + {"c-opr-init-interface", required_argument, 0, 'i'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0} + }; + const char* shortopt = "m:o:l:d:s:c:i:h"; + int c, option_idx = 0; + while(1) { + c = getopt_long(argc, argv, shortopt, long_options, &option_idx); + if(c == -1){ + break; + } + switch(c){ + case 'm': + model_path = optarg; + break; + case 'o': + output_dir = optarg; + break; + case 'l': + print_out = atoi(optarg); + break; + case 'd': + data_str = optarg; + break; + case 's': + data_shape_str = optarg; + break; + case 'c': + extern_so = optarg; + break; + case 'i': + c_opr_lib_interface = optarg; + break; + case 'h': + usage(); + exit(0); + break; + default: + abort(); + } } - const char* model_path = argv[1]; - const char* output_dir = argc > 2 ? argv[2] : NULL; - int print_out = argc > 3 ? atoi(argv[3]) : 0; + if (print_out == 2) { LITE_set_log_level(INFO); } else if (print_out == 3) { LITE_set_log_level(DEBUG); } - char* data_str = argc > 4 ? argv[4] : NULL; - char* data_shape_str = argc > 5 ? 
argv[5] : NULL; + + if(extern_so){ + void* handle = dlopen(extern_so, RTLD_LAZY); + EXAMPLE_ASSERT(handle, "load loader failed.\n"); + void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; + *(void**)&func = dlsym(handle, c_opr_lib_interface); + EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); + func(mgb_get_extern_c_opr_api_versioned); + } + LiteNetwork model; LITE_CAPI_CHECK( LITE_make_network(&model, *default_config(), *default_network_io()), @@ -283,6 +370,7 @@ int main(int argc, char** argv) { } LITE_CAPI_CHECK(LITE_destroy_network(model), "delete model failed\n"); + return 0; } diff --git a/runtime/schema/model.fbs b/runtime/schema/model.fbs index aadaefcc..abcadad1 100644 --- a/runtime/schema/model.fbs +++ b/runtime/schema/model.fbs @@ -176,6 +176,18 @@ table Reshape { output: int; } +table ExternOpr { + // the input tensor idx + input: [int]; + // opr name + name: string; + // opr user data + data: string; + data_len: uint32; + // the output tensor idx + output: [int]; +} + enum ArithMode : byte { ROUND = 0, NEGATE, @@ -348,9 +360,10 @@ union Instruction { IndexingMultiAxis = 14, ArgSort = 15, Reshape = 16, + ExternOpr = 17, // terminator to mark the end of instruction definitions // all instruction types should be placed before here - INSTRUCTION_TABLE_END = 17 + INSTRUCTION_TABLE_END = 18 } // device base data for a Model diff --git a/runtime/src/cheader/model_reader.h b/runtime/src/cheader/model_reader.h index dabd191d..cc8fe563 100644 --- a/runtime/src/cheader/model_reader.h +++ b/runtime/src/cheader/model_reader.h @@ -71,6 +71,10 @@ typedef const struct MegCC_Reshape_table *MegCC_Reshape_table_t; typedef struct MegCC_Reshape_table *MegCC_Reshape_mutable_table_t; typedef const flatbuffers_uoffset_t *MegCC_Reshape_vec_t; typedef flatbuffers_uoffset_t *MegCC_Reshape_mutable_vec_t; +typedef const struct MegCC_ExternOpr_table *MegCC_ExternOpr_table_t; +typedef struct MegCC_ExternOpr_table *MegCC_ExternOpr_mutable_table_t; +typedef const flatbuffers_uoffset_t *MegCC_ExternOpr_vec_t; +typedef flatbuffers_uoffset_t *MegCC_ExternOpr_mutable_vec_t; typedef const struct MegCC_Arithmetic_table *MegCC_Arithmetic_table_t; typedef struct MegCC_Arithmetic_table *MegCC_Arithmetic_mutable_table_t; typedef const flatbuffers_uoffset_t *MegCC_Arithmetic_vec_t; @@ -240,6 +244,15 @@ typedef flatbuffers_uoffset_t *MegCC_Model_mutable_vec_t; #endif #define MegCC_Reshape_type_hash ((flatbuffers_thash_t)0xe65906ba) #define MegCC_Reshape_type_identifier "\xba\x06\x59\xe6" +#ifndef MegCC_ExternOpr_file_identifier +#define MegCC_ExternOpr_file_identifier flatbuffers_identifier +#endif +/* deprecated, use MegCC_ExternOpr_file_identifier */ +#ifndef MegCC_ExternOpr_identifier +#define MegCC_ExternOpr_identifier flatbuffers_identifier +#endif +#define MegCC_ExternOpr_type_hash ((flatbuffers_thash_t)0x6183fc9d) +#define MegCC_ExternOpr_type_identifier "\x9d\xfc\x83\x61" #ifndef MegCC_Arithmetic_file_identifier #define MegCC_Arithmetic_file_identifier flatbuffers_identifier #endif @@ -805,6 +818,20 @@ __flatbuffers_define_vector_field(0, MegCC_Reshape, inputs, flatbuffers_int32_ve __flatbuffers_define_vector_field(1, MegCC_Reshape, input_types, MegCC_TensorType_vec_t, 0) __flatbuffers_define_scalar_field(2, MegCC_Reshape, output, flatbuffers_int32, int32_t, INT32_C(0)) +struct MegCC_ExternOpr_table { uint8_t unused__; }; + +static inline size_t MegCC_ExternOpr_vec_len(MegCC_ExternOpr_vec_t vec) +__flatbuffers_vec_len(vec) +static inline MegCC_ExternOpr_table_t 
MegCC_ExternOpr_vec_at(MegCC_ExternOpr_vec_t vec, size_t i) +__flatbuffers_offset_vec_at(MegCC_ExternOpr_table_t, vec, i, 0) +__flatbuffers_table_as_root(MegCC_ExternOpr) + +__flatbuffers_define_vector_field(0, MegCC_ExternOpr, input, flatbuffers_int32_vec_t, 0) +__flatbuffers_define_string_field(1, MegCC_ExternOpr, name, 0) +__flatbuffers_define_string_field(2, MegCC_ExternOpr, data, 0) +__flatbuffers_define_scalar_field(3, MegCC_ExternOpr, data_len, flatbuffers_uint32, uint32_t, UINT32_C(0)) +__flatbuffers_define_vector_field(4, MegCC_ExternOpr, output, flatbuffers_int32_vec_t, 0) + struct MegCC_Arithmetic_table { uint8_t unused__; }; static inline size_t MegCC_Arithmetic_vec_len(MegCC_Arithmetic_vec_t vec) @@ -992,7 +1019,8 @@ __flatbuffers_define_union(flatbuffers_, MegCC_Instruction) #define MegCC_Instruction_IndexingMultiAxis ((MegCC_Instruction_union_type_t)UINT8_C(14)) #define MegCC_Instruction_ArgSort ((MegCC_Instruction_union_type_t)UINT8_C(15)) #define MegCC_Instruction_Reshape ((MegCC_Instruction_union_type_t)UINT8_C(16)) -#define MegCC_Instruction_INSTRUCTION_TABLE_END ((MegCC_Instruction_union_type_t)UINT8_C(17)) +#define MegCC_Instruction_ExternOpr ((MegCC_Instruction_union_type_t)UINT8_C(17)) +#define MegCC_Instruction_INSTRUCTION_TABLE_END ((MegCC_Instruction_union_type_t)UINT8_C(18)) static inline const char *MegCC_Instruction_type_name(MegCC_Instruction_union_type_t type) { @@ -1014,6 +1042,7 @@ static inline const char *MegCC_Instruction_type_name(MegCC_Instruction_union_ty case MegCC_Instruction_IndexingMultiAxis: return "IndexingMultiAxis"; case MegCC_Instruction_ArgSort: return "ArgSort"; case MegCC_Instruction_Reshape: return "Reshape"; + case MegCC_Instruction_ExternOpr: return "ExternOpr"; case MegCC_Instruction_INSTRUCTION_TABLE_END: return "INSTRUCTION_TABLE_END"; default: return ""; } @@ -1039,6 +1068,7 @@ static inline int MegCC_Instruction_is_known_type(MegCC_Instruction_union_type_t case MegCC_Instruction_IndexingMultiAxis: return 1; case MegCC_Instruction_ArgSort: return 1; case MegCC_Instruction_Reshape: return 1; + case MegCC_Instruction_ExternOpr: return 1; case MegCC_Instruction_INSTRUCTION_TABLE_END: return 1; default: return 0; } diff --git a/runtime/src/vm.c b/runtime/src/vm.c index b02891b5..8618946e 100644 --- a/runtime/src/vm.c +++ b/runtime/src/vm.c @@ -18,6 +18,7 @@ void register_all(VM* vm) { register_dimshuffle(vm); register_broadcast_shape_of(vm); register_reshape(vm); + register_extern_opr(vm); } VM* vm_global_inst() { diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c new file mode 100644 index 00000000..e39ca310 --- /dev/null +++ b/runtime/src/vm/extern_opr.c @@ -0,0 +1,283 @@ +/** + * \file runtime/src/vm/extern_opr.c + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "vm.h" +#include "utils.h" +#include "extern_c_opr.h" + +#if ENABLE_INST_EXTERN_OPR + +#include +#include +#include +#include + +#include "init.h" +#include "parse.h" +#include "vm/common.h" +#include "vm/instruction.h" +#include "vm/registry.h" + +typedef struct LoaderMap { + MGBOprLoader loader; +} LoaderMap; + +typedef struct LoaderMapVec { + LoaderMap* loader_map; + size_t size; + size_t capacity; +} LoaderMapVec; + +static LoaderMapVec loader_maps; + +static int insert_loader(LoaderMapVec* lm, const MGBOprLoader* loader) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, loader->name) == 0) { + return 0; + } + } + + if (lm->capacity == 0) { + lm->capacity = 2; + lm->loader_map = tinynn_malloc(sizeof(LoaderMap) * lm->capacity); + } + if (lm->size >= lm->capacity) { + lm->capacity *= 2; + LoaderMap* tmp = tinynn_malloc(sizeof(LoaderMap) * lm->capacity); + memcpy(tmp, lm->loader_map, sizeof(LoaderMap) * lm->size); + tinynn_free(lm->loader_map); + lm->loader_map = tmp; + } + + lm->loader_map[lm->size].loader = *loader; + ++lm->size; + return 1; +} + +static int register_loader(const MGBOprLoader* loader) { + return insert_loader(&loader_maps, loader); +} + +static int delete_loader(LoaderMapVec* lm, const char* name) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, name) == 0) { + memmove(lm->loader_map + i, lm->loader_map + i + 1, + lm->size - i - 1); + --lm->size; + return 1; + } + } + return 0; +} + +static int unregister_loader(const char* name) { + return delete_loader(&loader_maps, name); +} + +static LoaderMap* find_loader_by_name(const LoaderMapVec* lm, + const char* name) { + for (int i = 0; i < lm->size; ++i) { + if (strcmp(lm->loader_map[i].loader.name, name) == 0) { + return lm->loader_map + i; + } + } + return NULL; +} + +static void free_loader_maps(LoaderMapVec* lm) { + if (lm->loader_map) { + tinynn_free(lm->loader_map); + lm->loader_map = NULL; + } +} + +//! get API ptr for specific version; return nullptr if version mismatch +const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { + static MGBExternCOprApi api; + api.unregister_loader = unregister_loader; + TINYNN_ASSERT_MSG(version >= 0x24, "Extern opr loader version must greater than 0x24.\n"); + + if (version != MGB_EXTERN_C_OPR_VERSION) { + return NULL; + } + + api.register_loader = register_loader; + return &api; +} + +// Convert Tensor to MGBTensor, except MGBTensor.data. 
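+// Dims are copied element-wise and the dtype enum is mapped
+// (TinyNN_FLOAT -> MGB_DTYPE_FLOAT32, TinyNN_FLOAT16 -> MGB_DTYPE_FLOAT16,
+// TinyNN_INT -> MGB_DTYPE_INT32, TinyNN_INT16 -> MGB_DTYPE_INT16,
+// TinyNN_UINT8 -> MGB_DTYPE_UINT8); MGBTensor2Tensor below performs the
+// reverse mapping, and the data pointers are only filled in just before
+// desc->execute() is called.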
+static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ + mgb_tensor->layout.shape.ndim = tensor->layout.nr_dim; + for(int i = 0; i < tensor->layout.nr_dim; ++i){ + mgb_tensor->layout.shape.shape[i] = tensor->layout.dims[i]; + } + switch(tensor->dtype.type_enum){ + case TinyNN_FLOAT: + mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT32; + break; + case TinyNN_FLOAT16: + mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT16; + break; + case TinyNN_INT: + mgb_tensor->layout.dtype = MGB_DTYPE_INT32; + break; + case TinyNN_INT16: + mgb_tensor->layout.dtype = MGB_DTYPE_INT16; + break; + case TinyNN_UINT8: + mgb_tensor->layout.dtype = MGB_DTYPE_UINT8; + break; + default: + TINYNN_ASSERT_MSG(0, "Unsupport data type\n"); + } +} + +static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ + tensor->layout.nr_dim = mgb_tensor->layout.shape.ndim; + for(int i = 0; i < mgb_tensor->layout.shape.ndim; ++i){ + tensor->layout.dims[i] = mgb_tensor->layout.shape.shape[i]; + } + + switch(mgb_tensor->layout.dtype){ + case MGB_DTYPE_FLOAT32: + tensor->dtype.type_enum = TinyNN_FLOAT; + break; + case MGB_DTYPE_FLOAT16: + tensor->dtype.type_enum = TinyNN_FLOAT16; + break; + case MGB_DTYPE_INT32: + tensor->dtype.type_enum = TinyNN_INT; + break; + case MGB_DTYPE_INT16: + tensor->dtype.type_enum = TinyNN_INT16; + break; + case MGB_DTYPE_UINT8: + tensor->dtype.type_enum = TinyNN_UINT8; + break; + default: + TINYNN_ASSERT_MSG(0, "Unsupport data type\n"); + } +} + +static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, + VM* vm) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + DeviceModel* model = get_active_device_model(vm); + ns(ExternOpr_table_t) fbs_extern_opr = (ns(ExternOpr_table_t))(fbs_inst); + inst->tag = TinyNN_INST_EXTERN_OPR; + + flatbuffers_int32_vec_t fbs_inputs = ns(ExternOpr_input(fbs_extern_opr)); + extern_opr->nr_input = flatbuffers_int32_vec_len(fbs_inputs); + extern_opr->inputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_input); + for(int i = 0; i < extern_opr->nr_input; ++i){ + extern_opr->inputs[i] = model->tensors + fbs_inputs[i]; + } + + flatbuffers_int32_vec_t fbs_outputs = ns(ExternOpr_output(fbs_extern_opr)); + extern_opr->nr_output = flatbuffers_int32_vec_len(fbs_outputs); + extern_opr->outputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->outputs[i] = model->tensors + fbs_outputs[i]; + } + + const char* name = ns(ExternOpr_name(fbs_extern_opr)); + const void* data = ns(ExternOpr_data(fbs_extern_opr)); + size_t data_len = ns(ExternOpr_data_len(fbs_extern_opr)); + + LoaderMap* loader_map = find_loader_by_name(&loader_maps, name); + TINYNN_ASSERT_MSG(loader_map, "Wrong loader.\n"); + extern_opr->desc = loader_map->loader.create_desc(extern_opr->nr_input, + data, data_len); + + extern_opr->mgb_inputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); + MGBTensorShape* inputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); + MGBDType* inputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); + for(int i = 0; i < extern_opr->nr_input; ++i){ + Tensor2MGBTensor(extern_opr->inputs[i], extern_opr->mgb_inputs + i); + inputs_shape[i] = extern_opr->mgb_inputs[i].layout.shape; + inputs_type[i] = extern_opr->mgb_inputs[i].layout.dtype; + } + + extern_opr->mgb_outputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); + MGBTensorShape* outputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); + MGBDType* 
outputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); + + extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, outputs_shape); + if(extern_opr->desc->infer_dtype){ + extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, outputs_type); + }else{ + for(int i = 0; i < extern_opr->nr_output; ++i){ + outputs_type[i] = inputs_type[0]; + } + } + + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs[i].layout.dtype = outputs_type[i]; + extern_opr->mgb_outputs[i].layout.shape.ndim = outputs_shape[i].ndim; + for(int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j){ + extern_opr->mgb_outputs[i].layout.shape.shape[j] = outputs_shape[i].shape[j]; + } + } + + tinynn_free(outputs_shape); + tinynn_free(outputs_type); + + tinynn_free(inputs_shape); + tinynn_free(inputs_type); + + return TinyNN_SUCCESS; +} + +static TinyNNStatus execute(Instruction* inst, VM* vm) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + + for(int i = 0; i < extern_opr->nr_input; ++i){ + extern_opr->mgb_inputs[i].data = extern_opr->inputs[i]->ptr; + } + for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs[i].data = extern_opr->outputs[i]->ptr; + } + extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, extern_opr->mgb_outputs); + for(int i = 0; i < extern_opr->nr_output; ++i){ + MGBTensor2Tensor(extern_opr->mgb_outputs + i, extern_opr->outputs[i]); + } + + return TinyNN_SUCCESS; +} + +static TinyNNStatus destruct(VM* vm, Instruction* inst) { + ExternOpr* extern_opr = &inst->workload.extern_opr; + + FREE(extern_opr->inputs); + FREE(extern_opr->outputs); + FREE(extern_opr->mgb_outputs); + FREE(extern_opr->mgb_inputs); + + free_loader_maps(&loader_maps); + + return TinyNN_SUCCESS; +} + +void register_extern_opr(VM* vm) { + vm_register_instruction_load(vm, ns(Instruction_ExternOpr), &load); + vm_register_instruction_call(vm, TinyNN_INST_EXTERN_OPR, &execute); + vm_register_instruction_destruct(vm, TinyNN_INST_EXTERN_OPR, &destruct); +} +#else +void register_extern_opr(VM* vm) {} + +const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int i) { + TINYNN_ASSERT_MSG( + 0, + "Should NOT execute here!!!\n" + "Maybe there is no extern opr in model, " + "but command line argument --c-opr-lib/-c is provided.\n"); + return NULL; +} +#endif +// vim: syntax=cpp.doxygen diff --git a/runtime/src/vm/instruction.h b/runtime/src/vm/instruction.h index 0896839c..3fa4fbd3 100644 --- a/runtime/src/vm/instruction.h +++ b/runtime/src/vm/instruction.h @@ -12,6 +12,7 @@ #include "data_struct.h" #include "model_reader.h" #include "runtime_inst_switch.h" +#include "extern_c_opr.h" // clang-format off #define FOR_EACH_INSTRUCTION_TYPE(cb) \ @@ -30,7 +31,8 @@ cb(TinyNN_INST_TYPECVT)\ cb(TinyNN_INST_INDEXING_MULTI_AXIS)\ cb(TinyNN_INST_ARGSORT)\ - cb(TinyNN_INST_RESHAPE) + cb(TinyNN_INST_RESHAPE)\ + cb(TinyNN_INST_EXTERN_OPR) typedef enum { TinyNN_INST_NONE = 0, @@ -202,6 +204,16 @@ typedef struct { Tensor* output; } Reshape; +typedef struct { + int32_t nr_input; + int32_t nr_output; + Tensor** inputs; + MGBTensor* mgb_inputs; + MGBTensor* mgb_outputs; + Tensor** outputs; + MGBOprDesc* desc; +} ExternOpr; + typedef struct Instruction { InstructionType tag; union { @@ -221,6 +233,7 @@ typedef struct Instruction { IndexingMultiAxis indexing_multi_axis; ArgSort argsort; Reshape reshape; + ExternOpr extern_opr; } workload; #if TINYNN_PROFILE_KERNEL float time_ms; diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt index 
7cf9db71..1d0b0759 100644 --- a/runtime/test/CMakeLists.txt +++ b/runtime/test/CMakeLists.txt @@ -68,7 +68,8 @@ target_include_directories( ./runtime/ ${SCHEMA_OUTPUT} ${PROJECT_SOURCE_DIR}/../../third_party/flatcc/include - ${PROJECT_SOURCE_DIR}/../../third_part/gtest/include) + ${PROJECT_SOURCE_DIR}/../../third_part/gtest/include + ${PROJECT_SOURCE_DIR}/../../immigration/include) target_link_libraries(TinyNNTest gtest) diff --git a/runtime/version.ld b/runtime/version.ld index 356a3a34..76275193 100644 --- a/runtime/version.ld +++ b/runtime/version.ld @@ -4,6 +4,7 @@ global: default_config; default_network_io; register_tinynn_cb; + mgb_get_extern_c_opr_api_versioned; local: diff --git a/script/build_and_test_not_standard_os.sh b/script/build_and_test_not_standard_os.sh index 9437ef54..cdfdbb44 100755 --- a/script/build_and_test_not_standard_os.sh +++ b/script/build_and_test_not_standard_os.sh @@ -28,7 +28,7 @@ cmake --build "$MEGCC_BUILD_DIR" -j$(nproc) --target mgb-to-tinynn --target mgb- function check_key_words() { #elf self mangle words, we do not care!! - white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE Date: Wed, 28 Dec 2022 17:20:05 +0800 Subject: [PATCH 02/17] feat(compiler): add quant fuse_and_relu and Convbackdata naive kernel --- .../BareMetal/ConvBackDataKernel.cpp | 52 +++-- .../KernelGen/BareMetal/ElemwiseMultiType.cpp | 14 +- compiler/test/kernel/opr/naive/conv.cpp | 190 ++++++++++++------ .../kernel/opr/naive/elemwise_multitype.cpp | 3 +- 4 files changed, 174 insertions(+), 85 deletions(-) diff --git a/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp b/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp index 2cd81319..9a453f0b 100644 --- a/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp +++ b/compiler/lib/KernelGen/BareMetal/ConvBackDataKernel.cpp @@ -11,7 +11,6 @@ #include #include -#include "Activation.h" #include "ConvBackDataKernel.h" #include "FormatHelper.h" #include "Utils/StringTemplate.h" @@ -64,6 +63,19 @@ std::string gen_inline_addr(std::string format_str, std::string sparse) { return ss.str(); } +std::string gen_dep() { + return R"( + static inline int8_t fp32_to_int8(float src){ + int res = roundf(src); + if(res > 127){ + res=127; + }else if(res < -128){ + res=-128; + } + return (int8_t)(res); + } + )"; +} std::string get_format(TContext* ctx) { auto format_str = ctx->getAttrStr("format"); return format_str; @@ -83,12 +95,17 @@ bool ConvBackDataGeneral::IsAvailable(TContext* ctx) const { bool param_mode_ok = (ctx->getAttrStr("format") == "NCHW" || ctx->getAttrStr("format") == "NCHW44") && ctx->getAttrStr("mode") == "CROSS_CORRELATION"; - bool type_float_ok = ctx->getAttrInt("nr_operands") >= 3 && + bool type_float_ok = ctx->getAttrInt("nr_operands") == 3 && ((ctx->getAttrOprand("operand:0").dtype == "f32" && ctx->getAttrOprand("operand:1").dtype == "f32" && ctx->getAttrOprand("operand:2").dtype == "f32")); + bool type_qint_ok = + ctx->getAttrInt("nr_operands") == 3 && + (Utils::is_quant_dtype(ctx->getAttrOprand("operand:0").dtype, 8) && + Utils::is_quant_dtype(ctx->getAttrOprand("operand:1").dtype, 8) && + Utils::is_quant_dtype(ctx->getAttrOprand("operand:2").dtype, 8)); - return param_mode_ok && (type_float_ok); + return param_mode_ok && (type_float_ok || type_qint_ok); } std::string ConvBackDataGeneral::GetKernelSymbol(TContext* ctx) const { @@ -136,8 +153,13 @@ std::string 
ConvBackDataGeneral::GetKernelBody(TContext* context) const { auto flt_specifier = Utils::cvt_dtype_specifier(flt_dtype); auto dst_specifier = Utils::cvt_dtype_specifier(dst_dtype); std::string acc_specifier = "float"; + std::string convert = ""; + std::string compute_kern = "(*sval) + dval * fval"; if (src_specifier == "int8_t" && flt_specifier == "int8_t") { - acc_specifier = "int"; + convert = "fp32_to_int8"; + compute_kern = + "((*sval) * scale + dval * dst_scale * fval * " + "flt_scale)/scale"; } uint32_t spatial_start = 2; @@ -168,10 +190,10 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { ss << R"( #include )"; - ss << GenActivation::gen_func_call_with_typecvt_dep( - noline_mode, acc_specifier, dst_specifier) - << "\n"; ss << gen_inline_addr(filter_format_str, sparse_str); + if (src_specifier == "int8_t" && flt_specifier == "int8_t") { + ss << gen_dep(); + } ss << GenCommonRet() << " " << GetKernelSignature(context) << "{\n"; ss << "const uint32_t spatial_start = " << spatial_start << ";\n"; ss << "const uint32_t channel_pos = " << channel_pos << ";\n"; @@ -244,7 +266,7 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { uint32_t oc_idx = group_idx * ocpg + ocpg_idx; for (uint32_t oh_idx = 0; oh_idx < oh; ++oh_idx) { for (uint32_t ow_idx = 0; ow_idx < ow; ++ow_idx) { - ${acc_specifier} dval = dst_ptr[${dst_layout_iter_symbol}(batch_idx, oc_idx, oh_idx, + ${dst_specifier} dval = dst_ptr[${dst_layout_iter_symbol}(batch_idx, oc_idx, oh_idx, ow_idx, dst_layout.stride, true)]; for (uint32_t fh_idx = 0; fh_idx < fh; ++fh_idx) { @@ -258,14 +280,16 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { ++icpg_idx) { uint32_t ic_idx = group_idx * icpg + icpg_idx; - ${acc_specifier}* sval = &src_ptr[${src_layout_iter_symbol}( + ${src_specifier}* sval = &src_ptr[${src_layout_iter_symbol}( batch_idx, ic_idx, ih_idx, iw_idx, src_layout.stride, false)]; - ${acc_specifier} fval = flt_ptr[${filter_iter_symbol}( + ${flt_specifier} fval = flt_ptr[${filter_iter_symbol}( group_idx, ocpg_idx, icpg_idx, fh_idx, fw_idx, filter_stride)]; - *sval += dval * fval; + ${acc_specifier} tmp_mid_val0 = ${compute_kern}; + ${src_specifier} tmp_mid_val = ${convert}(tmp_mid_val0); + *sval = tmp_mid_val; } } } @@ -290,14 +314,12 @@ std::string ConvBackDataGeneral::GetKernelBody(TContext* context) const { .add("filter_iter_symbol", "get_filter_addr_" + filter_format_str + "_" + sparse_str) - .add("act_func", GenActivation::gen_func_call_with_typecvt( - noline_mode, "dval", acc_specifier, - dst_specifier, "scale", - "flt_scale", "dst_scale")) .add("src_specifier", src_specifier) .add("flt_specifier", flt_specifier) .add("dst_specifier", dst_specifier) .add("acc_specifier", acc_specifier) + .add("convert", convert) + .add("compute_kern", compute_kern) .render(body_template); return ss.str(); } diff --git a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp index 5d384819..b5cec2c9 100644 --- a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp +++ b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp @@ -35,7 +35,14 @@ std::string gen_dep(std::string mode) { std::string gen_binary(std::string mode) { if (mode == "QADD") { - return "fp32_to_int8((scale_0 * val_0 + scale_1 * val_1) * scale_div)"; + return "float out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " + "val_1) * scale_div);"; + } else if (mode == "QFUSE_ADD_RELU") { + return R"( + float val0 = scale_0 * val_0; + float 
val1 = scale_1 * val_1; + float out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); + )"; } else { CC_ABORT << "not support mode " << mode.c_str() << "\n"; } @@ -48,7 +55,7 @@ bool ElemwiseMultiTypeKernel::IsAvailable(TContext* context) const { auto mode = context->getAttrStr("mode"); auto nr_operands = context->getAttrInt("nr_operands"); bool nr_operands_ok = nr_operands == 3; - bool mode_ok_binary = mode == "QADD"; + bool mode_ok_binary = mode == "QADD" || mode == "QFUSE_ADD_RELU"; return nr_operands_ok && (mode_ok_binary); } @@ -99,7 +106,8 @@ std::string ElemwiseMultiTypeKernel::GetKernelBody(TContext* context) const { for(size_t i = 0; i < nr_elem; ++i){ ${op0_specifier} val_0 = input_0[i]; ${op1_specifier} val_1 = input_1[i]; - output_data[i] = ${act}; + ${act}; + output_data[i] = out_val; } return TinyNN_SUCCESS; } diff --git a/compiler/test/kernel/opr/naive/conv.cpp b/compiler/test/kernel/opr/naive/conv.cpp index a234282d..838b81dc 100644 --- a/compiler/test/kernel/opr/naive/conv.cpp +++ b/compiler/test/kernel/opr/naive/conv.cpp @@ -11,7 +11,58 @@ using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; - +namespace { +void nchw_backdata(Checker& checker) { + ConvolutionBackwardData::Param param; + param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; + param.format = ConvolutionBackwardData::Param::Format::NCHW; + checker.set_epsilon(1e-4); + for (size_t n : {2}) + for (size_t oc : {1, 4}) + for (size_t ic : {1, 4}) + for (size_t hw : {7, 12}) + for (size_t kernel : {1, 3}) + for (size_t pad : {(size_t)0, kernel / 2}) + for (size_t stride : {1, 2}) { + param.pad_h = pad; + param.pad_w = pad; + param.stride_h = stride; + param.stride_w = stride; + param.sparse = + ConvBiasForward::Param::Sparse::DENSE; + checker.set_param(param); + checker.execs({{oc, ic, kernel, kernel}, + {n, oc, hw, hw}, + {n, ic, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2}}); + if (ic == oc) { + size_t group = oc; + param.sparse = ConvolutionBackwardData:: + Param::Sparse::GROUP; + checker.set_param(param); + checker.execs( + {{group, 1, 1, kernel, kernel}, + {n, oc, hw, hw}, + {n, ic, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2}}); + } + } +} +} // namespace TEST(NAIVE, ConvBiasNCHWQS8) { Checker checker(Arch::BAREMETAL); checker.set_kernel_symbol("kernel_.*"); @@ -222,42 +273,7 @@ TEST(NAIVE, ConvBackDataNCHW) { Checker checker(Arch::BAREMETAL); checker.set_kernel_symbol("kernel_.*"); ConvolutionBackwardData::Param param; - ConstRNG seq(2.0); - ConstRNG const_rng(1.0); - - checker.set_rng(1, &seq); - checker.set_rng(0, &const_rng); - - param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; - param.format = ConvolutionBackwardData::Param::Format::NCHW; - checker.set_epsilon(1e-4); - for (size_t n : {2}) - for (size_t oc : {1, 4}) - for (size_t ic : {1, 4}) - for (size_t hw : {7, 12}) - for (size_t kernel : {1, 3}) - for (size_t pad : {(size_t)0, kernel / 2}) - for (size_t stride : {1, 2}) { - param.pad_h = pad; - param.pad_w = pad; - param.stride_h = stride; - param.stride_w = stride; - param.sparse = ConvBiasForward::Param:: - Sparse::DENSE; - checker.set_param(param); - checker.execs({{oc, ic, kernel, kernel}, - {n, oc, hw, hw}, - {n, ic, (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, 
(hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2}}); - if (ic == oc) { - size_t group = oc; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::GROUP; - checker.set_param(param); - checker.execs({{ group, 1, 1, kernel, kernel}, - {n, oc, hw, hw}, - {n, ic, (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2}}); - } - } + nchw_backdata(checker); } TEST(NAIVE, ConvBackDataNCHW44) { @@ -267,35 +283,77 @@ TEST(NAIVE, ConvBackDataNCHW44) { checker.set_epsilon(1e-4); param.compute_mode = ConvolutionBackwardData::Param::ComputeMode::DEFAULT; param.format = ConvolutionBackwardData::Param::Format::NCHW44; - for (size_t n : {12}) - for (size_t oc : {4, 12}) - for (size_t ic : {4, 12}) - for (size_t hw : {7, 12}) - for (size_t kernel : {1, 3}) - for (size_t pad : {(size_t)0, kernel / 2}) - for (size_t stride : {1, 2}) { - param.pad_h = pad; - param.pad_w = pad; - param.stride_h = stride; - param.stride_w = stride; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::DENSE; + for (size_t n : {12}) + for (size_t oc : {4, 12}) + for (size_t ic : {4, 12}) + for (size_t hw : {7, 12}) + for (size_t kernel : {1, 3}) + for (size_t pad : {(size_t)0, kernel / 2}) + for (size_t stride : {1, 2}) { + param.pad_h = pad; + param.pad_w = pad; + param.stride_h = stride; + param.stride_w = stride; + param.sparse = ConvolutionBackwardData::Param:: + Sparse::DENSE; + checker.set_param(param); + checker.execs( + {{oc / 4, ic / 4, kernel, kernel, 4, 4}, + {n, oc / 4, hw, hw, 4}, + {n, ic / 4, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2, + 4}}); + if (ic == oc) { + size_t group = oc; + param.sparse = ConvolutionBackwardData:: + Param::Sparse::GROUP; checker.set_param(param); - checker.execs({{oc / 4, ic / 4, kernel, - kernel, 4, 4}, - {n, oc / 4, hw, hw, 4}, - {n, ic / 4 , (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2, 4} - }); - if (ic == oc) { - size_t group = oc; - param.sparse = ConvolutionBackwardData::Param:: - Sparse::GROUP; - checker.set_param(param); - checker.execs({{group / 4, 1, 1, kernel, - kernel, 4}, - {n, oc / 4, hw, hw, 4}, - {n, ic / 4 , (hw-1)*stride+(kernel-1)*param.dilate_h+1-pad*2, (hw-1)*stride+(kernel-1)*param.dilate_w+1-pad*2, 4} - }); - } + checker.execs( + {{group / 4, 1, 1, kernel, kernel, + 4}, + {n, oc / 4, hw, hw, 4}, + {n, ic / 4, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_h + + 1 - pad * 2, + (hw - 1) * stride + + (kernel - 1) * + param.dilate_w + + 1 - pad * 2, + 4}}); } + } +} + +TEST(NAIVE, ConvBackDataNCHWQS8) { + Checker checker(Arch::BAREMETAL); + checker.set_kernel_symbol("kernel_.*"); + ConvolutionBackwardData::Param param; + + checker.set_dtype(0, dtype::QuantizedS8(1.0f)); + checker.set_dtype(1, dtype::QuantizedS8(2.0f)); + checker.set_dtype(2, dtype::QuantizedS8(2.0f)); + nchw_backdata(checker); +} + +TEST(NAIVE, ConvBackDataNCHWQS8Overflow) { + Checker checker(Arch::BAREMETAL); + checker.set_kernel_symbol("kernel_.*"); + ConvolutionBackwardData::Param param; + UniformIntRNG qint_rng(30, 50); + checker.set_rng(0, &qint_rng); + checker.set_rng(1, &qint_rng); + + checker.set_dtype(0, dtype::QuantizedS8(1.0f)); + checker.set_dtype(1, dtype::QuantizedS8(2.0f)); + checker.set_dtype(2, dtype::QuantizedS8(2.0f)); + nchw_backdata(checker); } diff --git a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp index 
82b29831..4e3c1fa0 100644 --- a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp +++ b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp @@ -20,7 +20,8 @@ TEST(NAIVE, ElementwiseMultitypeBinary) { checker.set_dtype(1, dtype::QuantizedS8(2.f)); checker.set_dtype(2, dtype::QuantizedS8(3.f)); ElemwiseMultiType::Param param; - for (auto mode : {MODE::QADD}) { + + for (auto mode : {MODE::QADD, MODE::QFUSE_ADD_RELU}) { param.mode = mode; checker.set_param(param); checker.execs({{1}, {1}, {}}); From 61b1d8e925270db5c4be9cf7fa257b6ab4cd3150 Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Thu, 29 Dec 2022 13:25:30 +0800 Subject: [PATCH 03/17] feat(misc): misc opt 1: config run iter count 2: make compiler more friendly for debugging --- compiler/CMakeLists.txt | 2 +- runtime/example/standard_OS/lite_main.c | 99 +++++++++++++++---------- runtime/src/lite/network.c | 4 +- 3 files changed, 61 insertions(+), 44 deletions(-) diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt index 9bdd7419..d904a0bf 100644 --- a/compiler/CMakeLists.txt +++ b/compiler/CMakeLists.txt @@ -115,7 +115,7 @@ set(TCC_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/../third_party/tcc/include) set(RUNTIME_SRC_DIR ${PROJECT_SOURCE_DIR}/../runtime/src) set(RUNTIME_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/../runtime/include) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -g") include(cmake/GenLiteSchema.cmake) gen_lite_schema() diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index 4035a68c..ce7fe294 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -6,8 +6,8 @@ * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ -#include #include +#include #include #include "extern_c_opr.h" #include "lite-c/common_enum_c.h" @@ -58,27 +58,31 @@ static void write_file(const char* file_name, void* ptr, size_t length) { } static inline void run_model(LiteNetwork model, const char* output_dir, - int instance_cnt, const int print_out) { + int instance_cnt, const int print_out, + const size_t warmup_count, + const size_t iter_count) { + size_t number = iter_count; + size_t warmup = warmup_count; #if TINYNN_DUMP_TENSOR || DEBUG_MODE - const int number = 1; - const int warmup = 0; -#else - const int number = 100; - const int warmup = 20; + number = 1; + warmup = 0; + printf("(DEBUG or TINYNN_DUMP_TENSOR enable)overwriting run iter to: %zu, " + "warmup count to: %zu\n", + number, warmup); #endif - for (int i = 0; i < warmup; i++) { + for (size_t i = 0; i < warmup; i++) { LITE_CAPI_CHECK(LITE_forward(model), "run model failed\n"); LITE_CAPI_CHECK(LITE_wait(model), "wait model failed\n"); - printf("warmup iter %d finished.\n\n", i); + printf("warmup iter %zu finished.\n", i); } struct timeval start; struct timeval end; gettimeofday(&start, NULL); - for (int i = 0; i < number; i++) { + for (size_t i = 0; i < number; i++) { LITE_CAPI_CHECK(LITE_forward(model), "run model failed\n"); LITE_CAPI_CHECK(LITE_wait(model), "wait model failed\n"); - printf("execute iter %d finished.\n", i); + printf("execute iter %zu finished.\n", i); } gettimeofday(&end, NULL); unsigned long diff = @@ -159,17 +163,19 @@ static TinyNnCallBack g_cb = { }; #endif -void usage(){ - fprintf(stderr, - "Usage:\n" - "\t--input-model/-m: input model path\n" - "\t--output-dir/-o: output file path\n" - "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" - "\t--input-data/-d: var=path/to/data_file\n" - "\t--data-shape/-s: 
data shape\n" - "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" - "\t--c-opr-init-interface/-i: the init API of your loader\n" - ); +void usage() { + fprintf(stderr, + "Usage:\n" + "\t--input-model/-m: input model path\n" + "\t--output-dir/-o: output file path\n" + "\t--log-level/-l: 0:ERROR, 1:WARN, 2:INFO, 3:DEBUG\n" + "\t--input-data/-d: var=path/to/data_file, create by: " + "python3 compiler/script/debug/gen_input.py\n" + "\t--data-shape/-s: data shape\n" + "\t--c-opr-lib/-c: path to extern opr lib file(.so)\n" + "\t--c-opr-init-interface/-i: the init API of your loader\n" + "\t--warmup-count/-w: warmup count before run model\n" + "\t--iter-count/-t: iter run model\n"); } #if defined(_WIN32) @@ -202,26 +208,29 @@ int main(int argc, char** argv) { char* data_shape_str = NULL; char* extern_so = NULL; const char* c_opr_lib_interface = "mgb_c_opr_init"; + size_t warmup_count = 1; + size_t iter = 10; const struct option long_options[] = { - {"input-model", required_argument, 0, 'm'}, - {"output-dir", required_argument, 0, 'o'}, - {"log-level", required_argument, 0, 'l'}, - {"input-data", required_argument, 0, 'd'}, - {"data-shape", required_argument, 0, 's'}, - {"c-opr-lib", required_argument, 0, 'c'}, - {"c-opr-init-interface", required_argument, 0, 'i'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - const char* shortopt = "m:o:l:d:s:c:i:h"; + {"input-model", required_argument, 0, 'm'}, + {"output-dir", required_argument, 0, 'o'}, + {"log-level", required_argument, 0, 'l'}, + {"input-data", required_argument, 0, 'd'}, + {"data-shape", required_argument, 0, 's'}, + {"c-opr-lib", required_argument, 0, 'c'}, + {"c-opr-init-interface", required_argument, 0, 'i'}, + {"warmup-count", required_argument, 0, 'w'}, + {"iter-count", required_argument, 0, 't'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0}}; + const char* shortopt = "m:o:l:d:s:c:i:w:t:h"; int c, option_idx = 0; - while(1) { + while (1) { c = getopt_long(argc, argv, shortopt, long_options, &option_idx); - if(c == -1){ + if (c == -1) { break; } - switch(c){ + switch (c) { case 'm': model_path = optarg; break; @@ -243,6 +252,12 @@ int main(int argc, char** argv) { case 'i': c_opr_lib_interface = optarg; break; + case 'w': + warmup_count = atoi(optarg); + break; + case 't': + iter = atoi(optarg); + break; case 'h': usage(); exit(0); @@ -251,14 +266,14 @@ int main(int argc, char** argv) { abort(); } } - + if (print_out == 2) { LITE_set_log_level(INFO); } else if (print_out == 3) { LITE_set_log_level(DEBUG); } - if(extern_so){ + if (extern_so) { void* handle = dlopen(extern_so, RTLD_LAZY); EXAMPLE_ASSERT(handle, "load loader failed.\n"); void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; @@ -266,7 +281,7 @@ int main(int argc, char** argv) { EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); func(mgb_get_extern_c_opr_api_versioned); } - + LiteNetwork model; LITE_CAPI_CHECK( LITE_make_network(&model, *default_config(), *default_network_io()), @@ -354,7 +369,8 @@ int main(int argc, char** argv) { nr_input, input_cnt); } } - run_model(model, output_dir, instance_cnt, print_out); + run_model(model, output_dir, instance_cnt, print_out, warmup_count, + iter); for (size_t i = 0; i < nr_input; ++i) { free(data[i]); } @@ -366,7 +382,8 @@ int main(int argc, char** argv) { } //! 
if no input data set, just run the model with random input data if (instance_cnt == 0) { - run_model(model, output_dir, instance_cnt, print_out); + run_model(model, output_dir, instance_cnt, print_out, warmup_count, + iter); } LITE_CAPI_CHECK(LITE_destroy_network(model), "delete model failed\n"); diff --git a/runtime/src/lite/network.c b/runtime/src/lite/network.c index 2d8a8bf7..35b6f6e0 100644 --- a/runtime/src/lite/network.c +++ b/runtime/src/lite/network.c @@ -201,7 +201,7 @@ int LITE_forward(const LiteNetwork network) { Layout in_layout = opr->inputs[0]->layout; Layout out_layout = opr->outputs[0]->layout; LOG_ERROR( - " instruction %s \n%f \t" + " instruction: %s \nuse %fms \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)] \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)]\n", opr->type, inst->time_ms / inst->time_count, @@ -335,7 +335,7 @@ int LITE_destroy_network(LiteNetwork network) { //! preprocessed weight for (int i = 0; i < model->nr_processed_weight; i++) { Tensor* weight = model->processed_weights + i; - if(!weight->is_shared) + if (!weight->is_shared) model->device.free(weight->ptr); } FREE(model->processed_weights); From 87235b4244e23fc33013c81ecdac15e701571e3e Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Thu, 29 Dec 2022 13:40:08 +0800 Subject: [PATCH 04/17] feat(misc): increase priority of gemv/gevm --- compiler/lib/KernelGen/KernelGen.cpp | 32 ++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index f7be9a13..20e68428 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -32,15 +32,29 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { //! arm64v7 is used by tinycv, nn opr should be armv64 or armv7, not arm64v7 auto deduce_func = GetDeduceLayout(kernel_type); if (arch == Arch::ARM64 || arch == Arch::ARM64V7) { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); - a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), - armcommon_kerns.end()); - a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); - return {a64_kerns, deduce_func}; + if (kernel_type == KernelPack::KernType::MatrixMulKernel) { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + armcommon_kerns.insert(armcommon_kerns.end(), a64_kerns.begin(), + a64_kerns.end()); + armcommon_kerns.insert(armcommon_kerns.end(), gi_kerns.begin(), + gi_kerns.end()); + return {armcommon_kerns, deduce_func}; + } else { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), + armcommon_kerns.end()); + a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); + return {a64_kerns, deduce_func}; + } + } else if (arch == Arch::ARMV7) { auto a32_kerns = Armv7::ArchKernelPack::GetKernel(kernel_type); From 63f677ce44706b13dea11ff527229e246511cabc Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Fri, 30 Dec 2022 17:58:52 +0800 Subject: [PATCH 05/17] feat(compiler): add qrelu 
mode of elemwisemultitype --- .../KernelGen/BareMetal/ElemwiseMultiType.cpp | 57 +++++++++++++++++-- .../kernel/opr/naive/elemwise_multitype.cpp | 16 ++++++ 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp index b5cec2c9..94785a79 100644 --- a/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp +++ b/compiler/lib/KernelGen/BareMetal/ElemwiseMultiType.cpp @@ -32,16 +32,26 @@ std::string gen_dep(std::string mode) { } )"; } +std::string gen_unary(std::string mode) { + if (mode == "QRELU") { + return "int8_t out_val = fp32_to_int8(((scale_0 * val_0) > 0?(scale_0 " + "* " + "val_0 ):0) * scale_div)"; + } else { + CC_ABORT << "not support mode " << mode.c_str() << "\n"; + } + return ""; +} std::string gen_binary(std::string mode) { if (mode == "QADD") { - return "float out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " + return "int8_t out_val = fp32_to_int8((scale_0 * val_0 + scale_1 * " "val_1) * scale_div);"; } else if (mode == "QFUSE_ADD_RELU") { return R"( float val0 = scale_0 * val_0; float val1 = scale_1 * val_1; - float out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); + int8_t out_val = fp32_to_int8( ((val0 + val1) > 0? (val0 + val1):0) * scale_div); )"; } else { CC_ABORT << "not support mode " << mode.c_str() << "\n"; @@ -54,9 +64,11 @@ std::string gen_binary(std::string mode) { bool ElemwiseMultiTypeKernel::IsAvailable(TContext* context) const { auto mode = context->getAttrStr("mode"); auto nr_operands = context->getAttrInt("nr_operands"); - bool nr_operands_ok = nr_operands == 3; - bool mode_ok_binary = mode == "QADD" || mode == "QFUSE_ADD_RELU"; - return nr_operands_ok && (mode_ok_binary); + bool nr_operands_ok = nr_operands == 2 || nr_operands == 3; + bool mode_ok_unary = nr_operands == 2 && mode == "QRELU"; + bool mode_ok_binary = + nr_operands == 3 && (mode == "QADD" || mode == "QFUSE_ADD_RELU"); + return nr_operands_ok && (mode_ok_unary || mode_ok_binary); } std::string ElemwiseMultiTypeKernel::GetKernelSymbol(TContext* context) const { @@ -74,8 +86,41 @@ std::string ElemwiseMultiTypeKernel::GetKernelBody(TContext* context) const { writer << gen_dep(mode); writer << GenCommonRet() << " "; writer << GetKernelSignature(context); + if (context->getAttrInt("nr_operands") == 2) { + auto op0 = context->getAttrOprand("operand:0"); + auto dst = context->getAttrOprand("operand:1"); + CC_ASSERT(Utils::is_quant_dtype(op0.dtype, 8) && + Utils::is_quant_dtype(dst.dtype, 8)); + auto op0_specifier = Utils::cvt_dtype_specifier(op0.dtype); + auto dst_specifier = Utils::cvt_dtype_specifier(dst.dtype); + std::string binary_str = R"({ + ${op0_specifier}* input_0 = (${op0_specifier}*)inputs[0]->ptr; + float scale_0 = inputs[0]->dtype.param.scale; + TINYNN_ASSERT(input_0); + ${dst_specifier}* output_data = (${dst_specifier}*)outputs[0]->ptr; + float scale_dst = outputs[0]->dtype.param.scale; + TINYNN_ASSERT(output_data); + float scale_div = 1.f / scale_dst; - if (context->getAttrInt("nr_operands") == 3) { + Layout in_layout = inputs[0]->layout; + size_t nr_elem = 1; + for (int i = 0; i < in_layout.nr_dim; ++i) { + nr_elem *= in_layout.dims[i]; + } + for(size_t i = 0; i < nr_elem; ++i){ + ${op0_specifier} val_0 = input_0[i]; + ${act}; + output_data[i] = out_val; + } + return TinyNN_SUCCESS; + } + )"; + writer << StringTemplate::StringTemplateArgs() + .add("op0_specifier", op0_specifier) + .add("dst_specifier", dst_specifier) + .add("act", 
gen_unary(mode)) + .render(binary_str); + } else if (context->getAttrInt("nr_operands") == 3) { auto op0 = context->getAttrOprand("operand:0"); auto op1 = context->getAttrOprand("operand:1"); auto dst = context->getAttrOprand("operand:2"); diff --git a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp index 4e3c1fa0..38ec4cce 100644 --- a/compiler/test/kernel/opr/naive/elemwise_multitype.cpp +++ b/compiler/test/kernel/opr/naive/elemwise_multitype.cpp @@ -12,6 +12,22 @@ using namespace megdnn; using namespace megcc::test; using MODE = ElemwiseMultiType::Param::Mode; +TEST(NAIVE, ElementwiseMultitypeUnary) { + Checker checker; + checker.set_kernel_symbol("kernel_.*"); + checker.set_epsilon(1e-4); + checker.set_dtype(0, dtype::QuantizedS8(1.f)); + checker.set_dtype(1, dtype::QuantizedS8(2.f)); + ElemwiseMultiType::Param param; + for (auto mode : {MODE::QRELU}) { + param.mode = mode; + checker.set_param(param); + checker.execs({{1}, {}}); + checker.execs({{1, 10}, {}}); + checker.execs({{1, 10, 12, 13}, {}}); + } +} + TEST(NAIVE, ElementwiseMultitypeBinary) { Checker checker; checker.set_kernel_symbol("kernel_.*"); From bfdeb57f26b73bdfac153e53788509227b4e3b7f Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Tue, 20 Dec 2022 15:14:03 +0800 Subject: [PATCH 06/17] feat(compiler/tools): add kernel export tools --- compiler/include/compiler/Common/Logger.h | 6 +- compiler/lib/Common/Logger.cpp | 18 + .../ConvKernel/Int8/Int8DotConvNchwNchw44.cpp | 1 + .../ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp | 4 +- .../ConvKernel/F32ConvNCHWNCHW443x3s2.cpp | 5 +- .../ConvKernel/Fp32ConvNchwNchw44.cpp | 42 +- compiler/tools/CMakeLists.txt | 1 + compiler/tools/kernel_exporter/CMakeLists.txt | 7 + .../tools/kernel_exporter/config_attr.cpp | 1076 +++++++++++++++++ compiler/tools/kernel_exporter/config_attr.h | 33 + .../tools/kernel_exporter/exporter_imp.cpp | 134 ++ compiler/tools/kernel_exporter/exporter_imp.h | 73 ++ .../tools/kernel_exporter/tinynn-exporter.cpp | 53 + compiler/tools/kernel_exporter/utils.cpp | 60 + compiler/tools/kernel_exporter/utils.h | 14 + script/build_and_test_not_standard_os.sh | 2 +- script/release_megcc.sh | 7 + 17 files changed, 1517 insertions(+), 19 deletions(-) create mode 100644 compiler/tools/kernel_exporter/CMakeLists.txt create mode 100644 compiler/tools/kernel_exporter/config_attr.cpp create mode 100644 compiler/tools/kernel_exporter/config_attr.h create mode 100644 compiler/tools/kernel_exporter/exporter_imp.cpp create mode 100644 compiler/tools/kernel_exporter/exporter_imp.h create mode 100644 compiler/tools/kernel_exporter/tinynn-exporter.cpp create mode 100644 compiler/tools/kernel_exporter/utils.cpp create mode 100644 compiler/tools/kernel_exporter/utils.h diff --git a/compiler/include/compiler/Common/Logger.h b/compiler/include/compiler/Common/Logger.h index a510cd01..868b9347 100644 --- a/compiler/include/compiler/Common/Logger.h +++ b/compiler/include/compiler/Common/Logger.h @@ -20,6 +20,8 @@ enum class LogLevel : uint32_t { void SetLogLevel(LogLevel); +void setAssertThrow(bool); + LogLevel GetLogLevel(); class Logger { @@ -59,8 +61,8 @@ class Logger { class LogFatal : public Logger { public: - LogFatal() : Logger(LogLevel::ERROR) {} - ~LogFatal() { abort(); } + LogFatal(); + ~LogFatal(); }; #define LOG_DEBUG megcc::Logger::debug() diff --git a/compiler/lib/Common/Logger.cpp b/compiler/lib/Common/Logger.cpp index daf78066..f8dc1968 100644 --- a/compiler/lib/Common/Logger.cpp +++ b/compiler/lib/Common/Logger.cpp 
@@ -7,15 +7,22 @@ */ #include "compiler/Common/Logger.h" +#include using namespace megcc; static LogLevel GlobalLogLevel = LogLevel::WARN; +static bool g_is_assert_throw = false; + void megcc::SetLogLevel(LogLevel level) { GlobalLogLevel = level; } +void megcc::setAssertThrow(bool is_throw) { + g_is_assert_throw = is_throw; +} + LogLevel megcc::GetLogLevel() { return GlobalLogLevel; } @@ -36,4 +43,15 @@ Logger Logger::error() { return Logger(LogLevel::ERROR); } +LogFatal::LogFatal() : Logger(LogLevel::ERROR) { +#if __EXCEPTIONS + if (g_is_assert_throw) { + throw std::exception(); + } +#endif +} +LogFatal::~LogFatal() { + abort(); +} + // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp index 5f2d5611..0d255fc1 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConvNchwNchw44.cpp @@ -44,6 +44,7 @@ bool ConvDotNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvDotNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT((src_tensor.shape.size()) > 0) << "src_tensor.shape.size > 0"; uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp index 1fadf62f..71fc0fad 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/ConvKernel/Fp32/Fp32ConvNchwNchw44.cpp @@ -49,6 +49,8 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -241,7 +243,7 @@ std::string render_kernel(TContext* ctx) { std::string mode = ctx->haveAttr("nonlineMode") ? 
ctx->getAttrStr("nonlineMode") : "IDENTITY"; - + auto activate_gen = create_activation_gener_instrinsic(mode); auto src_tensor = ctx->getAttrOprand("operand:0"); diff --git a/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp b/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp index 1c508cf8..a38958b5 100644 --- a/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp +++ b/compiler/lib/KernelGen/Arm/Armv7/ConvKernel/F32ConvNCHWNCHW443x3s2.cpp @@ -50,6 +50,8 @@ bool ConvFloatNCHWNCHW443x3s2::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW443x3s2::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -117,7 +119,8 @@ std::string ConvFloatNCHWNCHW443x3s2::GetInitBody(TContext* ctx) const { return writer.str(); } -std::string ConvFloatNCHWNCHW443x3s2::GetWorkspaceBody(TContext* context) const { +std::string ConvFloatNCHWNCHW443x3s2::GetWorkspaceBody( + TContext* context) const { std::stringstream ss; ss << R"( static inline int round_up(int x, int d){ diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp index 0a3a1e67..86fba8ad 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp @@ -47,6 +47,8 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); uint32_t ic = src_tensor.shape[1]; auto dst_tensor = ctx->getAttrOprand( "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); @@ -151,7 +153,8 @@ std::string render_init(int c_idx, int nr_ow, bool with_bias) { for (int src_idx = 0; src_idx < nr_ow; ++src_idx) { if (with_bias) { ss << "c[" << c_idx << "][" << src_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + " << c_idx << " * 4));"; + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(bias_ptr + " + << c_idx << " * 4));"; } else { ss << "c[" << c_idx << "][" << src_idx << "] = GiFloat32Type2FixLenType(GiBroadcastFloat32(0.f));"; @@ -172,20 +175,23 @@ std::string render_core(int src_reg_size, int filter_size, bool is_big_oc, } else { for (int src_idx = 0; src_idx < src_reg_size; ++src_idx) { fw_ss << "src[" << src_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + ${fh_idx} * packed_iw + " + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(src_ptr + " + "${fh_idx} * packed_iw + " << src_idx << "* ${simd_len}));\n"; } } for (int fw_idx = 0; fw_idx < filter_size; ++fw_idx) { fw_ss << "weight[0][" << fw_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + ${fh_idx} * ${ld_weight_fw} + " + << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + " + "${fh_idx} * ${ld_weight_fw} + " << fw_idx << " * ${simd_len}));\n"; } if (is_big_oc) { for (int fw_idx = 0; fw_idx < filter_size; ++fw_idx) { fw_ss << "weight[1][" << fw_idx - << "] = GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + ${ld_weight_oc} + " + << "] = 
GiFloat32Type2FixLenType(GiLoadFloat32(filter_ptr + " + "${ld_weight_oc} + " "${fh_idx} * " "${ld_weight_fw} + " << fw_idx << " * ${simd_len}));\n"; @@ -195,14 +201,22 @@ std::string render_core(int src_reg_size, int filter_size, bool is_big_oc, auto src_idx = fw_idx; auto weight_idx = fw_idx; for (int i = 0; i < nr_ow; ++i) { - fw_ss << "c[0][" << i << "] = GiFloat32Type2FixLenType(GiSimdFmaLane(GiFixLenType2GiFloat32Type(c[0][" << i - << "]), GiFixLenType2GiFloat32Type(weight[0][" << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i + fw_ss << "c[0][" << i + << "] = " + "GiFloat32Type2FixLenType(GiSimdFmaLane(" + "GiFixLenType2GiFloat32Type(c[0][" + << i << "]), GiFixLenType2GiFloat32Type(weight[0][" + << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i << " * ${stride} + " << src_idx << ") / 4]), " << (i * stride + src_idx) % 4 << "));"; if (is_big_oc) { - fw_ss << "c[1][" << i << "] = GiFloat32Type2FixLenType(GiSimdFmaLane(GiFixLenType2GiFloat32Type(c[1][" << i - << "]), GiFixLenType2GiFloat32Type(weight[1][" << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" << i - << " * ${stride} + " << src_idx << ") / 4]), " + fw_ss << "c[1][" << i + << "] = " + "GiFloat32Type2FixLenType(GiSimdFmaLane(" + "GiFixLenType2GiFloat32Type(c[1][" + << i << "]), GiFixLenType2GiFloat32Type(weight[1][" + << weight_idx << "]), GiFixLenType2GiFloat32Type(src[(" + << i << " * ${stride} + " << src_idx << ") / 4]), " << (i * stride + src_idx) % 4 << "));"; } } @@ -223,11 +237,11 @@ std::string render_store(int nr_ow, int c_idx, const std::string& store_offset, const ActivationGenIntrinsicBase& act) { std::stringstream ss; for (int ow_idx = 0; ow_idx < nr_ow; ++ow_idx) { - ss << act.GenIntrinsicFloatStore("GiFixLenType2GiFloat32Type(c[" + std::to_string(c_idx) + "][" + - std::to_string(ow_idx) + "])", - "dst_ptr + " + store_offset + " + " + - std::to_string(ow_idx) + - " * simd_len"); + ss << act.GenIntrinsicFloatStore( + "GiFixLenType2GiFloat32Type(c[" + std::to_string(c_idx) + "][" + + std::to_string(ow_idx) + "])", + "dst_ptr + " + store_offset + " + " + std::to_string(ow_idx) + + " * simd_len"); } return ss.str(); } diff --git a/compiler/tools/CMakeLists.txt b/compiler/tools/CMakeLists.txt index a8642194..c8197ed3 100644 --- a/compiler/tools/CMakeLists.txt +++ b/compiler/tools/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(tinynn-exporter) add_subdirectory(hako-to-mgb) add_subdirectory(dump-kernel) add_subdirectory(megcc-translate) +add_subdirectory(kernel_exporter) \ No newline at end of file diff --git a/compiler/tools/kernel_exporter/CMakeLists.txt b/compiler/tools/kernel_exporter/CMakeLists.txt new file mode 100644 index 00000000..453f6d02 --- /dev/null +++ b/compiler/tools/kernel_exporter/CMakeLists.txt @@ -0,0 +1,7 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +add_llvm_executable(kernel_exporter exporter_imp.cpp config_attr.cpp utils.cpp tinynn-exporter.cpp) +llvm_update_compile_flags(kernel_exporter) +target_link_libraries(kernel_exporter PRIVATE ${dialect_libs} KernelGen Common) +target_compile_options(Common PUBLIC -fexceptions) +target_compile_options(kernel_exporter PUBLIC -fexceptions) +mlir_check_all_link_libraries(kernel_exporter) diff --git a/compiler/tools/kernel_exporter/config_attr.cpp b/compiler/tools/kernel_exporter/config_attr.cpp new file mode 100644 index 00000000..6e72067c --- /dev/null +++ b/compiler/tools/kernel_exporter/config_attr.cpp @@ -0,0 +1,1076 @@ +/** + * \file + * compiler/tools/kernel_exporter/config_attr.cpp + * + * This file is 
part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "config_attr.h" +#include +#include "compiler/Common/TContext.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "megbrain/common.h" +#include "megbrain/reflection.h" +#include "megdnn/basic_types.h" +#include "megdnn/dtype.h" +#include "megdnn/opr_param_defs.h" +#include "megdnn/oprs/cv.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/imgproc.h" +#include "megdnn/oprs/linalg.h" +#include "megdnn/oprs/nn.h" +#include "megdnn/oprs/nn_int.h" +#include "utils.h" + +#define megcore_check(expr) \ + do { \ + megcoreStatus_t _err = (expr); \ + if (_err != megcoreSuccess) { \ + fprintf(stderr, "mgb failed : line=%d %s:%d\n", (int)_err, \ + __FILE__, __LINE__); \ + abort(); \ + } \ + } while (0) + +namespace { +#define DEFINE_DNNPARAM2STR(cls) \ + std::string dnnparam_2_str(cls value) { \ + return mgb::reflection::nameOfEnumValue(value); \ + } + +DEFINE_DNNPARAM2STR(ConvParam::Format) +DEFINE_DNNPARAM2STR(ConvParam::Sparse) +DEFINE_DNNPARAM2STR(ConvParam::Mode) +DEFINE_DNNPARAM2STR(megdnn::ElemwiseForward::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::ElemwiseMultiType::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::PoolingForward::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::MatrixMulForward::Param::Format) +DEFINE_DNNPARAM2STR(megdnn::MatrixMulForward::Param::ComputeMode) +DEFINE_DNNPARAM2STR(megdnn::Reduce::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::Reduce::Param::DataType) +DEFINE_DNNPARAM2STR(megdnn::WarpPerspectiveForward::Param::BorderMode) +DEFINE_DNNPARAM2STR(megdnn::WarpPerspectiveForward::Param::InterpolationMode) +DEFINE_DNNPARAM2STR(megdnn::CvtColor::Param::Mode) +DEFINE_DNNPARAM2STR(megdnn::Argsort::Param::Order) +#undef DEFINE_DNNPARAM2STR + +int get_int() { + llvm::outs() << "please input a int number" + << "\n"; + std::string ret; + std::string num; + std::cin >> num; + int n = num.size(); + for (int i = 0; i < n; i++) { + if (num[i] >= '0' && num[i] <= '9') { + ret.push_back(num[i]); + } + } + llvm::outs() << "input: " << stoi(ret) << "\n"; + + return stoi(ret); +} + +float get_float() { + llvm::outs() << "please input a float number" + << "\n"; + std::string ret; + std::string num; + std::cin >> num; + int n = num.size(); + for (int i = 0; i < n; i++) { + if ((num[i] >= '0' && num[i] <= '9') || num[i] == '.') { + ret.push_back(num[i]); + } + } + llvm::outs() << "input: " << stof(ret) << "\n"; + + return stof(ret); +} + +std::string support_map_to_msg(const std::map& m) { + std::string msg = "\n"; + for (const auto& i : m) { + msg += std::to_string(i.first); + msg += " = "; + msg += i.second; + msg += ",\n"; + } + + return msg; +} + +std::pair> support_dtype() { + std::map enum2dtype{ + {0, "f32"}, {1, "si8"}, {2, "i32"}, {3, "i16"}, + {4, "ui8"}, {5, "qsi8"}, {6, "qsi32"}}; + + return {support_map_to_msg(enum2dtype), enum2dtype}; +} + +std::pair> support_format() { + std::map format2enum{ + {0, "NCHW"}, {7, "NCHW44"}, {8, "NCHW44_DOT"}}; + + return {support_map_to_msg(format2enum), format2enum}; +} + +} // namespace + +namespace megcc { +namespace exporter { +#define FILL_MAP(_map_name, _parm_name, _attr_name) \ + _map_name[#_attr_name] = CCAttr(_parm_name._attr_name) +#define FILL_MAP_EX(_map_name, _parm_name, _attr_name, _helper_fun) \ + _map_name[#_attr_name] = CCAttr(_helper_fun(_parm_name._attr_name)) +using KernType = KernelGen::KernelPack::KernType; +template +class ParamHelper { 
+public: + using Param = typename Opr::Param; + ParamHelper() { + megcore_check(megcoreCreateDeviceHandle(&m_device_handle, + megcorePlatformCPU)); + megcore_check(megcoreCreateComputingHandle(&m_compute_handle, + m_device_handle)); + m_dnn_handle = megdnn::Handle::make(m_compute_handle, 2); + } + + ~ParamHelper() { + megcore_check(megcoreDestroyComputingHandle(m_compute_handle)); + megcore_check(megcoreDestroyDeviceHandle(m_device_handle)); + } + + Param create_param() { + auto opr = m_dnn_handle->create_operator(); + return opr->param(); + } + +protected: + megcoreDeviceHandle_t m_device_handle; + megcoreComputingHandle_t m_compute_handle; + std::unique_ptr m_dnn_handle; +}; + +std::vector config_attr(KPT k_type, std::string k_name, + bool use_default_attr) { +#define DEC_DTYPE() \ + auto dtypes = support_dtype(); \ + llvm::outs() << "please config \"src type\" " \ + << "support one of: " << dtypes.first << "\n"; \ + auto dtype_enum = get_int(); \ + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { \ + llvm::outs() << "invalid input" \ + << "\n"; \ + abort(); \ + } \ + std::string dtype_input = dtypes.second[dtype_enum] + +#define DEC_FORMAT() \ + auto formats = support_format(); \ + llvm::outs() << "please config \"format\" " \ + << "support one of: " << formats.first << "\n"; \ + auto format_input = get_int(); \ + if (dtypes.second.find(format_input) == formats.second.end()) { \ + llvm::outs() << "invalid input" \ + << "\n"; \ + abort(); \ + } + + std::vector ret; + std::unordered_map attr_map; + if (!use_default_attr) { + llvm::outs() << "+++++++++++++++++++++++++++++++++++++\n"; + llvm::outs() << " please config attr for " << k_name << "\n"; + llvm::outs() << "+++++++++++++++++++++++++++++++++++++\n"; + } + switch (k_type) { + case KPT::TopK: { + megcc::CCOperand cc_operand; + attr_map["nr_operands"] = megcc::CCAttr(1); + if (use_default_attr) { + attr_map["k"] = megcc::CCAttr(10); + attr_map["mode"] = megcc::CCAttr("KTH_ONLY"); + cc_operand.dtype = "f32"; + } else { + llvm::outs() << "please config \"k\"" + << "\n"; + auto int_input = get_int(); + attr_map["k"] = megcc::CCAttr(int_input); + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "KTH_ONLY"}, + {1, "VALUE_IDX_NOSORT"}, + {2, "VALUE_IDX_SORTED"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + std::string input_str = m.second[mode_enum]; + attr_map["mode"] = megcc::CCAttr(input_str); + + DEC_DTYPE(); + cc_operand.dtype = dtype_input; + } + attr_map[llvm::formatv("operand:{0}", 0)] = cc_operand; + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ElemwiseKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + if (use_default_attr) { + param.mode = megdnn::Elemwise::Mode::RELU; + attr_map["mode"] = CCAttr(dnnparam_2_str(param.mode)); + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + res.dtype = "f32"; + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + EXPORT_ERR( + "ElemwiseKernel have so many case , it`s hard to user " + "dynamic config, not support now"); + } + } break; + case KPT::ElemwiseMultiKernel: { + auto&& m_helper = 
ParamHelper(); + auto param = m_helper.create_param(); + if (use_default_attr) { + param.mode = megdnn::ElemwiseMultiType::Mode::QADD; + attr_map["mode"] = CCAttr(dnnparam_2_str(param.mode)); + attr_map["nr_operands"] = megcc::CCAttr(3); + megcc::CCOperand res; + res.dtype = "qsi8"; + attr_map["operand:0"] = megcc::CCAttr(res); + res.dtype = "qsi8"; + attr_map["operand:1"] = megcc::CCAttr(res); + res.dtype = "qsi8"; + attr_map["operand:2"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + EXPORT_ERR( + "ElemwiseMultiType have so many case , it`s hard to " + "user " + "dynamic config, not support now"); + } + } break; + case KPT::PoolingKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! init default attr + res.dtype = "f32"; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = 1; + param.pad_w = 1; + param.window_h = 3; + param.window_w = 3; + param.format = ConvParam::Format::NCHW; + param.mode = megdnn::param::PoolingV0::Mode::MAX; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + llvm::outs() << "please config \"stride_h\"" + << "\n"; + auto int_input = get_int(); + param.stride_h = int_input; + llvm::outs() << "please config \"stride_w\"" + << "\n"; + int_input = get_int(); + param.stride_w = int_input; + llvm::outs() << "please config \"pad_h\"" + << "\n"; + int_input = get_int(); + param.pad_h = int_input; + llvm::outs() << "please config \"pad_w\"" + << "\n"; + int_input = get_int(); + param.pad_w = int_input; + llvm::outs() << "please config \"window_h\"" + << "\n"; + int_input = get_int(); + param.window_h = int_input; + llvm::outs() << "please config \"window_w\"" + << "\n"; + int_input = get_int(); + param.window_w = int_input; + + DEC_FORMAT(); + param.format = static_cast(format_input); + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "MAX"}, + {1, "AVERAGE"}, + {2, "AVERAGE_COUNT_EXCLUDE_PADDING"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + + auto m = support_mode(); + llvm::outs() << "please config \"mode\"" + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = + static_cast(mode_enum); + } + FILL_MAP(attr_map, param, stride_h); + FILL_MAP(attr_map, param, stride_w); + FILL_MAP(attr_map, param, pad_h); + FILL_MAP(attr_map, param, pad_w); + FILL_MAP(attr_map, param, window_h); + FILL_MAP(attr_map, param, window_w); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(3); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + attr_map["operand:2"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::MatrixMulKernel: + case KPT::BatchMatmulKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.transposeA = false; + param.transposeB = false; + param.format = megdnn::param::MatrixMul::Format::DEFAULT; + param.compute_mode = + megdnn::param::MatrixMul::ComputeMode::DEFAULT; + FILL_MAP(attr_map, param, transposeA); + FILL_MAP(attr_map, param, transposeB); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, compute_mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"transposeA\"" + << "\n"; + auto int_input = get_int(); + param.transposeA = int_input; + llvm::outs() << "please config \"transposeB\"" + << "\n"; + int_input = get_int(); + param.transposeB = int_input; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{{0, "DEFAULT"}, + {1, "MK4"}, + {2, "MK8"}, + {3, "MK4_DOT"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"format\"" + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.format = static_cast( + mode_enum); + param.compute_mode = + static_cast(0); + } + FILL_MAP(attr_map, param, transposeA); + FILL_MAP(attr_map, param, transposeB); + + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, compute_mode, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::MatrixInvKernel: + case KPT::RelayoutKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ReduceKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.axis = 1; + param.mode = megdnn::param::Reduce::Mode::SUM; + param.data_type = megdnn::param::Reduce::DataType::DEFAULT; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\"" + << "\n"; + auto int_input = get_int(); + param.axis = int_input; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "SUM"}, {1, "SUM_SQR"}, {2, "PRODUCT"}, + {3, "MIN"}, {4, "MAX"}, {5, "MEAN"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = + static_cast(mode_enum); + param.data_type = + static_cast(0); + } + + FILL_MAP(attr_map, param, axis); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, data_type, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::IndexingMultiAxisKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::IndexingOneHotKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! init default attr + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\"" + << "\n"; + auto int_input = get_int(); + param.axis = int_input; + } + + FILL_MAP(attr_map, param, axis); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::WarpPerspectiveKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.border_val = 0.1; + param.bmode = megdnn::param::WarpPerspective::BorderMode:: + BORDER_CONSTANT; + param.imode = + megdnn::param::WarpPerspective::InterpolationMode::AREA; + param.format = megdnn::param::WarpPerspective::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + param.border_val = 0.1; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "REPLICATE/BORDER_REPLICATE"}, + {1, "REFLECT/BORDER_REFLECT"}, + {2, "REFLECT_101/BORDER_REFLECT_101"}, + {3, "WRAP/BORDER_WRAP"}, + {4, "CONSTANT/BORDER_CONSTANT"}, + {5, "TRANSPARENT/BORDER_TRANSPARENT"}, + {6, "ISOLATED/BORDER_ISOLATED"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"bmode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.bmode = + megdnn::param::WarpPerspective::BorderMode(mode_enum); + auto support_imode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto im = support_imode(); + llvm::outs() << "please config \"imode\" " + << "support one of: " << im.first << "\n"; + auto imode_enum = get_int(); + if (im.second.find(imode_enum) == im.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = megdnn::param::WarpPerspective::InterpolationMode( + imode_enum); + DEC_FORMAT(); + param.format = + megdnn::param::WarpPerspective::Format(format_input); + } + + FILL_MAP(attr_map, param, border_val); + FILL_MAP_EX(attr_map, param, bmode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::WarpAffineKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + res.dtype = "f32"; + param.border_val = 0.1; + param.border_mode = + megdnn::param::WarpAffine::BorderMode::BORDER_CONSTANT; + param.imode = + megdnn::param::WarpAffine::InterpolationMode::AREA; + param.format = megdnn::param::WarpAffine::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + param.border_val = 0.1; + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "REPLICATE/BORDER_REPLICATE"}, + {1, "REFLECT/BORDER_REFLECT"}, + {2, "REFLECT_101/BORDER_REFLECT_101"}, + {3, "WRAP/BORDER_WRAP"}, + {4, "CONSTANT/BORDER_CONSTANT"}, + {5, "TRANSPARENT/BORDER_TRANSPARENT"}, + {6, "ISOLATED/BORDER_ISOLATED"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"bmode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.border_mode = + megdnn::param::WarpAffine::BorderMode(mode_enum); + auto support_imode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto im = support_imode(); + llvm::outs() << "please config \"imode\" " + << "support one of: " << im.first << "\n"; + auto imode_enum = get_int(); + if (im.second.find(imode_enum) == im.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = megdnn::param::WarpAffine::InterpolationMode( + imode_enum); + DEC_FORMAT(); + param.format = megdnn::param::WarpAffine::Format(format_input); + } + + FILL_MAP(attr_map, param, border_val); + FILL_MAP_EX(attr_map, param, border_mode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::TypeCvtKernel: { + if (use_default_attr) { + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + res.dtype = "ui8"; + attr_map["operand:0"] = megcc::CCAttr(res); + res.dtype = "f32"; + attr_map["operand:1"] = megcc::CCAttr(res); + } else { + attr_map["nr_operands"] = megcc::CCAttr(2); + megcc::CCOperand res; + auto dtypes = support_dtype(); + llvm::outs() << "please config \"src type\" " + << "support one of: " << dtypes.first << "\n"; + auto dtype_enum = get_int(); + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + std::string str_input = dtypes.second[dtype_enum]; + res.dtype = str_input; + attr_map["operand:0"] = megcc::CCAttr(res); + llvm::outs() << "please config \"dst type\" " + << "support one of: " << dtypes.first << "\n"; + dtype_enum = get_int(); + if (dtypes.second.find(dtype_enum) == dtypes.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + str_input = dtypes.second[dtype_enum]; + res.dtype = str_input; + attr_map["operand:1"] = megcc::CCAttr(res); + } + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::PowCKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + //! 
init default attr + param.exp = 2; + res.dtype = "f32"; + + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"exp float value\" " + << "\n"; + float f_input = get_float(); + param.exp = f_input; + } + FILL_MAP(attr_map, param, exp); + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::CVTransposeKernel: + case KPT::FlipKernel: { + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ResizeKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.imode = megdnn::param::Resize::InterpolationMode::NEAREST; + param.format = megdnn::param::Resize::Format::NCHW; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "NEAREST/INTER_NEAREST"}, + {1, "LINEAR/INTER_LINEAR"}, + {2, "AREA/INTER_AREA"}, + {3, "CUBIC/INTER_CUBIC"}, + {4, "LANCZOS4/INTER_LANCZOS4"}, + }; + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.imode = + megdnn::param::Resize::InterpolationMode(mode_enum); + DEC_FORMAT(); + param.format = megdnn::param::Resize::Format(format_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP_EX(attr_map, param, imode, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::RotateKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.clockwise = true; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"clockwise\" " + << "0 means false, other wise means true: " + << "\n"; + int int_input = get_int(); + param.clockwise = int_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, clockwise); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::RoiCopyKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.row_from = 1; + param.row_to = 1; + param.col_from = 1; + param.col_to = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"row_from\" " + << "\n"; + int int_input = get_int(); + param.row_from = int_input; + llvm::outs() << "please config \"row_to\" " + << "\n"; + int_input = get_int(); + param.row_to = int_input; + llvm::outs() << "please config \"col_from\" " + << "\n"; + int_input = get_int(); + param.col_from = int_input; + llvm::outs() << "please config \"col_to\" " + << "\n"; + int_input = get_int(); + param.col_to = int_input; + } + attr_map["nr_operands"] = megcc::CCAttr(1); + 
attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, row_from); + FILL_MAP(attr_map, param, row_to); + FILL_MAP(attr_map, param, col_from); + FILL_MAP(attr_map, param, col_to); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::CvtColorKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.mode = megdnn::param::CvtColor::Mode::RGB2YUV; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "RGB2GRAY"}, {1, "RGB2YUV"}, {2, "YUV2RGB"}, + {3, "YUV2RGB"}, {4, "RGBA2RGB"}, {5, "RGBA2BGR"}, + {6, "RGBA2GRAY"}, {7, "RGB2BGR"}, {8, "BGR2GRAY"}, + {9, "BGR2RGB"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"mode\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.mode = megdnn::param::CvtColor::Mode(mode_enum); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ArgSortKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.order = megdnn::param::Argsort::Order::ASCENDING; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{{0, "ASCENDING"}, + {1, "DESCENDING"}}; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"order\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.order = megdnn::param::Argsort::Order(mode_enum); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["order"] = CCAttr(dnnparam_2_str(param.order)); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ArgmaxKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\" " + << "\n"; + int int_input = get_int(); + param.axis = int32_t(int_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, axis); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + case KPT::ConcatKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + if (use_default_attr) { + res.dtype = "f32"; + param.axis = 1; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"axis\" " + << "\n"; + int int_input = get_int(); + param.axis = int32_t(int_input); + } + attr_map["nr_operands"] = megcc::CCAttr(1); + attr_map["operand:0"] = megcc::CCAttr(res); + FILL_MAP(attr_map, param, axis); + megcc::CodeGenContext ctx(attr_map); + 
ret.push_back(ctx); + } break; + case KPT::ConvKernel: + case KPT::ConvBackDataKernel: { + auto&& m_helper = ParamHelper(); + auto param = m_helper.create_param(); + megcc::CCOperand res; + uint32_t kernel_h = 0, kernel_w = 0; + if (use_default_attr) { + res.dtype = "f32"; + + kernel_h = 3; + kernel_w = 3; + param.sparse = ConvParam::Sparse::DENSE; + param.format = ConvParam::Format::NCHW; + param.stride_h = 1; + param.stride_w = 1; + param.pad_h = 1; + param.pad_w = 1; + param.dilate_h = 1; + param.dilate_w = 1; + param.mode = ConvParam::Mode::CONVOLUTION; + } else { + DEC_DTYPE(); + res.dtype = dtype_input; + + llvm::outs() << "please config \"kernel_h\" " + << "\n"; + int int_input = get_int(); + kernel_h = int_input; +#define CB(name) \ + llvm::outs() << "please config: " << #name << "\n"; \ + int_input = get_int(); \ + name = int_input + + CB(kernel_w); + CB(param.stride_h); + CB(param.stride_w); + CB(param.pad_h); + CB(param.pad_w); + CB(param.dilate_h); + CB(param.dilate_w); +#undef CB + auto support_mode = [&]() + -> std::pair> { + std::map enum2mode{ + {0, "DENSE"}, + {1, "GROUP"}, + }; + + return {support_map_to_msg(enum2mode), enum2mode}; + }; + auto m = support_mode(); + llvm::outs() << "please config \"sparse\" " + << "support one of: " << m.first << "\n"; + auto mode_enum = get_int(); + if (m.second.find(mode_enum) == m.second.end()) { + llvm::outs() << "invalid input" + << "\n"; + abort(); + } + param.sparse = ConvParam::Sparse(mode_enum); + + DEC_FORMAT(); + param.format = ConvParam::Format(format_input); + + param.mode = ConvParam::Mode(1); + } + attr_map["nr_operands"] = megcc::CCAttr(3); + attr_map["operand:0"] = megcc::CCAttr(res); + attr_map["operand:1"] = megcc::CCAttr(res); + attr_map["operand:2"] = megcc::CCAttr(res); + attr_map["kernel_h"] = CCAttr(kernel_h); + attr_map["kernel_w"] = CCAttr(kernel_w); + FILL_MAP(attr_map, param, stride_h); + FILL_MAP(attr_map, param, stride_w); + FILL_MAP(attr_map, param, pad_h); + FILL_MAP(attr_map, param, pad_w); + FILL_MAP(attr_map, param, dilate_h); + FILL_MAP(attr_map, param, dilate_w); + FILL_MAP_EX(attr_map, param, sparse, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, format, dnnparam_2_str); + FILL_MAP_EX(attr_map, param, mode, dnnparam_2_str); + megcc::CodeGenContext ctx(attr_map); + ret.push_back(ctx); + } break; + default: + EXPORT_ERR(ssprintf("config_attr not imp for: %s", k_name.c_str())); + break; + } + + return ret; +#undef DEC_DTYPE +} + +} // namespace exporter +} // namespace megcc diff --git a/compiler/tools/kernel_exporter/config_attr.h b/compiler/tools/kernel_exporter/config_attr.h new file mode 100644 index 00000000..19adf2d7 --- /dev/null +++ b/compiler/tools/kernel_exporter/config_attr.h @@ -0,0 +1,33 @@ +/** + * \file + * compiler/tools/kernel_exporter/config_attr.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#pragma once + +//#include + +#include "compiler/KernelGen/KernelGen.h" +#include "megbrain/common.h" +#include "megdnn/oprs/general.h" +#include "megdnn/oprs/nn.h" + +namespace { +using ConvParam = megdnn::ConvolutionForward::Param; +using ConvBiasParam = megdnn::ConvBiasForward::Param; +using KPT = megcc::KernelGen::KernelPack::KernType; +using KA = megcc::KernelGen::Arch; + +} // namespace + +namespace megcc { +namespace exporter { + +std::vector config_attr(KPT k_type, std::string k_name, + bool use_default_attr); + +} // namespace exporter +} // namespace megcc diff --git a/compiler/tools/kernel_exporter/exporter_imp.cpp b/compiler/tools/kernel_exporter/exporter_imp.cpp new file mode 100644 index 00000000..f74551a6 --- /dev/null +++ b/compiler/tools/kernel_exporter/exporter_imp.cpp @@ -0,0 +1,134 @@ +/** + * \file compiler/tools/kernel_exporter/exporter_imp.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "exporter_imp.h" + +KPT KernelExporter::kernel_name_to_type() { + KPT ret; + auto m_find = m_kern_name2type.find(m_kernel_name); + if (m_find == m_kern_name2type.end()) { + EXPORT_ERR( + ssprintf("do not support kernel name: %s, support lists:\n%s", + m_kernel_name.c_str(), support_kernels().c_str())); + } else { + ret = m_find->second; + } + + return ret; +} + +KA KernelExporter::get_arch_type() { + KA ret; + auto m_find = m_name2arch.find(m_kernel_arch); + if (m_find == m_name2arch.end()) { + EXPORT_ERR(ssprintf("do not support arch: %s, support archs:\n%s", + m_kernel_arch.c_str(), support_archs().c_str())); + } else { + ret = m_find->second; + } + + return ret; +} + +std::pair, + const megcc::KernelGen::DeduceFunc*> +KernelExporter::get_kernels() { + KPT k_type = kernel_name_to_type(); + KA arch_type = get_arch_type(); + return megcc::KernelGen::KernelPack::GetKernel(k_type, arch_type); +} + +void KernelExporter::gen_kenrels() { + auto kernels = get_kernels().first; + if (kernels.size() <= 0) { + EXPORT_ERR(ssprintf("ERR: can not find any KernelFunc for: %s", + m_kernel_name.c_str())); + } + + auto attrs = megcc::exporter::config_attr( + kernel_name_to_type(), m_kernel_name, m_use_default_attr); + std::string common_header = R"( +#include +#include +#include +)"; + for (auto& i : kernels) { + for (auto& ctx : attrs) { + auto gen = [&]() { + bool is_cv = !i->GetCVKernelSymbol(&ctx).empty(); + auto kernel_file_name = i->GetKernelSymbol(&ctx) + ".c"; + if (is_cv) { + kernel_file_name = i->GetCVKernelSymbol(&ctx) + ".c"; + } + std::stringstream ss; + auto file_path = kernel_file_name; + llvm::outs() << "\n"; + llvm::outs() << "\n"; + ss << common_header; + if (is_cv) { + ss << i->GetCVKernelBody(&ctx) << "\n"; + } else { + ss << i->GetKernelBody(&ctx) << "\n"; + for (auto& d : i->GetDependInternalSymbol(&ctx)) { + ss << d.kernel_body; + } + } + if (m_print_to_console) { + std::cout << ss.rdbuf() << "\n"; + }; + std::ofstream out_file(file_path); + out_file << ss.str(); + out_file.close(); + llvm::outs() << "====>get kernel to: " << file_path << "\n"; + }; + + try { + gen(); + } catch (...) 
{ + } + } + } + + llvm::outs() << "Export tinynnkernel done.\n"; +} + +std::map KernelExporter::m_kern_name2type{ + {"ConvKernel", KPT::ConvKernel}, + {"ElemwiseKernel", KPT::ElemwiseKernel}, + {"ElemwiseMultiKernel", KPT::ElemwiseMultiKernel}, + {"PoolingKernel", KPT::PoolingKernel}, + {"MatrixMulKernel", KPT::MatrixMulKernel}, + {"MatrixInvKernel", KPT::MatrixInvKernel}, + {"RelayoutKernel", KPT::RelayoutKernel}, + {"ReduceKernel", KPT::ReduceKernel}, + {"IndexingMultiAxisKernel", KPT::IndexingMultiAxisKernel}, + {"IndexingOneHotKernel", KPT::IndexingOneHotKernel}, + {"WarpPerspectiveKernel", KPT::WarpPerspectiveKernel}, + {"WarpAffineKernel", KPT::WarpAffineKernel}, + {"TypeCvtKernel", KPT::TypeCvtKernel}, + {"TopK", KPT::TopK}, + {"BatchMatmulKernel", KPT::BatchMatmulKernel}, + {"PowCKernel", KPT::PowCKernel}, + {"CVTransposeKernel", KPT::CVTransposeKernel}, + {"FlipKernel", KPT::FlipKernel}, + {"ResizeKernel", KPT::ResizeKernel}, + {"RotateKernel", KPT::RotateKernel}, + {"RoiCopyKernel", KPT::RoiCopyKernel}, + {"CvtColorKernel", KPT::CvtColorKernel}, + {"ArgSortKernel", KPT::ArgSortKernel}, + {"ArgmaxKernel", KPT::ArgmaxKernel}, + {"ConcatKernel", KPT::ConcatKernel}, + {"ConvBackDataKernel", KPT::ConvBackDataKernel} + +}; + +std::map KernelExporter::m_name2arch{ + {"BAREMETAL", KA::BAREMETAL}, + {"ARM64", KA::ARM64}, + {"ARMV7", KA::ARMV7}, +}; diff --git a/compiler/tools/kernel_exporter/exporter_imp.h b/compiler/tools/kernel_exporter/exporter_imp.h new file mode 100644 index 00000000..6ba734ab --- /dev/null +++ b/compiler/tools/kernel_exporter/exporter_imp.h @@ -0,0 +1,73 @@ +/** + * \file compiler/tools/kernel_exporter/exporter_imp.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include + +#include "config_attr.h" +#include "utils.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Parser.h" + +#include "compiler/Common/Logger.h" +#include "compiler/Common/Version.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace llvm; + +class KernelExporter { + static std::map m_kern_name2type; + static std::map m_name2arch; + + std::string m_kernel_name; + std::string m_kernel_arch; + bool m_use_default_attr; + bool m_print_to_console; + + KPT kernel_name_to_type(); + KA get_arch_type(); + + std::pair, + const megcc::KernelGen::DeduceFunc*> + get_kernels(); + +public: + KernelExporter(std::string kernel_name, std::string kernel_arch, + bool use_default_attr, bool print_to_console) + : m_kernel_name{kernel_name}, + m_kernel_arch{kernel_arch}, + m_use_default_attr(use_default_attr), + m_print_to_console(print_to_console) { + std::string attr = "use kernel default attr"; + if (!m_use_default_attr) { + attr = "use user config attr"; + } + llvm::outs() << "try export tinynn kernel of " << m_kernel_name << "(" + << m_kernel_arch << ")" + << "\n"; + llvm::outs() << "kernel attr: " << attr << "\n"; + llvm::outs() << "print to console: " << m_print_to_console << "\n"; + megcc::setAssertThrow(true); + }; + +#define MAPKEY2STR(m) \ + std::string ret; \ + for (auto i : m) { \ + ret += i.first; \ + ret += "\n"; \ + } \ + return ret; + + static std::string support_kernels() { MAPKEY2STR(m_kern_name2type); } + + static std::string support_archs() { MAPKEY2STR(m_name2arch); } + + void gen_kenrels(); +}; diff --git a/compiler/tools/kernel_exporter/tinynn-exporter.cpp b/compiler/tools/kernel_exporter/tinynn-exporter.cpp new file mode 100644 index 00000000..5246583c --- /dev/null +++ b/compiler/tools/kernel_exporter/tinynn-exporter.cpp @@ -0,0 +1,53 @@ +/** + * \file compiler/tools/kernel_exporter/tinynn-exporter.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include + +#include "config_attr.h" +#include "utils.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Parser.h" + +#include "exporter_imp.h" + +using namespace llvm; + +int main(int argc, char** argv) { + auto k_name_desc = "input kernel name, valid option:\n" + + KernelExporter::support_kernels(); + cl::opt KernelName("kernel", cl::Required, + cl::desc(k_name_desc)); + auto arch_desc = "the platform arch, valid options:\n" + + KernelExporter::support_archs(); + cl::opt kernelArch("arch", cl::Required, cl::desc(arch_desc)); + cl::opt use_default_attr( + "use_default_attr", + cl::desc("Use a default attribute to generate kernel, if not " + "config, user need dynamic config it")); + cl::opt print_to_console("print_to_console", + cl::desc("Print kernel body to console")); + cl::opt Verbose( + "verbose", + cl::desc("log more detail information when compiler model")); + + cl::AddExtraVersionPrinter( + [](raw_ostream& oss) { oss << megcc::getMegccVersionString(); }); + cl::ParseCommandLineOptions(argc, argv); + if (Verbose) { + megcc::SetLogLevel(megcc::LogLevel::DEBUG); + } + KernelExporter exporter(KernelName.getValue(), kernelArch.getValue(), + use_default_attr.getValue(), + print_to_console.getValue()); + exporter.gen_kenrels(); + + return 0; +} diff --git a/compiler/tools/kernel_exporter/utils.cpp b/compiler/tools/kernel_exporter/utils.cpp new file mode 100644 index 00000000..a798feb8 --- /dev/null +++ b/compiler/tools/kernel_exporter/utils.cpp @@ -0,0 +1,60 @@ +/** + * \file compiler/tools/kernel_exporter/utils.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "utils.h" +#include +#include + +inline constexpr const char* convert_fmt_str(const char* fmt) { + return fmt; +} + +std::string svsprintf(const char* fmt, va_list ap_orig) { + fmt = convert_fmt_str(fmt); + int size = 100; /* Guess we need no more than 100 bytes */ + char* p; + + if ((p = (char*)malloc(size)) == nullptr) + goto err; + + for (;;) { + va_list ap; + va_copy(ap, ap_orig); + int n = vsnprintf(p, size, fmt, ap); + va_end(ap); + + if (n < 0) + goto err; + + if (n < size) { + std::string rst(p); + free(p); + return rst; + } + + size = n + 1; + + char* np = (char*)realloc(p, size); + if (!np) { + free(p); + goto err; + } else + p = np; + } + +err: + fprintf(stderr, "could not allocate memory for svsprintf; fmt=%s\n", fmt); + __builtin_trap(); +} + +std::string ssprintf(const char* fmt, ...) { + va_list ap; + va_start(ap, fmt); + auto rst = svsprintf(fmt, ap); + va_end(ap); + return rst; +} diff --git a/compiler/tools/kernel_exporter/utils.h b/compiler/tools/kernel_exporter/utils.h new file mode 100644 index 00000000..48f3d8f8 --- /dev/null +++ b/compiler/tools/kernel_exporter/utils.h @@ -0,0 +1,14 @@ +/** + * \file compiler/tools/kernel_exporter/utils.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#include + +#define EXPORT_ERR(msg) \ + llvm::outs() << msg << "\n"; \ + __builtin_trap(); + +std::string ssprintf(const char* fmt, ...); diff --git a/script/build_and_test_not_standard_os.sh b/script/build_and_test_not_standard_os.sh index cdfdbb44..12c9e161 100755 --- a/script/build_and_test_not_standard_os.sh +++ b/script/build_and_test_not_standard_os.sh @@ -28,7 +28,7 @@ cmake --build "$MEGCC_BUILD_DIR" -j$(nproc) --target mgb-to-tinynn --target mgb- function check_key_words() { #elf self mangle words, we do not care!! - white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE " + white_list="@MEGW mgb1 5Mbg6 MGBi O:MgBnWk Yr]< 4emUi0B >HMgE kMEG RmEg MbGV4 MEgIy @MEg mGe#S BMgb MGB( mBg: MBgr8C A&mGB mEg; mGb>/ mEg= .strtab .shstrtab A=MgE= mgb=g MGe= g=MgE MGE<" elf_file=$1 if [ ! -f ${elf_file} ];then echo "ERR: can not find ${elf_file}" diff --git a/script/release_megcc.sh b/script/release_megcc.sh index 9b2c2f2e..eef6c540 100755 --- a/script/release_megcc.sh +++ b/script/release_megcc.sh @@ -32,10 +32,17 @@ pushd ${OUT_DIR}/build_host cmake ${COMPILER_PATH} -G Ninja ninja cp tools/mgb-to-tinynn/mgb-to-tinynn ${OUT_DIR}/bin/ + strip mgb-to-tinynn cp tools/mgb-runner/mgb-runner ${OUT_DIR}/bin/ + strip mgb-runner cp tools/mgb-importer/mgb-importer ${OUT_DIR}/bin/ + strip mgb-importer + cp tools/kernel_exporter/kernel_exporter ${OUT_DIR}/bin/ + strip kernel_exporter cp tools/hako-to-mgb/hako-to-mgb ${OUT_DIR}/bin/ + strip hako-to-mgb cp tools/megcc-opt/megcc-opt ${OUT_DIR}/bin/ + strip megcc-opt popd pushd ${PROJECT_PATH}/compiler GIT_ID=`git rev-parse --short HEAD` From 50559109b17344cc2069274137f703cbb2036eb9 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Tue, 27 Dec 2022 14:34:02 +0800 Subject: [PATCH 07/17] feat(compiler): optimize arm64 sigmoid --- .../KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp | 137 +++++++ .../KernelGen/Arm/Arm64/Elemwise/Elemwise.h | 33 ++ .../Arm64/ElemwiseHelper/ElemwiseHelper.cpp | 45 ++ .../Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h | 103 +++++ .../Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp | 383 ++++++++++++++++++ .../lib/KernelGen/Arm/Arm64/KernelPack.cpp | 5 +- compiler/test/kernel/opr/arm/Elementwise.cpp | 13 + .../kernel/opr/arm/benchmark_elemwise.cpp | 9 + 8 files changed, 727 insertions(+), 1 deletion(-) create mode 100644 compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp create mode 100644 compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h create mode 100644 compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp diff --git a/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp new file mode 100644 index 00000000..cf6235e5 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp @@ -0,0 +1,137 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "Elemwise.h" +#include "../ElemwiseHelper/ElemwiseHelper.h" +#include "Arm/ArmCommon/InternalKernel.h" +#include "Utils/SymbolHelper.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; + +bool ElemwiseKernel::IsAvailable(TContext* ctx) const { + //! TODO: now only support float type + int nr_operands = ctx->getAttrInt("nr_operands"); + bool type_ok = true; + for (int i = 0; i < nr_operands; i++) { + type_ok &= (ctx->getAttrOprand("operand:" + std::to_string(i)).dtype == + "f32"); + } + auto mode = ctx->getAttrStr("mode"); + bool mode_ok = mode == "SIGMOID"; + bool ok_input = nr_operands == 2; + bool usable = type_ok && mode_ok && ok_input; + return usable; +} + +std::string ElemwiseKernel::GetKernelSymbol(TContext* context) const { + std::stringstream ss; + ss << "Arm64_kernel_elementwise"; + ss << "_" << context->getAttrStr("mode"); + int nr_operands = context->getAttrInt("nr_operands"); + if (nr_operands == 2) { + ss << "_unary_vec_vec"; + } else { + //! Not implement ternary elemwise kernel + ss << "_invalid_nr_operands_"; + } + //! TODO: add ternary elemwise + ss << "_" << SymbolHelper::gen_io_str(context); + return ss.str(); +} + +std::string ElemwiseKernel::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + int nr_operands = ctx->getAttrInt("nr_operands"); + auto mode = ctx->getAttrStr("mode"); + std::vector operands; + for (int i = 0; i < nr_operands; i++) { + operands.push_back(ctx->getAttrOprand("operand:" + std::to_string(i))); + } + auto ElemwiseImpl = ElemwiseHelperFunc::CreateGenHelper(mode, operands); + auto InternalKernelFunc = ArmCommon::ExpNeonKernel(); + + CC_ASSERT(ElemwiseImpl) << "ElemwiseHelper Create error!\n"; + writer << R"( + #include + #include + #include + #include "tensor_util.h" + )"; + writer << "\n\n"; + writer << "extern " << InternalKernelFunc.GetKernelSignature(ctx) << ";\n"; + writer << R"( + static const struct { + float lower_range; + float upper_range; + float alpha_9; + float alpha_7; + float alpha_5; + float alpha_3; + float alpha_1; + float beta_10; + float beta_8; + float beta_6; + float beta_4; + float beta_2; + float beta_0; + float one_half; +} sigmoid_constants = { + -18.0f, + 18.0f, + 4.37031012579801e-11f, + 1.15627324459942e-07f, + 6.08574864600143e-05f, + 8.51377133304701e-03f, + 2.48287947061529e-01f, + 6.10247389755681e-13f, + 5.76102136993427e-09f, + 6.29106785017040e-06f, + 1.70198817374094e-03f, + 1.16817656904453e-01f, + 9.93151921023180e-01f, + 0.5f, +}; + )"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + //! 
input + output = 2, unary case + if (nr_operands == 2) { + writer << R"( + float* input_data0 = inputs[0]->ptr; + TINYNN_ASSERT(input_data0); + float* output_data = outputs[0]->ptr; + ${ElemwiseImpl(input_data0, output_data)}; + )"; + } else { + CC_ABORT << "not support ternary elemwise.\n"; + } + writer << "\nreturn TinyNN_SUCCESS;\n}"; + + std::stringstream ss; + auto ImpleGen = [=](std::vector strs) { + return ElemwiseImpl->GenCodeBody(strs); + }; + ss << StringTemplate::StringTemplateArgs() + .add("ElemwiseImpl", ImpleGen) + .render(writer.str()); + return ss.str(); +} + +std::vector ElemwiseKernel::GetDependInternalSymbol( + TContext* ctx) const { + std::vector depends; + ArmCommon::ExpNeonKernel kern; + depends.emplace_back(kern.GetKernelSymbol(ctx), kern.GetKernelBody(ctx), + kern.GetBodyGuardBegin(ctx), + kern.GetBodyGuardEnd(ctx)); + return depends; +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h new file mode 100644 index 00000000..787f5dc4 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h @@ -0,0 +1,33 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/Elemwise/Elemwise.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include "compiler/KernelGen/KernelGen.h" + +namespace megcc { +namespace KernelGen { +namespace Arm64 { + +class ElemwiseKernel : public KernelFunc { +public: + virtual ~ElemwiseKernel(){}; + bool IsAvailable(TContext* context) const override; + std::string GetKernelSymbol(TContext* context) const override; + std::string GetKernelBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; +}; + +} // namespace Arm64 +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp new file mode 100644 index 00000000..a212120e --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp @@ -0,0 +1,45 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "ElemwiseHelper.h" +#include "Utils/SymbolHelper.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; + +#define CASE_DISPATCH(_mode, _helper_name) \ + if (mode == _mode) { \ + return std::make_shared<_helper_name>(); \ + } + +#define CASE_DISPATCH_ARG(_mode, _helper_name, ...) 
\ + if (mode == _mode) { \ + return std::make_shared<_helper_name>(__VA_ARGS__); \ + } + +std::shared_ptr ElemwiseHelperFunc::CreateGenHelper( + std::string mode, std::vector operands) { + size_t nr_operands = operands.size(); + if (nr_operands == 2) { + CASE_DISPATCH("SIGMOID", ElemwiseGenUnarySigmoid); + } else { + CC_ABORT << mode << " not Implement now\n"; + } + return nullptr; +} + +#undef CASE_DISPATCH +#undef CASE_DISPATCH_ARG + +std::string ElemwiseHelperFunc::BcastType2String(BcastType bcast_type) { + return ArmCommon::ElemwiseHelperFunc::BcastType2String(bcast_type); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h new file mode 100644 index 00000000..ff81f71b --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h @@ -0,0 +1,103 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/ElemwiseHelper.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include +#include "Arm/ArmCommon/ArmSimdHelper.h" +#include "Arm/ArmCommon/ElemwiseHelper/ElemwiseHelper.h" +#include "Common/ElemwiseCommon.h" +#include "Utils/StringTemplate.h" +#include "Utils/SymbolHelper.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace Arm64 { + +class ElemwiseGenBase { +public: + //! gen the code out side the compute kernel, just address offset, for loop + virtual std::string GenCodeBody(std::vector) const = 0; + + //! Gen elemwise kernel asm computing init code, init for the necessary simd + //! variable, such as zero in Relu + virtual std::string GenKernelAsmInit(std::vector) const = 0; + + //! Gen the simd elemwise compute code, and the degree of unroll is specific + //! by first param + virtual std::string GenKernelSimdUnroll(std::vector) const = 0; + + //! Gen the naive C elemwise compute code, and the degree of unroll is + //! specific by first param + virtual std::string GenKernelNaiveUnroll( + std::vector) const = 0; + + virtual ~ElemwiseGenBase() {} +}; + +//! The Unary elemwise kernel base +class ElemwiseGenUnary : public ElemwiseGenBase { +public: + std::string m_src_dtype; + std::string m_dst_dtype; + bool m_inline_mode; + std::unique_ptr m_src_simd; + std::unique_ptr m_dst_simd; + bool m_i32_to_qs8; + std::unique_ptr m_common_sigmoid_gen; + ElemwiseGenUnary(std::string src_dtype = "f32", + std::string dst_dtype = "f32", bool inline_mode = false) + : m_src_dtype(src_dtype), + m_dst_dtype(dst_dtype), + m_inline_mode(inline_mode) { + m_src_simd = std::make_unique(src_dtype); + m_dst_simd = std::make_unique(dst_dtype); + m_common_sigmoid_gen = + std::make_unique( + src_dtype, dst_dtype, inline_mode); + m_i32_to_qs8 = Utils::is_int_dtype(m_src_dtype, 32) && + Utils::is_int_dtype(m_dst_dtype, 8); + }; + std::string GenCodeBody(std::vector) const override; + virtual std::string GenInlineName() const = 0; +}; + +//! 
create the elemwise helper implement according to the mode and operand +struct ElemwiseHelperFunc { + static std::shared_ptr CreateGenHelper( + std::string mode, std::vector operands); + static std::string BcastType2String(BcastType bcast_type); +}; + +/************************************Unary***********************************/ + +#define DEFINE_NNARY_OP(_name) \ + class _name : public ElemwiseGenUnary { \ + public: \ + _name(std::string src_dtype = "f32", std::string dst_dtype = "f32", \ + bool inline_mode = false) \ + : ElemwiseGenUnary(SymbolHelper::gen_valid_dtype(src_dtype), \ + SymbolHelper::gen_valid_dtype(dst_dtype), \ + inline_mode) {} \ + std::string GenKernelAsmInit(std::vector) const override; \ + std::string GenKernelSimdUnroll( \ + std::vector) const override; \ + std::string GenKernelNaiveUnroll( \ + std::vector) const override; \ + std::string GenInlineName() const override; \ + }; + +DEFINE_NNARY_OP(ElemwiseGenUnarySigmoid) +#undef DEFINE_NNARY_OP + +} // namespace Arm64 +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp new file mode 100644 index 00000000..b88f6307 --- /dev/null +++ b/compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp @@ -0,0 +1,383 @@ +/** + * \file + * compiler/lib/KernelGen/Arm/Arm64/ElemwiseHelper/UnaryHelper.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "ElemwiseHelper.h" +#include "Utils/SymbolHelper.h" +#include "Utils/Utils.h" +#include "compiler/Common/Logger.h" +using namespace megcc; +using namespace KernelGen; +using namespace Arm64; +std::string ElemwiseGenUnary::GenCodeBody(std::vector strs) const { + std::stringstream body_ss; + if (m_inline_mode) { + body_ss << R"(static inline void ${inline_func_name}(const ${src_specifier}* src, ${dst_specifier}* dst, size_t nr_elem)"; + body_ss << "){"; + } else { + body_ss << R"( + Layout in_layout = inputs[0]->layout; + size_t nr_elem = 1; + for (int i = 0; i < in_layout.nr_dim; ++i) { + nr_elem *= in_layout.dims[i]; + } + const ${src_specifier} * src = ${source}; + ${dst_specifier}* dst = ${dst}; + )"; + } + body_ss << R"( + + ${kernel_init()} + + size_t index = offset; + for(; index + 7 < nr_elem; index += 8) { + ${src_simd_specifier} vsrc0 = ${src_ld1q}(src); + ${src_simd_specifier} vsrc1 = ${src_ld1q}(src + 4); + ${kernel_simd_unroll(2, vsrc0, vdst0, vsrc1, vdst1)} + ${dst_store(dst, vdst0)}; + ${dst_store(dst + 4, vdst1)}; + src += 8; + dst += 8; + } + for(; index + 3 < nr_elem; index += 4) { + ${src_simd_specifier} vsrc0 = ${src_ld1q}(src); + ${kernel_simd_unroll(1, vsrc0, vdst0)} + ${dst_store(dst, vdst0)}; + src += 4; + dst += 4; + } + for(; index < nr_elem; index++) { + ${kernel_naive_unroll(1, src, dst)} + src += 1; + dst += 1; + })"; + if (m_inline_mode) { + body_ss << "}"; + } + auto kernel_init = [this](std::vector strs) { + return GenKernelAsmInit(strs); + }; + auto kernel_simd_unroll = [this](std::vector strs) { + return GenKernelSimdUnroll(strs); + }; + auto kernel_naive_unroll = [this](std::vector strs) { + return GenKernelNaiveUnroll(strs); + }; + std::stringstream ss; + auto body_render = StringTemplate::StringTemplateArgs() + .add("kernel_init", kernel_init) + .add("kernel_simd_unroll", kernel_simd_unroll) + .add("kernel_naive_unroll", kernel_naive_unroll) + 
.add("src_specifier", + Utils::cvt_dtype_specifier(m_src_dtype)) + .add("dst_specifier", + Utils::cvt_dtype_specifier(m_dst_dtype)) + .add("src_ld1q", m_src_simd->get_ld1q_symbol()) + .add("dst_store", + [=](std::string ptr, std::string dst_reg) { + return m_dst_simd->get_st1q_symbol() + + "(" + ptr + "," + dst_reg + + ")\n"; + }) + .add("dst_st1q", m_dst_simd->get_st1q_symbol()) + .add("src_simd_specifier", + m_src_simd->get_specifier_q_symbol()); + + if (m_inline_mode) { + body_render.add("inline_func_name", GenInlineName()); + } else { + auto input = strs[0]; + auto output = strs[1]; + body_render.add("source", input).add("dst", output); + } + ss << body_render.render(body_ss.str()); + + return ss.str(); +} + +//! Sigmoid +std::string ElemwiseGenUnarySigmoid::GenInlineName() const { + return "ElemwiseGenUnarySigmoid"; +} +std::string ElemwiseGenUnarySigmoid::GenKernelAsmInit( + std::vector) const { + std::stringstream writer; + writer << R"( + size_t x6_iter = nr_elem / (4 * 6); + size_t offset = x6_iter * 4 * 6; + float32x4_t lower_range; + float32x4_t upper_range; + float32x4_t alpha_9; + float32x4_t alpha_7; + float32x4_t alpha_5; + float32x4_t alpha_3; + float32x4_t alpha_1; + float32x4_t beta_10; + float32x4_t beta_8; + float32x4_t beta_6; + float32x4_t beta_4; + float32x4_t beta_2; + float32x4_t beta_0; + float32x4_t one_half; + + const float* const_ptr = &(sigmoid_constants.lower_range); + if (x6_iter > 0) { + /** + * q0 - q5 : squared + * q6 - q11 : p + * q12- q17 : val(temp), q + * q18- q31 : const + */ + asm volatile( + "ld1r {%[lower_range].4s}, [%[const_ptr]], #4\n" + "ld1r {%[upper_range].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_9].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_7].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_5].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_3].4s}, [%[const_ptr]], #4\n" + "ld1r {%[alpha_1].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_10].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_8].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_6].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_4].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_2].4s}, [%[const_ptr]], #4\n" + "ld1r {%[beta_0].4s}, [%[const_ptr]], #4\n" + "ld1r {%[one_half].4s}, [%[const_ptr]], #4\n" + + "1:\n" + "ldr q12, [%[a_ptr]] \n" + "ldr q13, [%[a_ptr], #16]\n" + "ldr q14, [%[a_ptr], #32]\n" + "ldr q15, [%[a_ptr], #48]\n" + "ldr q16, [%[a_ptr], #64]\n" + "ldr q17, [%[a_ptr], #80]\n" + // auto val = vmaxq_f32(vdupq_n_f32(sigmoid_constants.lower_range), + // src); + "fmax v12.4s, v12.4s, %[lower_range].4s\n" + "fmax v13.4s, v13.4s, %[lower_range].4s\n" + "fmax v14.4s, v14.4s, %[lower_range].4s\n" + "fmax v15.4s, v15.4s, %[lower_range].4s\n" + "fmax v16.4s, v16.4s, %[lower_range].4s\n" + "fmax v17.4s, v17.4s, %[lower_range].4s\n" + "add %[a_ptr], %[a_ptr], #96\n" + + // val = vminq_f32(vdupq_n_f32(sigmoid_constants.upper_range), val); + "fmin v12.4s, v12.4s, %[upper_range].4s\n" + "fmin v13.4s, v13.4s, %[upper_range].4s\n" + "fmin v14.4s, v14.4s, %[upper_range].4s\n" + "fmin v15.4s, v15.4s, %[upper_range].4s\n" + "fmin v16.4s, v16.4s, %[upper_range].4s\n" + "fmin v17.4s, v17.4s, %[upper_range].4s\n" + + //! 
auto squared = vmulq_f32(val, val); + "fmul v0.4s, v12.4s, v12.4s\n" + "fmul v1.4s, v13.4s, v13.4s\n" + "fmul v2.4s, v14.4s, v14.4s\n" + "fmul v3.4s, v15.4s, v15.4s\n" + "fmul v4.4s, v16.4s, v16.4s\n" + "fmul v5.4s, v17.4s, v17.4s\n" + // auto p = fma_ps_f32( + // vdupq_n_f32(sigmoid_constants.alpha_7), squared, + // vdupq_n_f32(sigmoid_constants.alpha_9)); + "fmul v6.4s, v0.4s, %[alpha_9].4s\n" + "fmul v7.4s, v1.4s, %[alpha_9].4s\n" + "fmul v8.4s, v2.4s, %[alpha_9].4s\n" + "fmul v9.4s, v3.4s, %[alpha_9].4s\n" + "fmul v10.4s, v4.4s, %[alpha_9].4s\n" + "fmul v11.4s, v5.4s, %[alpha_9].4s\n" + "fadd v6.4s, v6.4s, %[alpha_7].4s\n" + "fadd v7.4s, v7.4s, %[alpha_7].4s\n" + "fadd v8.4s, v8.4s, %[alpha_7].4s\n" + "fadd v9.4s, v9.4s, %[alpha_7].4s\n" + "fadd v10.4s, v10.4s, %[alpha_7].4s\n" + "fadd v11.4s, v11.4s, %[alpha_7].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_5), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_5].4s\n" + "fadd v7.4s, v7.4s, %[alpha_5].4s\n" + "fadd v8.4s, v8.4s, %[alpha_5].4s\n" + "fadd v9.4s, v9.4s, %[alpha_5].4s\n" + "fadd v10.4s, v10.4s, %[alpha_5].4s\n" + "fadd v11.4s, v11.4s, %[alpha_5].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_3), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_3].4s\n" + "fadd v7.4s, v7.4s, %[alpha_3].4s\n" + "fadd v8.4s, v8.4s, %[alpha_3].4s\n" + "fadd v9.4s, v9.4s, %[alpha_3].4s\n" + "fadd v10.4s, v10.4s, %[alpha_3].4s\n" + "fadd v11.4s, v11.4s, %[alpha_3].4s\n" + + // p = fma_ps_f32(vdupq_n_f32(sigmoid_constants.alpha_1), p, squared); + "fmul v6.4s, v6.4s, v0.4s\n" + "fmul v7.4s, v7.4s, v1.4s\n" + "fmul v8.4s, v8.4s, v2.4s\n" + "fmul v9.4s, v9.4s, v3.4s\n" + "fmul v10.4s, v10.4s, v4.4s\n" + "fmul v11.4s, v11.4s, v5.4s\n" + "fadd v6.4s, v6.4s, %[alpha_1].4s\n" + "fadd v7.4s, v7.4s, %[alpha_1].4s\n" + "fadd v8.4s, v8.4s, %[alpha_1].4s\n" + "fadd v9.4s, v9.4s, %[alpha_1].4s\n" + "fadd v10.4s, v10.4s, %[alpha_1].4s\n" + "fadd v11.4s, v11.4s, %[alpha_1].4s\n" + + // p = vmulq_f32(p, val); + "fmul v6.4s, v6.4s, v12.4s\n" + "fmul v7.4s, v7.4s, v13.4s\n" + "fmul v8.4s, v8.4s, v14.4s\n" + "fmul v9.4s, v9.4s, v15.4s\n" + "fmul v10.4s, v10.4s, v16.4s\n" + "fmul v11.4s, v11.4s, v17.4s\n" + + // auto q = fma_ps_f32( + // vdupq_n_f32(sigmoid_constants.beta_8), squared, + // vdupq_n_f32(sigmoid_constants.beta_10)); + "fmul v12.4s, v0.4s, %[beta_10].4s\n" + "fmul v13.4s, v1.4s, %[beta_10].4s\n" + "fmul v14.4s, v2.4s, %[beta_10].4s\n" + "fmul v15.4s, v3.4s, %[beta_10].4s\n" + "fmul v16.4s, v4.4s, %[beta_10].4s\n" + "fmul v17.4s, v5.4s, %[beta_10].4s\n" + "fadd v12.4s, v12.4s, %[beta_8].4s\n" + "fadd v13.4s, v13.4s, %[beta_8].4s\n" + "fadd v14.4s, v14.4s, %[beta_8].4s\n" + "fadd v15.4s, v15.4s, %[beta_8].4s\n" + "fadd v16.4s, v16.4s, %[beta_8].4s\n" + "fadd v17.4s, v17.4s, %[beta_8].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_6), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_6].4s\n" + "fadd v13.4s, v13.4s, %[beta_6].4s\n" + "fadd v14.4s, 
v14.4s, %[beta_6].4s\n" + "fadd v15.4s, v15.4s, %[beta_6].4s\n" + "fadd v16.4s, v16.4s, %[beta_6].4s\n" + "fadd v17.4s, v17.4s, %[beta_6].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_4), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_4].4s\n" + "fadd v13.4s, v13.4s, %[beta_4].4s\n" + "fadd v14.4s, v14.4s, %[beta_4].4s\n" + "fadd v15.4s, v15.4s, %[beta_4].4s\n" + "fadd v16.4s, v16.4s, %[beta_4].4s\n" + "fadd v17.4s, v17.4s, %[beta_4].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_2), q, + // squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_2].4s\n" + "fadd v13.4s, v13.4s, %[beta_2].4s\n" + "fadd v14.4s, v14.4s, %[beta_2].4s\n" + "fadd v15.4s, v15.4s, %[beta_2].4s\n" + "fadd v16.4s, v16.4s, %[beta_2].4s\n" + "fadd v17.4s, v17.4s, %[beta_2].4s\n" + + // q = fma_ps_f32(vdupq_n_f32(sigmoid_constants.beta_0), q, squared); + "fmul v12.4s, v12.4s, v0.4s\n" + "fmul v13.4s, v13.4s, v1.4s\n" + "fmul v14.4s, v14.4s, v2.4s\n" + "fmul v15.4s, v15.4s, v3.4s\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v5.4s\n" + "fadd v12.4s, v12.4s, %[beta_0].4s\n" + "fadd v13.4s, v13.4s, %[beta_0].4s\n" + "fadd v14.4s, v14.4s, %[beta_0].4s\n" + "fadd v15.4s, v15.4s, %[beta_0].4s\n" + "fadd v16.4s, v16.4s, %[beta_0].4s\n" + "fadd v17.4s, v17.4s, %[beta_0].4s\n" + + // vaddq_f32(div_ps_f32(p, q), + // vdupq_n_f32(sigmoid_constants.one_half)); + "fdiv v12.4s, v6.4s, v12.4s\n" + "fdiv v13.4s, v7.4s, v13.4s\n" + "fdiv v14.4s, v8.4s, v14.4s\n" + "fdiv v15.4s, v9.4s, v15.4s\n" + "fdiv v16.4s, v10.4s, v16.4s\n" + "fdiv v17.4s, v11.4s, v17.4s\n" + "subs %w[x6_iter], %w[x6_iter], #1\n" + "fadd v12.4s, v12.4s, %[one_half].4s\n" + "fadd v13.4s, v13.4s, %[one_half].4s\n" + "fadd v14.4s, v14.4s, %[one_half].4s\n" + "fadd v15.4s, v15.4s, %[one_half].4s\n" + "fadd v16.4s, v16.4s, %[one_half].4s\n" + "fadd v17.4s, v17.4s, %[one_half].4s\n" + + // save it + "str q12, [%[d_ptr]] \n" + "str q13, [%[d_ptr], #16]\n" + "str q14, [%[d_ptr], #32]\n" + "str q15, [%[d_ptr], #48]\n" + "str q16, [%[d_ptr], #64]\n" + "str q17, [%[d_ptr], #80]\n" + "add %[d_ptr], %[d_ptr], #96\n" + + "bne 1b\n" + + "2:\n" + : [a_ptr] "+r"(src), [d_ptr] "+r"(dst), [const_ptr] "+r"(const_ptr), + [x6_iter] "+r"(x6_iter), [lower_range] "=w"(lower_range), + [alpha_9] "=w"(alpha_9), [upper_range] "=w"(upper_range), + [alpha_7] "=w"(alpha_7), [alpha_5] "=w"(alpha_5), + [alpha_3] "=w"(alpha_3), [alpha_1] "=w"(alpha_1), + [beta_10] "=w"(beta_10), [beta_8] "=w"(beta_8), + [beta_6] "=w"(beta_6), [beta_4] "=w"(beta_4), + [beta_2] "=w"(beta_2), [beta_0] "=w"(beta_0), + [one_half] "=w"(one_half) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "x1", "x2", "x8", + "x9", "cc", "memory"); + } + + )"; + writer << "\nfloat32x4_t ones = vdupq_n_f32(1.f);"; + return writer.str(); +} + +std::string ElemwiseGenUnarySigmoid::GenKernelSimdUnroll( + std::vector strs) const { + return m_common_sigmoid_gen->GenKernelSimdUnroll(strs); +} + +std::string ElemwiseGenUnarySigmoid::GenKernelNaiveUnroll( + std::vector strs) const { + return m_common_sigmoid_gen->GenKernelNaiveUnroll(strs); +} + 
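+// The inline assembly emitted above evaluates the same rational polynomial
+// approximation encoded by the sigmoid_constants table in Elemwise.cpp.
+// In scalar form (reference sketch only):
+//   x   = clamp(src, lower_range, upper_range)                 // [-18, 18]
+//   s   = x * x
+//   p   = ((((alpha_9*s + alpha_7)*s + alpha_5)*s + alpha_3)*s + alpha_1) * x
+//   q   = ((((beta_10*s + beta_8)*s + beta_6)*s + beta_4)*s + beta_2)*s + beta_0
+//   dst = p / q + one_half
+// Each assembly iteration processes 24 floats (6 NEON q registers); the tail
+// elements fall back to the ArmCommon sigmoid helpers via
+// GenKernelSimdUnroll / GenKernelNaiveUnroll.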
+// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp b/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp index abaa184a..0c1906f6 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/KernelPack.cpp @@ -10,10 +10,10 @@ #include #include "ConvKernel.h" +#include "Elemwise/Elemwise.h" #include "InternalKernel/InternalKernel.h" #include "KernelPack.h" #include "MatMulKernel/Fp32MatMul.h" - using namespace megcc; using namespace KernelGen; using namespace Arm64; @@ -40,6 +40,9 @@ struct AllA64Kernel { std::make_shared(), std::make_shared(), std::make_shared()}; + + inner_map[KernelPack::KernType::ElemwiseKernel] = { + std::make_shared()}; } std::unordered_map>> diff --git a/compiler/test/kernel/opr/arm/Elementwise.cpp b/compiler/test/kernel/opr/arm/Elementwise.cpp index 55a39b47..8e5e86d1 100644 --- a/compiler/test/kernel/opr/arm/Elementwise.cpp +++ b/compiler/test/kernel/opr/arm/Elementwise.cpp @@ -28,6 +28,19 @@ TEST(AARCH64, ElementwiseUnique) { } } +TEST(AARCH64, ElementwiseUnique_asm) { + Checker checker(Arch::ARM64); + checker.set_kernel_symbol("Arm64_kernel_elementwise.+"); + ElemwiseForward::Param param; + for (auto mode : {MODE::SIGMOID}) { + param.mode = mode; + checker.set_param(param); + checker.execs({{1, 10}, {}}); + checker.execs({{1, 10, 12, 13}, {}}); + checker.execs({{10, 8, 2, 1}, {}}); + } +} + TEST(AARCH64, ElementwiseBinary) { Checker checker(Arch::ARM64); ElemwiseForward::Param param; diff --git a/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp b/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp index 475cea93..1bb8baf6 100644 --- a/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp +++ b/compiler/test/kernel/opr/arm/benchmark_elemwise.cpp @@ -23,4 +23,13 @@ TEST(AARCH64, BenchmarkElemwise) { benchmarker.execs({{1, 3, 160, 160}, {}}).print(); benchmarker.execs({{1, 3, 160, 160}, {}}).print(); } +TEST(AARCH64, BenchmarkElemwise_asm) { + Benchmarker benchmarker(Arch::ARM64); + benchmarker.set_kernel_symbol("Arm64.*"); + ElemwiseForward::Param param; + param.mode = MODE::SIGMOID; + benchmarker.set_param(param); + benchmarker.execs({{1, 3, 160, 160}, {}}).print(); + benchmarker.execs({{1, 3, 160, 160}, {}}).print(); +} #endif From 85f8ee1c916671b644695996e3ad66e4fde8ae32 Mon Sep 17 00:00:00 2001 From: limingxin Date: Thu, 29 Dec 2022 13:43:18 +0800 Subject: [PATCH 08/17] feat(compiler): support to set ENV and loader path using cmdline arg when compile --- .../compiler/Target/MGB/dummy_loader.h | 84 ++++++-- compiler/lib/Target/MGB/importer.cpp | 57 +++++- compiler/lib/Target/TinyNN/exporter.cpp | 22 ++- runtime/example/standard_OS/lite_main.c | 4 +- runtime/src/vm/extern_opr.c | 183 ++++++++++++++---- runtime/version.ld | 2 +- script/ppl_build.sh | 2 +- script/ppl_gen.sh | 18 +- 8 files changed, 294 insertions(+), 78 deletions(-) diff --git a/compiler/include/compiler/Target/MGB/dummy_loader.h b/compiler/include/compiler/Target/MGB/dummy_loader.h index bc809af2..49cce414 100644 --- a/compiler/include/compiler/Target/MGB/dummy_loader.h +++ b/compiler/include/compiler/Target/MGB/dummy_loader.h @@ -8,22 +8,28 @@ #include #include -#include #include #include +#include #include #include "megbrain/serialization/extern_c_opr.h" namespace { -std::map>, std::vector>> - name2outputinfo; -class MGBOprDescImpl { - static std::string loader_name; +struct LoaderInfo { + std::unordered_map>, + std::vector>> + m_name_2_outputinfo; + std::unordered_map m_envs; + std::pair m_loader_path_with_interface; +}; 
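+// Dump-time information gathered for extern oprs:
+//  - m_name_2_outputinfo: per-loader output shapes and dtypes (as ints), used
+//    by the dummy MGBOprDesc to answer infer_shape/infer_dtype;
+//  - m_envs: environment variables the runtime should set before invoking the
+//    real loader;
+//  - m_loader_path_with_interface: loader library path plus its init symbol
+//    (falls back to "mgb_c_opr_init" when no interface is given).
+// The ENV and loader-path parts are packed by make_extra_data() and appended
+// to each extern opr's data blob so the runtime can recover them.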
+static LoaderInfo loaderInfo; +class MGBOprDescImpl { static inline const std::pair>, std::vector>& get_output_info(const std::string& loader_name) { + const auto& name2outputinfo = loaderInfo.m_name_2_outputinfo; auto&& iter = name2outputinfo.find(loader_name); if (iter != name2outputinfo.end()) return iter->second; @@ -99,6 +105,58 @@ class MGBOprDescImpl { class MGBOprLoaderImpl { static std::map user_datas; + // extra_data format: + // total_len + // nr_env + // ENV_len_1:ENV_1:VALUE_len_1:VALUE_1 + // ENV_len_2.... + // loader_path_len:loader_path:interface_len:interface + static std::shared_ptr extra_data; + + static void make_extra_data() { + // calculate len + size_t len = 0; + size_t nr_env = loaderInfo.m_envs.size(); + len += sizeof(nr_env); // nr_env + for (const auto& env : loaderInfo.m_envs) { + size_t env_len = env.first.size(), value_len = env.second.size(); + len += sizeof(env_len) + env_len + sizeof(value_len) + + value_len; // ENV_len_x + ENV_x + VALUE_len_x + VALUE_x + } + len += sizeof(size_t) + + loaderInfo.m_loader_path_with_interface.first.size() + + sizeof(size_t) + + loaderInfo.m_loader_path_with_interface.second + .size(); // loader_path_len + loader_path + + // interface_len + interface + + extra_data = std::shared_ptr(malloc(sizeof(size_t) + len), free); + // fill memory + void* tmp_p = extra_data.get(); + *(size_t*)(tmp_p) = len; + tmp_p += sizeof(size_t); + *(size_t*)tmp_p = nr_env; + tmp_p += sizeof(size_t); + for (const auto& env : loaderInfo.m_envs) { + *(size_t*)tmp_p = env.first.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, env.first.c_str(), env.first.size()); + tmp_p += env.first.size(); + *(size_t*)tmp_p = env.second.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, env.second.c_str(), env.second.size()); + tmp_p += env.second.size(); + } + *(size_t*)tmp_p = loaderInfo.m_loader_path_with_interface.first.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, loaderInfo.m_loader_path_with_interface.first.c_str(), + loaderInfo.m_loader_path_with_interface.first.size()); + tmp_p += loaderInfo.m_loader_path_with_interface.first.size(); + *(size_t*)tmp_p = loaderInfo.m_loader_path_with_interface.second.size(); + tmp_p += sizeof(size_t); + memmove(tmp_p, loaderInfo.m_loader_path_with_interface.second.c_str(), + loaderInfo.m_loader_path_with_interface.second.size()); + } static MGBOprDesc* create_desc(size_t nr_input, const void* buf, size_t buf_len) { @@ -116,16 +174,16 @@ class MGBOprLoaderImpl { public: static std::map& get_user_datas() { return user_datas; } - static MGBOprLoader make() { return {"extern_opr_dummy", &create_desc}; } + static void* get_extra_data() { return extra_data.get(); } + static MGBOprLoader make() { + make_extra_data(); + return {"extern_opr_dummy", &create_desc}; + } }; std::map MGBOprLoaderImpl::user_datas = {}; +std::shared_ptr MGBOprLoaderImpl::extra_data = {}; -void mgb_c_opr_init_output_info( - const MGBExternCOprApi* (*get_api)(int), - const std::map>, - std::vector>>& output_info) { - name2outputinfo = std::move(output_info); +static void dummy_mgb_c_opr_init(const MGBExternCOprApi* (*get_api)(int)) { const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION); assert(api); MGBOprLoader loader = MGBOprLoaderImpl::make(); diff --git a/compiler/lib/Target/MGB/importer.cpp b/compiler/lib/Target/MGB/importer.cpp index e8867f0b..b7e19f13 100644 --- a/compiler/lib/Target/MGB/importer.cpp +++ b/compiler/lib/Target/MGB/importer.cpp @@ -61,6 +61,17 @@ llvm::cl::opt ExternOprOutputDType( "The available values are float32, 
int32, uint8, float16, " "int16. e.g., \"float32;int32;uint8:float16;int16\". Default " "value is float32.")); +llvm::cl::opt ExternOprLoaderPathWithInterface( + "loader-path-with-interface", llvm::cl::Optional, + llvm::cl::desc("specific extern opr loader path with interface. If " + "\"interface\" " + "is not provided, using \"mgb_c_opr_init\" default."), + llvm::cl::value_desc("loader_path:interface")); +llvm::cl::opt ExternOprLoaderEnv( + "set-extern-opr-env", llvm::cl::Optional, + llvm::cl::desc("set ENV for all extern opr loader, must surrounded by " + "\" if set multiple ENV."), + llvm::cl::value_desc("\"ENV_1=VALUE_1;ENV_2=VALUE_2...\"")); using namespace mgb; using namespace llvm; @@ -172,10 +183,8 @@ inline std::vector split(std::string str, return res; } -inline void parse_extern_output_info() { - std::map>, - std::vector>> - name2outputinfo; +inline void parse_extern_loader_info() { + auto& name2outputinfo = loaderInfo.m_name_2_outputinfo; std::string extern_opr_output_shapes = ExternOprOutputShape; if (extern_opr_output_shapes.size()) { @@ -290,9 +299,34 @@ inline void parse_extern_output_info() { } } } + } + + // parse ENV + std::string env = ExternOprLoaderEnv; + if (env.size()) { + auto&& env_values = split(env, ";"); + for (auto&& env_value : env_values) { + auto&& env_value_vec = split(env_value, "="); + CC_ASSERT((env_value_vec.size() == 2)) + << "Wrong format. Set ENV using \"ENV=VALUE\""; + loaderInfo.m_envs[env_value_vec[0]] = env_value_vec[1]; + } + } - mgb_c_opr_init_output_info(mgb_get_extern_c_opr_api_versioned, - name2outputinfo); + // parse loader path and interface + std::string loaderPathWithInterface = ExternOprLoaderPathWithInterface; + if (loaderPathWithInterface.size()) { + auto&& loaderPath_interface = split(loaderPathWithInterface, ":"); + CC_ASSERT((loaderPath_interface.size() <= 2)) + << "Wrong format. 
Specify loader path and interface using " + "loader_path[:interface]"; + loaderInfo.m_loader_path_with_interface.first = loaderPath_interface[0]; + if (loaderPath_interface.size() == 1 || loaderPath_interface[1] == "") { + loaderInfo.m_loader_path_with_interface.second = "mgb_c_opr_init"; + } else { + loaderInfo.m_loader_path_with_interface.second = + loaderPath_interface[1]; + } } } @@ -332,7 +366,8 @@ class Importer { m_loader = serialization::GraphLoader::make(std::move(inp_file), format.val()); - parse_extern_output_info(); + parse_extern_loader_info(); + dummy_mgb_c_opr_init(mgb_get_extern_c_opr_api_versioned); LOG_DEBUG << "Process mgb graph\n"; process_graph(options); @@ -977,6 +1012,7 @@ class Importer { } else if (auto extern_opr = opr->try_cast_final()) { auto user_datas = MGBOprLoaderImpl::get_user_datas(); + void* extra_data = MGBOprLoaderImpl::get_extra_data(); void* _data = nullptr; if (user_datas.find(opr->name()) != user_datas.end()) { @@ -986,6 +1022,11 @@ class Importer { std::string data( reinterpret_cast(_data + sizeof(size_t)), *(size_t*)(_data)); + uint32_t data_len = static_cast(data.size()); + if (extra_data) + data += std::string(reinterpret_cast( + extra_data + sizeof(size_t)), + *(size_t*)(extra_data)); free(_data); std::vector v_resultTypes(opr->output().size()); @@ -999,7 +1040,7 @@ class Importer { auto values = m_builder.create( m_builder.getUnknownLoc(), v_resultTypes, var_array_to_value_array(opr->input()), opr->name(), data, - static_cast(data.size()), nr_input, nr_output); + data_len, nr_input, nr_output); for (int i = 0; i < opr->output().size(); ++i) { m_var2value.emplace(opr->output(i), values.getResult(i)); } diff --git a/compiler/lib/Target/TinyNN/exporter.cpp b/compiler/lib/Target/TinyNN/exporter.cpp index 41dca537..5cad1962 100644 --- a/compiler/lib/Target/TinyNN/exporter.cpp +++ b/compiler/lib/Target/TinyNN/exporter.cpp @@ -312,7 +312,7 @@ class Exporter { std::string name(op.name().data(), op.name().size()); std::string data(op.data().data(), op.data().size()); - uint32_t data_len = data.size(); + uint32_t data_len = op.data_len(); LOG_DEBUG << "Add ExternOpr instruction.\n"; instructions_type.push_back( @@ -462,7 +462,7 @@ class Exporter { output_tensor = tensor.second; auto descs = llvm::to_vector<4>( op.descs().getAsRange()); - auto flags= llvm::to_vector<4>( + auto flags = llvm::to_vector<4>( op.flags().getAsRange()); std::vector> descs_; std::vector> flags_; @@ -478,7 +478,8 @@ class Exporter { auto descs_fbs = m_fbs_builder.CreateVector(descs_); auto flags_fbs = m_fbs_builder.CreateVector(flags_); - MegCC::SubTensorBuilder subtensor_builder(m_fbs_builder); + MegCC::SubTensorBuilder subtensor_builder( + m_fbs_builder); subtensor_builder.add_inputs(input_tensors_); subtensor_builder.add_input_types(input_types_); subtensor_builder.add_output(output_tensor); @@ -486,8 +487,10 @@ class Exporter { subtensor_builder.add_flags(flags_fbs); LOG_DEBUG << "Add subtensor instruction.\n"; - instructions_type.push_back(MegCC::Instruction_SubTensor); - instructions.push_back(subtensor_builder.Finish().Union()); + instructions_type.push_back( + MegCC::Instruction_SubTensor); + instructions.push_back( + subtensor_builder.Finish().Union()); }) .Case([&](Kernel::SetSubtensorIns op) { kernel_exporter.addInst("SETSUBTENSOR"); @@ -593,7 +596,8 @@ class Exporter { auto&& out_tensor = value2typed_tensor.at( op.result().getAsOpaquePointer()); LOG_DEBUG << "Add Broadcast instruction.\n"; - instructions_type.push_back(MegCC::Instruction_BroadCast); + 
instructions_type.push_back( + MegCC::Instruction_BroadCast); instructions.push_back( MegCC::CreateBroadCast( m_fbs_builder, input_tensors_, @@ -751,7 +755,7 @@ class Exporter { auto&& out_tensor = value2typed_tensor.at( op.result().getAsOpaquePointer()); - auto mat_id= op.mat_idx(); + auto mat_id = op.mat_idx(); auto member = llvm::to_vector<4>( mat_id.getAsRange()); std::vector mat_id_v; @@ -1008,8 +1012,8 @@ class Exporter { m_fbs_builder.CreateString(name)); } - Offset indexdesc_to_fbs(ArrayAttr desc){ - CC_ASSERT(desc.size()==5); + Offset indexdesc_to_fbs(ArrayAttr desc) { + CC_ASSERT(desc.size() == 5); auto member = llvm::to_vector<5>(desc.getAsRange()); return MegCC::CreateIndexDesc(m_fbs_builder, member[0].getInt(), member[1].getInt(), member[2].getInt(), diff --git a/runtime/example/standard_OS/lite_main.c b/runtime/example/standard_OS/lite_main.c index ce7fe294..ff469b35 100644 --- a/runtime/example/standard_OS/lite_main.c +++ b/runtime/example/standard_OS/lite_main.c @@ -196,6 +196,8 @@ static void* dlsym(void* handle, const char* name) { #include #endif +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int version); + int main(int argc, char** argv) { LITE_set_log_level(WARN); #if TINYNN_CALLBACK_ENABLE @@ -279,7 +281,7 @@ int main(int argc, char** argv) { void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; *(void**)&func = dlsym(handle, c_opr_lib_interface); EXAMPLE_ASSERT(func, "load init interface of loader failed.\n"); - func(mgb_get_extern_c_opr_api_versioned); + func(megcc_get_extern_c_opr_api_versioned); } LiteNetwork model; diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c index e39ca310..4af4aec0 100644 --- a/runtime/src/vm/extern_opr.c +++ b/runtime/src/vm/extern_opr.c @@ -6,9 +6,9 @@ * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ -#include "vm.h" -#include "utils.h" #include "extern_c_opr.h" +#include "utils.h" +#include "vm.h" #if ENABLE_INST_EXTERN_OPR @@ -97,10 +97,11 @@ static void free_loader_maps(LoaderMapVec* lm) { } //! get API ptr for specific version; return nullptr if version mismatch -const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int version) { static MGBExternCOprApi api; api.unregister_loader = unregister_loader; - TINYNN_ASSERT_MSG(version >= 0x24, "Extern opr loader version must greater than 0x24.\n"); + TINYNN_ASSERT_MSG(version >= 0x24, + "Extern opr loader version must greater than 0x24.\n"); if (version != MGB_EXTERN_C_OPR_VERSION) { return NULL; @@ -111,12 +112,12 @@ const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int version) { } // Convert Tensor to MGBTensor, except MGBTensor.data. 
-static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ +static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor) { mgb_tensor->layout.shape.ndim = tensor->layout.nr_dim; - for(int i = 0; i < tensor->layout.nr_dim; ++i){ + for (int i = 0; i < tensor->layout.nr_dim; ++i) { mgb_tensor->layout.shape.shape[i] = tensor->layout.dims[i]; } - switch(tensor->dtype.type_enum){ + switch (tensor->dtype.type_enum) { case TinyNN_FLOAT: mgb_tensor->layout.dtype = MGB_DTYPE_FLOAT32; break; @@ -137,13 +138,13 @@ static void Tensor2MGBTensor(const Tensor* tensor, MGBTensor* mgb_tensor){ } } -static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ +static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor) { tensor->layout.nr_dim = mgb_tensor->layout.shape.ndim; - for(int i = 0; i < mgb_tensor->layout.shape.ndim; ++i){ + for (int i = 0; i < mgb_tensor->layout.shape.ndim; ++i) { tensor->layout.dims[i] = mgb_tensor->layout.shape.shape[i]; } - switch(mgb_tensor->layout.dtype){ + switch (mgb_tensor->layout.dtype) { case MGB_DTYPE_FLOAT32: tensor->dtype.type_enum = TinyNN_FLOAT; break; @@ -164,6 +165,26 @@ static void MGBTensor2Tensor(const MGBTensor* mgb_tensor, Tensor* tensor){ } } +#if defined(_WIN32) +#include +#include +#define RTLD_LAZY 0 + +static void* dlopen(const char* file, int) { + return (void*)(LoadLibrary(file)); +} + +static void* dlsym(void* handle, const char* name) { + FARPROC symbol = GetProcAddress((HMODULE)handle, name); + return (void*)symbol; +} + +#else +#include +#endif + +static int has_set_env_and_loader = 0; + static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, VM* vm) { ExternOpr* extern_opr = &inst->workload.extern_opr; @@ -174,53 +195,140 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, flatbuffers_int32_vec_t fbs_inputs = ns(ExternOpr_input(fbs_extern_opr)); extern_opr->nr_input = flatbuffers_int32_vec_len(fbs_inputs); extern_opr->inputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_input); - for(int i = 0; i < extern_opr->nr_input; ++i){ + for (int i = 0; i < extern_opr->nr_input; ++i) { extern_opr->inputs[i] = model->tensors + fbs_inputs[i]; } flatbuffers_int32_vec_t fbs_outputs = ns(ExternOpr_output(fbs_extern_opr)); extern_opr->nr_output = flatbuffers_int32_vec_len(fbs_outputs); - extern_opr->outputs = tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->outputs = + tinynn_malloc(sizeof(Tensor*) * extern_opr->nr_output); + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->outputs[i] = model->tensors + fbs_outputs[i]; } - const char* name = ns(ExternOpr_name(fbs_extern_opr)); + char* name = ns(ExternOpr_name(fbs_extern_opr)); const void* data = ns(ExternOpr_data(fbs_extern_opr)); size_t data_len = ns(ExternOpr_data_len(fbs_extern_opr)); + int idx = 0; + while (name[idx] != '\0' && name[idx] != ':') + ++idx; + name[idx] = '\0'; + + if (!has_set_env_and_loader) { + const void* extra_data = data + data_len; + // parse and set ENV + size_t nr_env = *(size_t*)extra_data; + extra_data += sizeof(size_t); + for (int i = 0; i < nr_env; ++i) { + size_t env_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* env = (char*)tinynn_malloc(env_len + 1); + memcpy(env, extra_data, env_len); + env[env_len] = '\0'; + extra_data += env_len; + + size_t value_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* value = (char*)tinynn_malloc(value_len + 1); + 
memcpy(value, extra_data, value_len); + value[value_len] = '\0'; + extra_data += value_len; + + TINYNN_ASSERT_MSG((!setenv(env, value, 1)), + "setenv failed.\n"); // 1 means overwrite when + // 'env' does exist. + LOG_DEBUG("Set ENV: %s=%s\n", env, value); + + tinynn_free(env); + tinynn_free(value); + } + + // load loader + size_t loader_path_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + if (loader_path_len) { + char* loader_path = tinynn_malloc(loader_path_len + 1); + memcpy(loader_path, extra_data, loader_path_len); + extra_data += loader_path_len; + loader_path[loader_path_len] = '\0'; + LOG_DEBUG("Try to load loader in path %s.\n", loader_path); + void* handle = dlopen(loader_path, RTLD_LAZY); + // if dlopen failed, but loader path is NOT absolute path. + if (!handle && loader_path[0] != '/') { + // try current path + char* extend_loader_path = tinynn_malloc(loader_path_len + 3); + extend_loader_path[0] = '.'; + extend_loader_path[1] = '/'; + memcpy(extend_loader_path + 2, loader_path, + loader_path_len + 1); + LOG_DEBUG( + "Load loader in path %s failed. Now try to load loader " + "in path %s.\n", + loader_path, extend_loader_path); + handle = dlopen(extend_loader_path, RTLD_LAZY); + tinynn_free(extend_loader_path); + } + tinynn_free(loader_path); + TINYNN_ASSERT_MSG(handle, + "Load loader failed. Can NOT find loader file in " + "given path.\n"); + + size_t interface_len = *(size_t*)extra_data; + extra_data += sizeof(size_t); + char* c_opr_lib_interface = tinynn_malloc(interface_len + 1); + memcpy(c_opr_lib_interface, extra_data, interface_len); + c_opr_lib_interface[interface_len] = '\0'; + void (*func)(const MGBExternCOprApi* (*)(int)) = NULL; + *(void**)&func = dlsym(handle, c_opr_lib_interface); + tinynn_free(c_opr_lib_interface); + TINYNN_ASSERT_MSG(func, "load init interface of loader failed.\n"); + func(megcc_get_extern_c_opr_api_versioned); + } + has_set_env_and_loader = 1; + } LoaderMap* loader_map = find_loader_by_name(&loader_maps, name); TINYNN_ASSERT_MSG(loader_map, "Wrong loader.\n"); extern_opr->desc = loader_map->loader.create_desc(extern_opr->nr_input, - data, data_len); - - extern_opr->mgb_inputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); - MGBTensorShape* inputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); - MGBDType* inputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); - for(int i = 0; i < extern_opr->nr_input; ++i){ + data, data_len); + + extern_opr->mgb_inputs = + tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_input); + MGBTensorShape* inputs_shape = + tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_input); + MGBDType* inputs_type = + tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_input); + for (int i = 0; i < extern_opr->nr_input; ++i) { Tensor2MGBTensor(extern_opr->inputs[i], extern_opr->mgb_inputs + i); inputs_shape[i] = extern_opr->mgb_inputs[i].layout.shape; inputs_type[i] = extern_opr->mgb_inputs[i].layout.dtype; } - extern_opr->mgb_outputs = tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); - MGBTensorShape* outputs_shape = tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); - MGBDType* outputs_type = tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); - - extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, outputs_shape); - if(extern_opr->desc->infer_dtype){ - extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, outputs_type); - }else{ - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->mgb_outputs = + 
tinynn_malloc(sizeof(MGBTensor) * extern_opr->nr_output); + MGBTensorShape* outputs_shape = + tinynn_malloc(sizeof(MGBTensorShape) * extern_opr->nr_output); + MGBDType* outputs_type = + tinynn_malloc(sizeof(MGBDType) * extern_opr->nr_output); + + extern_opr->desc->infer_shape(extern_opr->desc, inputs_shape, + outputs_shape); + if (extern_opr->desc->infer_dtype) { + extern_opr->desc->infer_dtype(extern_opr->desc, inputs_type, + outputs_type); + } else { + for (int i = 0; i < extern_opr->nr_output; ++i) { outputs_type[i] = inputs_type[0]; } } - for(int i = 0; i < extern_opr->nr_output; ++i){ + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->mgb_outputs[i].layout.dtype = outputs_type[i]; extern_opr->mgb_outputs[i].layout.shape.ndim = outputs_shape[i].ndim; - for(int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j){ - extern_opr->mgb_outputs[i].layout.shape.shape[j] = outputs_shape[i].shape[j]; + for (int j = 0; j < extern_opr->mgb_outputs[i].layout.shape.ndim; ++j) { + extern_opr->mgb_outputs[i].layout.shape.shape[j] = + outputs_shape[i].shape[j]; } } @@ -236,14 +344,15 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, static TinyNNStatus execute(Instruction* inst, VM* vm) { ExternOpr* extern_opr = &inst->workload.extern_opr; - for(int i = 0; i < extern_opr->nr_input; ++i){ + for (int i = 0; i < extern_opr->nr_input; ++i) { extern_opr->mgb_inputs[i].data = extern_opr->inputs[i]->ptr; } - for(int i = 0; i < extern_opr->nr_output; ++i){ + for (int i = 0; i < extern_opr->nr_output; ++i) { extern_opr->mgb_outputs[i].data = extern_opr->outputs[i]->ptr; } - extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, extern_opr->mgb_outputs); - for(int i = 0; i < extern_opr->nr_output; ++i){ + extern_opr->desc->execute(extern_opr->desc, extern_opr->mgb_inputs, + extern_opr->mgb_outputs); + for (int i = 0; i < extern_opr->nr_output; ++i) { MGBTensor2Tensor(extern_opr->mgb_outputs + i, extern_opr->outputs[i]); } @@ -271,7 +380,7 @@ void register_extern_opr(VM* vm) { #else void register_extern_opr(VM* vm) {} -const MGBExternCOprApi* mgb_get_extern_c_opr_api_versioned(int i) { +const MGBExternCOprApi* megcc_get_extern_c_opr_api_versioned(int i) { TINYNN_ASSERT_MSG( 0, "Should NOT execute here!!!\n" diff --git a/runtime/version.ld b/runtime/version.ld index 76275193..bddbdb30 100644 --- a/runtime/version.ld +++ b/runtime/version.ld @@ -4,7 +4,7 @@ global: default_config; default_network_io; register_tinynn_cb; - mgb_get_extern_c_opr_api_versioned; + megcc_get_extern_c_opr_api_versioned; local: diff --git a/script/ppl_build.sh b/script/ppl_build.sh index c6d6f946..815191aa 100755 --- a/script/ppl_build.sh +++ b/script/ppl_build.sh @@ -2,4 +2,4 @@ set -ex PROJECT_PATH="$(dirname $(readlink -f $0))/" KERNEL_DIR="${PROJECT_PATH}/kern/" -${PROJECT_PATH}/script/runtime_build.py --cross_build --kernel_dir ${KERNEL_DIR} --remove_old_build --specify_build_dir ${PROJECT_PATH}/build $@ +${PROJECT_PATH}/runtime/script/runtime_build.py --cross_build --kernel_dir ${KERNEL_DIR} --remove_old_build --specify_build_dir ${PROJECT_PATH}/build $@ diff --git a/script/ppl_gen.sh b/script/ppl_gen.sh index e0661202..dee56295 100755 --- a/script/ppl_gen.sh +++ b/script/ppl_gen.sh @@ -16,23 +16,25 @@ RUNTIME_PATH=${PROJECT_PATH}/runtime mkdir -p ${OUT_DIR} KERN_DIR="${OUT_DIR}/kern/" rm -fr ${OUT_DIR}/* +mkdir -p "${OUT_DIR}/runtime" mkdir -p "${OUT_DIR}/model" mkdir -p "${OUT_DIR}/model_info" -mkdir -p "${OUT_DIR}/script" +mkdir -p "${OUT_DIR}/runtime/script" mkdir 
-p "${KERN_DIR}" ${DUMP_APP} --json="${JSON_PATH}" "${ARCH_SPECIFIC}" --dump="${KERN_DIR}" ${EXTRA_DUMP_CMD} -cp -r "${RUNTIME_PATH}/flatcc" "${OUT_DIR}/flatcc" -cp -r "${RUNTIME_PATH}/include" "${OUT_DIR}/include" -cp -r "${RUNTIME_PATH}/schema" "${OUT_DIR}/schema" -cp -r "${RUNTIME_PATH}/example" "${OUT_DIR}/example" -cp -r "${RUNTIME_PATH}/src" "${OUT_DIR}/src" -cp "${RUNTIME_PATH}/CMakeLists.txt" "${OUT_DIR}/CMakeLists.txt" +cp -r "${RUNTIME_PATH}/flatcc" "${OUT_DIR}/runtime/flatcc" +cp -r "${RUNTIME_PATH}/include" "${OUT_DIR}/runtime/include" +cp -r "${RUNTIME_PATH}/schema" "${OUT_DIR}/runtime/schema" +cp -r "${RUNTIME_PATH}/example" "${OUT_DIR}/runtime/example" +cp -r "${RUNTIME_PATH}/src" "${OUT_DIR}/runtime/src" +cp -r "${PROJECT_PATH}/immigration" "${OUT_DIR}/immigration" +cp "${RUNTIME_PATH}/CMakeLists.txt" "${OUT_DIR}/runtime/CMakeLists.txt" MODEL_FILE=`find ${OUT_DIR}/kern/ -name "*.tiny"` if [ ! -z "${MODEL_FILE}" ];then mv ${OUT_DIR}/kern/*.tiny "${OUT_DIR}/model" mv ${OUT_DIR}/kern/*.tiny.txt "${OUT_DIR}/model_info" fi cp -a "${PROJECT_PATH}"/script/{ppl_build.sh,test_model.py} "${OUT_DIR}/" -cp "${PROJECT_PATH}/runtime/scripts/runtime_build.py" "${OUT_DIR}/script/" +cp "${RUNTIME_PATH}/scripts/runtime_build.py" "${OUT_DIR}/runtime/script/" cp "${JSON_PATH}" "${OUT_DIR}/" tar -czf megcc_ppl_gen.tar.gz "${OUT_DIR}" From 2647e9b69c0a5a939210984286e3868b9cce269b Mon Sep 17 00:00:00 2001 From: zhanghaolong Date: Fri, 6 Jan 2023 16:42:06 +0800 Subject: [PATCH 09/17] feat(compiler): support qsi8qsi8qsi32qsi32 for Int8DotConv1x1Mk4M8N12 --- .../lib/KernelGen/Arm/Arm64/Activation.cpp | 2 +- .../Int8/Int8DotConv1x1Mk4M8N12.cpp | 11 ++++++++-- .../InternalKernel/Int8DotM8N12MK4GEMM.cpp | 19 +++++++++++----- .../Arm/ArmCommon/InternalKernel.cpp | 22 +++++++++++++++++-- .../lib/KernelGen/Arm/ArmCommon/Typecvt.cpp | 1 + compiler/lib/KernelGen/Common/ConvKernel.h | 10 +++++++-- compiler/lib/KernelGen/Utils/Utils.h | 3 ++- runtime/src/vm/registry.h | 2 ++ 8 files changed, 57 insertions(+), 13 deletions(-) diff --git a/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp b/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp index 3ef6a2be..6f07e8a4 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/Activation.cpp @@ -191,7 +191,7 @@ std::string ActivationGenAsmBase::GenAsmQuantStore( } ss << gener.render(temp_ss.str()); } else { - CC_ASSERT(dst_specifier == "int32_t"); + CC_ASSERT(dst_specifier == "int32_t" || dst_specifier == "int"); if (!with_store) { return ""; } diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp index 04d25398..0e055ab1 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel/Int8/Int8DotConv1x1Mk4M8N12.cpp @@ -40,7 +40,7 @@ bool Conv1x1DotMk4::IsAvailable(TContext* ctx) const { ctx->getAttrStr("nonlineMode") == "RELU" || ctx->getAttrStr("nonlineMode") == "H_SWISH"; - bool type_ok = is_qint8_conv_dtype(ctx); + bool type_ok = is_qint8_conv_dtype(ctx, true); bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && ctx->getAttrOprand("operand:0").shape[4] == 4; @@ -186,6 +186,9 @@ std::shared_ptr Conv1x1DotMk4::GetInnerCtx(TContext* ctx) const { inner_ctx->setAttr("transposeB", false); inner_ctx->setAttr("format", "MK4_DOT"); inner_ctx->setAttr("dtype", ctx->getAttrOprand("operand:0").dtype); + auto last_dtype = 
Utils::get_last_operand(ctx).dtype; + auto last_dtype_str = SymbolHelper::gen_valid_dtype(last_dtype); + inner_ctx->setAttr("last_dtype", last_dtype_str); return inner_ctx; } @@ -202,6 +205,9 @@ std::string Conv1x1DotMk4::GetKernelBody(TContext* ctx) const { gen_temp_dst = "void* temp_dst = (int8_t*) workspace_ptr + pack_b_align;"; } + auto last_dtype = Utils::get_last_operand(ctx).dtype; + auto last_dtype_str = SymbolHelper::gen_valid_dtype(last_dtype); + std::string dst_specifier = Utils::cvt_dtype_specifier(last_dtype_str); writer << StringTemplate::StringTemplateArgs() .add("bias_ptr_str", bias_ptr_str) .add("packb_size_sym", @@ -212,9 +218,10 @@ std::string Conv1x1DotMk4::GetKernelBody(TContext* ctx) const { .add("naked_kern_sym", m_inner_gemm.GetNakedKernelSymbol(inner_ctx.get())) .add("gen_temp_dst", gen_temp_dst) + .add("dst_specifier", dst_specifier) .render(R"({ int8_t* input_data = inputs[0]->ptr; - int8_t* output_data = outputs[0]->ptr; + ${dst_specifier}* output_data = outputs[0]->ptr; Layout in_layout = inputs[0]->layout; Layout out_layout = outputs[0]->layout; diff --git a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp index 7517b0bf..aed24e9f 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp +++ b/compiler/lib/KernelGen/Arm/Arm64/InternalKernel/Int8DotM8N12MK4GEMM.cpp @@ -9,11 +9,11 @@ #include "Arm/Arm64/Activation.h" #include "Arm/ArmCommon/MatmulCommon.h" +#include "Arm/ArmCommon/common_asm_utils.h" #include "InternalKernel.h" #include "Utils/StringTemplate.h" #include "Utils/Utils.h" #include "compiler/Common/Logger.h" -#include "Arm/ArmCommon/common_asm_utils.h" using namespace megcc; using namespace KernelGen; using namespace Arm64; @@ -51,10 +51,10 @@ std::string interleave_1x4_4_b() { } std::string prefetch() { - return R"( + return R"( #define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n" - )" + KernelGen::ArmCommon::gen_common_prefetch_2x_f32() - + KernelGen::ArmCommon::gen_common_prefetch_3x_f32(); + )" + KernelGen::ArmCommon::gen_common_prefetch_2x_f32() + + KernelGen::ArmCommon::gen_common_prefetch_3x_f32(); } std::string transpose_1x12() { @@ -1500,6 +1500,11 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelSymbol(TContext* ctx) const { CC_ASSERT(dtype == "8832"); ss << "_" << dtype; } + if (ctx->haveAttr("last_dtype")) { + auto last_dtype = ctx->getAttrStr("last_dtype"); + ss << "_" + << "output_dtype_" << last_dtype; + } return ss.str(); } @@ -1533,6 +1538,10 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelBody(TContext* ctx) const { writer << prefetch(); writer << transpose_1x12(); auto dtype = ctx->getAttrStr("dtype"); + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } std::string dst_specifier = "int32_t"; auto nonline_mode = ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") @@ -1540,7 +1549,7 @@ std::string MatmulInt8DotM8N12MK4Kernel::GetKernelBody(TContext* ctx) const { if (Utils::is_quant_dtype(dtype) && (nonline_mode == "RELU" || nonline_mode == "IDENTITY" || nonline_mode == "H_SWISH")) { - dst_specifier = "int8_t"; + dst_specifier = Utils::cvt_dtype_specifier(last_dtype); } //! 
sigmoid use explicit postprocess bool need_temp_dst = need_post_process(ctx); diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp index ea57df76..66df056c 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/InternalKernel.cpp @@ -25,8 +25,17 @@ std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { return R"((const float* pack_a, const float* pack_b, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr))"; } else if (Utils::is_quant_dtype(dtype, 8)) { - return R"((const int8_t* pack_a, const int8_t* pack_b, int8_t* C, + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } + if (Utils::is_int_dtype(last_dtype, 32)) { + return R"((const int8_t* pack_a, const int8_t* pack_b, int* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } else { + return R"((const int8_t* pack_a, const int8_t* pack_b, int8_t* C, + size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } } else if (dtype == "8832") { return R"((const int8_t* pack_a, const int8_t* pack_b, int32_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, float scale))"; @@ -42,8 +51,17 @@ std::string MatmulInternal::GenKernelCall(TContext* ctx) { return R"((const float* A, size_t LDA, const float* B, size_t LDB, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr, void* workspace))"; } else if (Utils::is_quant_dtype(dtype, 8)) { - return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int8_t* C, + std::string last_dtype = "si8"; + if (ctx->haveAttr("last_dtype")) { + last_dtype = ctx->getAttrStr("last_dtype"); + } + if (Utils::is_int_dtype(last_dtype, 32)) { + return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int* C, + size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } else { + return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int8_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale, float temp_scale, float dst_scale_inv))"; + } } else if (dtype == "8832") { return R"((const int8_t* A, size_t LDA, const int8_t* B, size_t LDB, int32_t* C, size_t LDC, size_t M, size_t N, size_t K, const int32_t* bias_ptr, void* workspace, float scale))"; diff --git a/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp b/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp index 4abcb519..af9ee0d4 100644 --- a/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp +++ b/compiler/lib/KernelGen/Arm/ArmCommon/Typecvt.cpp @@ -195,6 +195,7 @@ std::string TypecvtKernel::GetKernelBody(TContext* context) const { std::string dst_specifier = Utils::cvt_dtype_specifier(dst_dtype_str); ss << R"( #include + #include )"; ss << gen_neon_intrin_compat(); ss << init_declare(src_dtype_str, dst_dtype_str); diff --git a/compiler/lib/KernelGen/Common/ConvKernel.h b/compiler/lib/KernelGen/Common/ConvKernel.h index 1c946403..6c25b265 100644 --- a/compiler/lib/KernelGen/Common/ConvKernel.h +++ b/compiler/lib/KernelGen/Common/ConvKernel.h @@ -26,14 +26,20 @@ class ConvImpl : public KernelFunc { } std::string GetKernelSymbol(TContext* context) const override; - 
static bool is_qint8_conv_dtype(TContext* ctx) { + static bool is_qint8_conv_dtype(TContext* ctx, + bool is_dst_support_si32 = false) { bool type_ok = ctx->getAttrInt("nr_operands") >= 3; auto dst_dtype = Utils::get_last_operand(ctx).dtype; type_ok = type_ok && Utils::is_quant_dtype( ctx->getAttrOprand("operand:0").dtype, 8); type_ok = type_ok && Utils::is_quant_dtype( ctx->getAttrOprand("operand:1").dtype, 8); - type_ok = type_ok && Utils::is_quant_dtype(dst_dtype, 8); + if (is_dst_support_si32) { + type_ok = type_ok && (Utils::is_quant_dtype(dst_dtype, 8) || + Utils::is_quant_dtype(dst_dtype, 32)); + } else { + type_ok = type_ok && Utils::is_quant_dtype(dst_dtype, 8); + } if (is_bias(ctx)) { type_ok = type_ok && Utils::is_quant_dtype( diff --git a/compiler/lib/KernelGen/Utils/Utils.h b/compiler/lib/KernelGen/Utils/Utils.h index 11f4ae0b..a46b86b7 100644 --- a/compiler/lib/KernelGen/Utils/Utils.h +++ b/compiler/lib/KernelGen/Utils/Utils.h @@ -63,7 +63,8 @@ static inline bool is_float_dtype(const std::string& dtype, static inline bool is_int_dtype(const std::string& dtype, int bit_width = -1) { if (bit_width == 8 && (dtype == "i8" || dtype == "si8" || dtype == "ui8")) { return true; - } else if (bit_width == 32 && (dtype == "i32" || dtype == "si32")) { + } else if (bit_width == 32 && + (dtype == "i32" || dtype == "si32" || dtype == "qsi32")) { return true; } else if (bit_width == 16 && (dtype == "i16" || dtype == "ui16")) { return true; diff --git a/runtime/src/vm/registry.h b/runtime/src/vm/registry.h index 83b11721..5e48e78d 100644 --- a/runtime/src/vm/registry.h +++ b/runtime/src/vm/registry.h @@ -29,6 +29,8 @@ void register_broadcast_shape_of(VM* vm); void register_reshape(VM* vm); +void register_extern_opr(VM* vm); + #endif // VM_REGISTRY_H // vim: syntax=cpp.doxygen From 5a265083f08dcbc80ab6bd25b42722161e78b404 Mon Sep 17 00:00:00 2001 From: limingxin Date: Fri, 6 Jan 2023 19:18:55 +0800 Subject: [PATCH 10/17] ci: add extern c opr loader test --- compiler/include/compiler/Target/MGB/dummy_loader.h | 8 ++++---- runtime/src/vm/extern_opr.c | 6 ++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/compiler/include/compiler/Target/MGB/dummy_loader.h b/compiler/include/compiler/Target/MGB/dummy_loader.h index 49cce414..842da8bd 100644 --- a/compiler/include/compiler/Target/MGB/dummy_loader.h +++ b/compiler/include/compiler/Target/MGB/dummy_loader.h @@ -106,11 +106,11 @@ class MGBOprDescImpl { class MGBOprLoaderImpl { static std::map user_datas; // extra_data format: - // total_len - // nr_env - // ENV_len_1:ENV_1:VALUE_len_1:VALUE_1 + // total_len(size_t) + // nr_env(size_t) + // ENV_len_1(size_t):ENV_1(char[ENV_len_1]):VALUE_len_1(size_t):VALUE_1(char[VALUE_len_1]) // ENV_len_2.... - // loader_path_len:loader_path:interface_len:interface + // loader_path_len(size_t):loader_path(char[loader_path_len]):interface_len(size_t):interface(char[interface_len]) static std::shared_ptr extra_data; static void make_extra_data() { diff --git a/runtime/src/vm/extern_opr.c b/runtime/src/vm/extern_opr.c index 4af4aec0..12d7dbdd 100644 --- a/runtime/src/vm/extern_opr.c +++ b/runtime/src/vm/extern_opr.c @@ -217,6 +217,12 @@ static TinyNNStatus load(flatbuffers_generic_t fbs_inst, Instruction* inst, if (!has_set_env_and_loader) { const void* extra_data = data + data_len; + // extra_data format: + // nr_env(size_t) + // ENV_len_1(size_t):ENV_1(char[ENV_len_1]):VALUE_len_1(size_t):VALUE_1(char[VALUE_len_1]) + // ENV_len_2.... 
+ // loader_path_len(size_t):loader_path(char[loader_path_len]):interface_len(size_t):interface(char[interface_len]) + // parse and set ENV size_t nr_env = *(size_t*)extra_data; extra_data += sizeof(size_t); From 8c4ef921906f880ce35cc4e5e11d20f1769612f2 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 10:39:52 +0800 Subject: [PATCH 11/17] feat(compiler): basic change for more winograd algo --- compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h | 5 +- .../lib/KernelGen/BareMetal/ConvKernel.cpp | 53 ++++--- .../GeneralIntrinsic/ConvKernel/ConvKernel.h | 37 +++++ .../ConvKernel/Fp32Conv1x1Mk4M4N12.cpp | 50 +++--- .../ConvKernel/Fp32ConvNchwNchw44.cpp | 32 ++-- .../ConvKernel/Fp32Im2col.cpp | 53 ++++--- .../ConvKernel/Fp32WinogradNchw44.cpp | 149 ++++++++++++++++++ .../ConvKernel/Winograd/WinogradCommon.cpp | 18 ++- .../ConvKernel/Winograd/WinogradCommon.h | 2 +- .../Winograd/WinogradF23Strategy4x8MK4.cpp | 2 +- .../Winograd/WinogradF23Strategy4x8MK4.h | 2 +- .../InternalKernel/Fp32M4N12K4Matmul.cpp | 18 ++- .../InternalKernel/InternalKernel.cpp | 8 +- .../InternalKernel/InternalKernel.h | 4 +- .../KernelGen/GeneralIntrinsic/KernelPack.cpp | 8 +- compiler/lib/KernelGen/KernelGen.cpp | 74 +++++++-- immigration/include/marm_neon.h | 2 +- 17 files changed, 394 insertions(+), 123 deletions(-) diff --git a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h index ba959e35..dee5bd6f 100644 --- a/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h +++ b/compiler/lib/KernelGen/Arm/Arm64/ConvKernel.h @@ -9,9 +9,9 @@ #pragma once #include #include +#include "Arm/ArmCommon/ConvKernel/Fp32/Winograd/WinogradCommon.h" #include "Common/ConvKernel.h" #include "ConvKernel/Fp32/Winograd/WinogradF23Strategy4x16MK4.h" -#include "Arm/ArmCommon/ConvKernel/Fp32/Winograd/WinogradCommon.h" #include "InternalKernel/InternalKernel.h" #include "Utils/StringTemplate.h" #include "Utils/SymbolHelper.h" @@ -138,6 +138,7 @@ class ChannelWiseInt8Mk4K3 : public Arm64ConvImpl { class WinogradFloatF23Nchw44 : public Arm64ConvImpl { mutable ArmCommon::WinogradFrameNchw44 m_framework; mutable WinogradF23Strategy4x16MK4 m_winograd_strategy; + public: bool IsAvailable(TContext* context) const override; //! 
kernel gen @@ -154,6 +155,6 @@ class WinogradFloatF23Nchw44 : public Arm64ConvImpl { } // namespace Arm64 } // namespace KernelGen -} // namespace megcc +} // namespace megcc // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp b/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp index f7e2016f..fd9f3310 100644 --- a/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp +++ b/compiler/lib/KernelGen/BareMetal/ConvKernel.cpp @@ -129,31 +129,36 @@ std::string get_dst_foramt(const std::string& filter_format) { } // namespace std::string ConvImpl::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - extra_ss << "_" << SymbolHelper::gen_io_str(ctx); - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + extra_ss << "_" << SymbolHelper::gen_io_str(ctx); + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + std::string name_temp = + "kernel_conv2d_${kernel_h}x${kernel_w}_${format}_${sparse}_p$" + "{pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${dilate_h}x${" + "dilate_w}" + "${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add("format", get_format(ctx)) + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "kernel_conv2d"; } - std::string name_temp = - "kernel_conv2d_${kernel_h}x${kernel_w}_${format}_${sparse}_p$" - "{pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${dilate_h}x${dilate_w}" - "${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add("format", get_format(ctx)) - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - .add("extra", extra_ss.str()) - .render(name_temp); } std::string ConvGeneral::GetKernelBody(TContext* context) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h index f09484f2..5b27175a 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/ConvKernel.h @@ -14,6 +14,8 @@ #include "Utils/StringTemplate.h" #include "Utils/SymbolHelper.h" #include "Winograd/WinogradF23Strategy4x8MK4.h" +#include "Winograd/WinogradF43Strategy4x16MK4.h" +#include "Winograd/WinogradF63Strategy4x16MK4.h" #include "compiler/KernelGen/KernelGen.h" namespace megcc { @@ -129,6 +131,41 @@ class Conv1x1FloatMk4 : public GIConvImpl { std::shared_ptr GetInnerCtx(TContext* ctx) const; MatmulM4N12MK4Kernel m_inner_gemm; }; +class WinogradFloatF43Nchw44 : public GIConvImpl { + mutable WinogradFrameNchw44 m_framework; + mutable WinogradF43Strategy4x16MK4 m_winograd_strategy; + +public: + bool IsAvailable(TContext* context) const override; + //! kernel gen + std::string GetKernelBody(TContext* context) const override; + //! 
init gen + std::string GetInitBody(TContext* context) const override; + std::string GetWorkspaceBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; + + std::string GetKernelSymbol(TContext* context) const override; +}; + +class WinogradFloatF63Nchw44 : public GIConvImpl { + mutable WinogradFrameNchw44 m_framework; + mutable WinogradF63Strategy4x16MK4 m_winograd_strategy; + +public: + bool IsAvailable(TContext* context) const override; + //! kernel gen + std::string GetKernelBody(TContext* context) const override; + //! init gen + std::string GetInitBody(TContext* context) const override; + std::string GetWorkspaceBody(TContext* context) const override; + + std::vector GetDependInternalSymbol( + TContext* context) const override; + + std::string GetKernelSymbol(TContext* context) const override; +}; } // namespace GeneralIntrinsic } // namespace KernelGen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp index eecacb7b..f5cdc8fc 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Conv1x1Mk4M4N12.cpp @@ -46,30 +46,34 @@ bool Conv1x1FloatMk4::IsAvailable(TContext* ctx) const { std::string Conv1x1FloatMk4::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + std::string name_temp = + "GI_kernel_conv2d_conv1x1_${format}_${kernel_h}x${kernel_w}_${" + "sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" + "dilate_h}x${dilate_w}${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add_ctx_str("format") + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "GI_kernel_conv2d_conv1x1"; } - std::string name_temp = - "GI_kernel_conv2d_conv1x1_${format}_${kernel_h}x${kernel_w}_${" - "sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" - "dilate_h}x${dilate_w}${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add_ctx_str("format") - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - .add("extra", extra_ss.str()) - .render(name_temp); } std::string Conv1x1FloatMk4::GetInitBody(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp index 86fba8ad..945e35c2 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32ConvNchwNchw44.cpp @@ -46,19 +46,25 @@ bool ConvFloatNCHWNCHW44::IsAvailable(TContext* ctx) const { return param_value_ok && param_mode_ok 
&& type_ok && noline_ok && layout_ok; } std::string ConvFloatNCHWNCHW44::GetKernelSymbol(TContext* ctx) const { - auto src_tensor = ctx->getAttrOprand("operand:0"); - CC_ASSERT(src_tensor.shape.size() > 0) - << "src_tensor size should > 0, now" << src_tensor.shape.size(); - uint32_t ic = src_tensor.shape[1]; - auto dst_tensor = ctx->getAttrOprand( - "operand:" + std::to_string(ctx->getAttrInt("nr_operands") - 1)); - uint32_t oc = dst_tensor.shape[1] * 4; - std::string name_temp = "${base_kernel_sym}_nchw_nchw44_oc${oc}_ic${ic}"; - return StringTemplate::StringTemplateArgs(ctx) - .add("base_kernel_sym", GIConvImpl::GetKernelSymbol(ctx)) - .add("oc", oc) - .add("ic", ic) - .render(name_temp); + if (ctx) { + auto src_tensor = ctx->getAttrOprand("operand:0"); + CC_ASSERT(src_tensor.shape.size() > 0) + << "src_tensor size should > 0, now" << src_tensor.shape.size(); + uint32_t ic = src_tensor.shape[1]; + auto dst_tensor = ctx->getAttrOprand( + "operand:" + + std::to_string(ctx->getAttrInt("nr_operands") - 1)); + uint32_t oc = dst_tensor.shape[1] * 4; + std::string name_temp = + "${base_kernel_sym}_nchw_nchw44_oc${oc}_ic${ic}"; + return StringTemplate::StringTemplateArgs(ctx) + .add("base_kernel_sym", GIConvImpl::GetKernelSymbol(ctx)) + .add("oc", oc) + .add("ic", ic) + .render(name_temp); + } else { + return "GI_kernel_conv2d_nchw_nchw44"; + } } std::string ConvFloatNCHWNCHW44::GetInitBody(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp index 1d82f1df..3fc6391c 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32Im2col.cpp @@ -66,31 +66,36 @@ std::string gen_im2col(TContext* ctx, TContext* inner_ctx) { } // namespace std::string ConvIm2colFloat::GetKernelSymbol(TContext* ctx) const { std::stringstream extra_ss; - if (is_bias(ctx)) { - extra_ss << "_bias"; - } - if (ctx->haveAttr("nonlineMode") && - ctx->getAttrStr("nonlineMode") != "IDENTITY") { - extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + if (ctx) { + if (is_bias(ctx)) { + extra_ss << "_bias"; + } + if (ctx->haveAttr("nonlineMode") && + ctx->getAttrStr("nonlineMode") != "IDENTITY") { + extra_ss << "_" << ctx->getAttrStr("nonlineMode"); + } + extra_ss << ctx->getAttrOprand("operand:0").dtype; + std::string name_temp = + "GI_kernel_conv2d_im2col_${kernel_h}x${kernel_w}_${" + "format}_${sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_" + "d${" + "dilate_h}x${dilate_w}${extra}"; + return StringTemplate::StringTemplateArgs(ctx) + .add_ctx_int("kernel_h") + .add_ctx_int("kernel_w") + .add_ctx_str("format") + .add_ctx_str("sparse") + .add_ctx_int("pad_h") + .add_ctx_int("pad_w") + .add_ctx_int("stride_h") + .add_ctx_int("stride_w") + .add_ctx_int("dilate_h") + .add_ctx_int("dilate_w") + .add("extra", extra_ss.str()) + .render(name_temp); + } else { + return "GI_kernel_conv2d_im2col"; } - extra_ss << ctx->getAttrOprand("operand:0").dtype; - std::string name_temp = - "GI_kernel_conv2d_im2col_${kernel_h}x${kernel_w}_${" - "format}_${sparse}_p${pad_h}x${pad_w}_s${stride_h}x${stride_w}_d${" - "dilate_h}x${dilate_w}${extra}"; - return StringTemplate::StringTemplateArgs(ctx) - .add_ctx_int("kernel_h") - .add_ctx_int("kernel_w") - .add_ctx_str("format") - .add_ctx_str("sparse") - .add_ctx_int("pad_h") - .add_ctx_int("pad_w") - .add_ctx_int("stride_h") - .add_ctx_int("stride_w") - .add_ctx_int("dilate_h") - .add_ctx_int("dilate_w") - 
.add("extra", extra_ss.str()) - .render(name_temp); } bool ConvIm2colFloat::IsAvailable(TContext* ctx) const { diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp index 961afcfa..f482b13f 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Fp32WinogradNchw44.cpp @@ -8,6 +8,7 @@ */ #include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" #include "ConvKernel.h" #include "GeneralIntrinsic/Activation.h" #include "GeneralIntrinsic/InternalKernel/InternalKernel.h" @@ -90,4 +91,152 @@ std::string WinogradFloatF23NCHW44::GetKernelSymbol(TContext* context) const { auto symbol = GIConvImpl::GetKernelSymbol(context); return symbol + "_winograd_f23"; } + +bool WinogradFloatF43Nchw44::IsAvailable(TContext* ctx) const { + bool param_value_ok = + ctx->getAttrUInt("kernel_h") == 3 && + ctx->getAttrUInt("kernel_w") == 3 && + ctx->getAttrUInt("stride_h") == ctx->getAttrUInt("stride_w") && + ctx->getAttrUInt("stride_h") == 1 && + ctx->getAttrUInt("dilate_h") == 1 && + ctx->getAttrUInt("dilate_w") == 1; + + bool param_mode_ok = ctx->getAttrStr("sparse") == "DENSE" && + ctx->getAttrStr("format") == "NCHW44" && + ctx->getAttrStr("mode") == "CROSS_CORRELATION"; + + bool noline_ok = !ctx->haveAttr("nonlineMode") || + ctx->getAttrStr("nonlineMode") == "IDENTITY" || + ctx->getAttrStr("nonlineMode") == "RELU" || + ctx->getAttrStr("nonlineMode") == "H_SWISH"; + + bool type_ok = ctx->getAttrInt("nr_operands") >= 3 && + ctx->getAttrOprand("operand:0").dtype == "f32" && + ctx->getAttrOprand("operand:1").dtype == "f32" && + ctx->getAttrOprand("operand:2").dtype == "f32"; + + bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && + ctx->getAttrOprand("operand:0").shape[4] == 4; + + return param_value_ok && param_mode_ok && type_ok && noline_ok && layout_ok; +} + +std::string WinogradFloatF43Nchw44::GetInitBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include\"gi_float.h\"\n"; + writer << "#include\n"; + writer << "\n\n"; + writer << GenCommonRet() << " " << GetInitSignature(ctx) << "{\n"; + writer << m_framework.GenInitCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF43Nchw44::GetWorkspaceBody(TContext* ctx) const { + std::stringstream writer; + writer << GenCommonRet() << " " << GetWorkspaceSignature(ctx) << "{\n"; + writer << m_framework.GenGetWorkSpaceCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF43Nchw44::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include \"gi_float.h\""; + writer << "\n\n"; + writer << "extern " + << Arm64::MatmulM4N16MK4Kernel().GetKernelSignature(nullptr) + << ";\n"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + writer << m_framework.GenKernelBodyCode(ctx, &m_winograd_strategy); + writer << "return TinyNN_SUCCESS;\n}"; + return writer.str(); +} + +std::vector WinogradFloatF43Nchw44::GetDependInternalSymbol( + TContext*) const { + auto matmul = Arm64::MatmulM4N16MK4Kernel(); + return {{matmul.GetKernelSymbol(nullptr), matmul.GetKernelBody(nullptr), + matmul.GetBodyGuardBegin(nullptr), matmul.GetBodyGuardEnd(nullptr), + matmul.GetDependInternalSymbol(nullptr)}}; +} + +std::string WinogradFloatF43Nchw44::GetKernelSymbol(TContext* context) const { + auto symbol = 
GIConvImpl::GetKernelSymbol(context); + return symbol + "_winograd_f43"; +} + +bool WinogradFloatF63Nchw44::IsAvailable(TContext* ctx) const { + bool param_value_ok = + ctx->getAttrUInt("kernel_h") == 3 && + ctx->getAttrUInt("kernel_w") == 3 && + ctx->getAttrUInt("stride_h") == ctx->getAttrUInt("stride_w") && + ctx->getAttrUInt("stride_h") == 1 && + ctx->getAttrUInt("dilate_h") == 1 && + ctx->getAttrUInt("dilate_w") == 1; + + bool param_mode_ok = ctx->getAttrStr("sparse") == "DENSE" && + ctx->getAttrStr("format") == "NCHW44" && + ctx->getAttrStr("mode") == "CROSS_CORRELATION"; + + bool noline_ok = !ctx->haveAttr("nonlineMode") || + ctx->getAttrStr("nonlineMode") == "IDENTITY" || + ctx->getAttrStr("nonlineMode") == "RELU" || + ctx->getAttrStr("nonlineMode") == "H_SWISH"; + + bool type_ok = ctx->getAttrInt("nr_operands") >= 3 && + ctx->getAttrOprand("operand:0").dtype == "f32" && + ctx->getAttrOprand("operand:1").dtype == "f32" && + ctx->getAttrOprand("operand:2").dtype == "f32"; + + bool layout_ok = ctx->getAttrOprand("operand:0").shape.size() == 5 && + ctx->getAttrOprand("operand:0").shape[4] == 4; + + return param_value_ok && param_mode_ok && type_ok && noline_ok && layout_ok; +} + +std::string WinogradFloatF63Nchw44::GetInitBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include\"gi_float.h\"\n"; + writer << "#include\n"; + writer << "\n\n"; + writer << GenCommonRet() << " " << GetInitSignature(ctx) << "{\n"; + writer << m_framework.GenInitCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF63Nchw44::GetWorkspaceBody(TContext* ctx) const { + std::stringstream writer; + writer << GenCommonRet() << " " << GetWorkspaceSignature(ctx) << "{\n"; + writer << m_framework.GenGetWorkSpaceCode(ctx, &m_winograd_strategy); + writer << "\n}"; + return writer.str(); +} + +std::string WinogradFloatF63Nchw44::GetKernelBody(TContext* ctx) const { + std::stringstream writer; + writer << "#include \"gi_float.h\""; + writer << "\n\n"; + writer << "extern " + << Arm64::MatmulM4N16MK4Kernel().GetKernelSignature(nullptr) + << ";\n"; + writer << GenCommonRet() << " " << GetKernelSignature(ctx) << "{\n"; + writer << m_framework.GenKernelBodyCode(ctx, &m_winograd_strategy); + writer << "return TinyNN_SUCCESS;\n}"; + return writer.str(); +} + +std::vector WinogradFloatF63Nchw44::GetDependInternalSymbol( + TContext*) const { + auto matmul = Arm64::MatmulM4N16MK4Kernel(); + return {{matmul.GetKernelSymbol(nullptr), matmul.GetKernelBody(nullptr), + matmul.GetBodyGuardBegin(nullptr), matmul.GetBodyGuardEnd(nullptr), + matmul.GetDependInternalSymbol(nullptr)}}; +} + +std::string WinogradFloatF63Nchw44::GetKernelSymbol(TContext* context) const { + auto symbol = GIConvImpl::GetKernelSymbol(context); + return symbol + "_winograd_f63"; +} // vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp index 26f1523d..dae4bd6a 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp @@ -22,7 +22,8 @@ using namespace GeneralIntrinsic; std::string WinogradFrameNchw44::GenGetWorkSpaceCode( TContext* context, WinogradStrategyBase* strategy) { CC_ASSERT(context->getAttrStr("format") == "NCHW44") - << "format mismatch now: "<< context->getAttrStr("format") << ", expect: NCHW44\n"; + << "format mismatch 
now: " << context->getAttrStr("format") + << ", expect: NCHW44\n"; auto WeightShape = context->getAttrOprand("operand:1").shape; std::stringstream ss; std::string workspace_temp = R"({ @@ -55,7 +56,12 @@ std::string WinogradFrameNchw44::GenGetWorkSpaceCode( Alpha * Alpha * OC * ${tile_per_loop} * sizeof(float); output_transform_buf_size = (output_transform_buf_size + Align -1) / Align * Align; - *workspace = input_transform_buf_size + output_transform_buf_size; + + size_t transform_mid_buf_size = 2 * Alpha * Alpha * sizeof(float) * + PACK_C_SIZE; + transform_mid_buf_size = (transform_mid_buf_size + Align -1) / Align * Align; + *workspace = input_transform_buf_size + output_transform_buf_size + + transform_mid_buf_size; return TinyNN_SUCCESS; })"; ss << StringTemplate::StringTemplateArgs() @@ -177,10 +183,18 @@ std::string WinogradFrameNchw44::GenKernelBodyCode( Alpha * Alpha * IC * nr_tiles_per_loop * sizeof(float); input_transform_buf_size = (input_transform_buf_size + Align -1) / Align * Align; + + size_t output_transform_buf_size = + Alpha * Alpha * OC * nr_tiles_per_loop * sizeof(float); + output_transform_buf_size = + (output_transform_buf_size + Align -1) / Align * Align; float* transform_input_ptr = workspace->ptr; float* transform_output_ptr = transform_input_ptr + input_transform_buf_size / sizeof(float); + + float* transform_mid_ptr = transform_output_ptr + + output_transform_buf_size / sizeof(float); const float* input_ptr = input->ptr; const float* weight_ptr = weight->ptr; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h index 58223146..488010d3 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h @@ -63,7 +63,7 @@ class WinogradStrategyBase { }; class WinogradFrameNchw44 { - uint32_t m_tile_per_loop = 24; + uint32_t m_tile_per_loop = 32; public: //! 
gen init code diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp index 0cf02cb8..a2564b21 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.cpp @@ -356,7 +356,7 @@ std::string WinogradF23Strategy4x8MK4::OutputFeatureTrans( MULTI_TWO(1); #undef MULTI_TWO - if (bias_ptr) { + if (bias_ptr_) { GI_FLOAT32_t vbias = GiLoadFloat32(bptr + oc); dst_v[0][0]= GiFloat32Type2FixLenType(GiAddFloat32(GiFixLenType2GiFloat32Type(dst_v[0][0]), vbias)); dst_v[0][1]= GiFloat32Type2FixLenType(GiAddFloat32(GiFixLenType2GiFloat32Type(dst_v[0][1]), vbias)); diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h index 354a816b..2381fa79 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h @@ -1,6 +1,6 @@ /** * \file - * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/Winograd_strategy_4x16_mk4.h + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF23Strategy4x8MK4.h * * This file is part of MegCC, a deep learning compiler developed by Megvii. * diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp index fe477774..2716b9cc 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/Fp32M4N12K4Matmul.cpp @@ -20,7 +20,7 @@ static inline std::pair gen_postprocess_inline( TContext* ctx, bool need_postprocess = true) { std::string call_str; std::stringstream declare_ss; - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; if ((nonline_mode == "SIGMOID") && need_postprocess) { @@ -206,11 +206,13 @@ static inline void transpose_1x4_4_s(const float* inptr0, float* outptr) { } static std::string kern_4x12(TContext* ctx) { - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; auto activation_gen = create_activation_gener_instrinsic(nonline_mode); - bool with_bias = ctx->getAttrBool("with_bias"); + bool with_bias = ctx && ctx->getAttrBool("with_bias") + ? ctx->getAttrBool("with_bias") + : false; std::stringstream writer; writer << R"( static inline void kern_4x12_bias_relu(const float* packA, const float* packB, int K, @@ -437,11 +439,13 @@ static inline void kern_4x12_bias_relu(const float* packA, const float* packB, i } static std::string kern_4x4(TContext* ctx) { - auto nonline_mode = ctx->haveAttr("nonlineMode") + auto nonline_mode = ctx && ctx->haveAttr("nonlineMode") ? ctx->getAttrStr("nonlineMode") : "IDENTITY"; auto activation_gen = create_activation_gener_instrinsic(nonline_mode); - bool with_bias = ctx->getAttrBool("with_bias"); + bool with_bias = ctx && ctx->getAttrBool("with_bias") + ? 
ctx->getAttrBool("with_bias") + : false; std::stringstream writer; writer << R"( static inline void kern_4x4_bias_relu(const float* packA, const float* packB, int K, @@ -696,10 +700,10 @@ std::string gen_kernel(const std::string& sig, TContext* ctx, std::string MatmulM4N12MK4Kernel::GetKernelSymbol(TContext* ctx) const { std::stringstream ss; ss << "GI_fp32_m4_n12_k4_matmul"; - if (ctx->getAttrBool("with_bias")) { + if (ctx && ctx->getAttrBool("with_bias")) { ss << "_bias"; } - if (ctx->haveAttr("nonlineMode") && + if (ctx && ctx->haveAttr("nonlineMode") && ctx->getAttrStr("nonlineMode") != "IDENTITY") { ss << "_" << ctx->getAttrStr("nonlineMode"); } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp index 0506fdb6..c96b3127 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.cpp @@ -20,7 +20,7 @@ const std::string MatmulInternal::m_workspace_call = "(int y0, int ymax, int x0, int xmax, int k0, int kmax)"; std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return R"((const float* pack_a, const float* pack_b, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr))"; @@ -37,7 +37,7 @@ std::string MatmulInternal::GenNakedKernelCall(TContext* ctx) { } std::string MatmulInternal::GenKernelCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return R"((const float* A, size_t LDA, const float* B, size_t LDB, float* C, size_t LDC, size_t M, size_t N, size_t K, const float* bias_ptr, void* workspace))"; @@ -54,7 +54,7 @@ std::string MatmulInternal::GenKernelCall(TContext* ctx) { } std::string MatmulInternal::GenPackACall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return "(float* outptr, const float* inptr, int ldin, int y0, int " "ymax, int k0, int kmax)"; @@ -68,7 +68,7 @@ std::string MatmulInternal::GenPackACall(TContext* ctx) { } std::string MatmulInternal::GenPackBCall(TContext* ctx) { - auto dtype = ctx->getAttrStr("dtype"); + auto dtype = ctx ? ctx->getAttrStr("dtype") : "f32"; if (Utils::is_float_dtype(dtype)) { return "(float* outptr, const float* inptr, int ldin, int x0, int " "xmax, int k0, int kmax)"; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h index f2df656b..8ce3bca8 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/InternalKernel/InternalKernel.h @@ -22,12 +22,12 @@ class MatmulInternal : public InternalKernelFunc { return "void " + GetKernelSymbol(ctx) + GenKernelCall(ctx); } virtual std::string GetPackASymbol(TContext* ctx) const { - bool trans_a = ctx->getAttrBool("transposeA"); + bool trans_a = ctx && ctx->getAttrBool("transposeA") ? true : false; std::string suffix = trans_a ? "t" : "n"; return GetKernelSymbol(ctx) + "_packa_" + suffix; } virtual std::string GetPackBSymbol(TContext* ctx) const { - bool trans_b = ctx->getAttrBool("transposeB"); + bool trans_b = ctx && ctx->getAttrBool("transposeB") ? 
true : false; std::string suffix = trans_b ? "t" : "n"; return GetKernelSymbol(ctx) + "_packb_" + suffix; } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp index 41856a1c..18c0bdaa 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/KernelPack.cpp @@ -46,11 +46,15 @@ struct AllGICommonKernel { std::make_shared(), std::make_shared()}; inner_map[KernelPack::KernType::ConvKernel] = { + std::make_shared(), std::make_shared(), std::make_shared(), - std::make_shared(), + std::make_shared(), + std::make_shared(), std::make_shared(), - std::make_shared()}; + std::make_shared(), + + }; inner_map[KernelPack::KernType::PoolingKernel] = { std::make_shared(), diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index 20e68428..cd7978b9 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -6,7 +6,7 @@ * * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. */ - +#include #include "Arm/Arm64/KernelPack.h" #include "Arm/ArmCommon/KernelPack.h" #include "Arm/Armv7/KernelPack.h" @@ -32,29 +32,51 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { //! arm64v7 is used by tinycv, nn opr should be armv64 or armv7, not arm64v7 auto deduce_func = GetDeduceLayout(kernel_type); if (arch == Arch::ARM64 || arch == Arch::ARM64V7) { + auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); + auto armcommon_kerns = + ArmCommon::ArchKernelPack::GetKernel(kernel_type); + auto gi_kerns = + GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); if (kernel_type == KernelPack::KernType::MatrixMulKernel) { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); armcommon_kerns.insert(armcommon_kerns.end(), a64_kerns.begin(), a64_kerns.end()); armcommon_kerns.insert(armcommon_kerns.end(), gi_kerns.begin(), gi_kerns.end()); return {armcommon_kerns, deduce_func}; + } + + std::vector valid_kern; + if (kernel_type == KernelPack::KernType::ConvKernel) { + std::vector sorted_kern(2); + for (auto&& kern : gi_kerns) { + auto kern_sym = kern->GetKernelSymbol(nullptr); + auto is_f63 = std::regex_match( + kern_sym, std::regex("^GI.*_winograd_f63.*")); + auto is_f43 = std::regex_match( + kern_sym, std::regex("^GI.*_winograd_f43.*")); + auto if_match = is_f63 || is_f43; + if (!if_match) { + valid_kern.push_back(kern); + } else { + if (is_f43) { + sorted_kern[0] = kern; + } else { + sorted_kern[1] = kern; + } + } + } + //! 
WARNING: the f63 and f43 must exist in GI kernel + a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), + sorted_kern.end()); } else { - auto a64_kerns = Arm64::ArchKernelPack::GetKernel(kernel_type); - auto armcommon_kerns = - ArmCommon::ArchKernelPack::GetKernel(kernel_type); - auto gi_kerns = - GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); - a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), - armcommon_kerns.end()); - a64_kerns.insert(a64_kerns.end(), gi_kerns.begin(), gi_kerns.end()); - return {a64_kerns, deduce_func}; + valid_kern = gi_kerns; } + a64_kerns.insert(a64_kerns.end(), armcommon_kerns.begin(), + armcommon_kerns.end()); + a64_kerns.insert(a64_kerns.end(), valid_kern.begin(), valid_kern.end()); + return {a64_kerns, deduce_func}; + } else if (arch == Arch::ARMV7) { auto a32_kerns = Armv7::ArchKernelPack::GetKernel(kernel_type); @@ -77,10 +99,30 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { #endif else { CC_ASSERT(arch == Arch::BAREMETAL); + //! FIXME: the f43 f63 winograd matmul is using arm64 asm kernel, it is + //! invalid for barmetal auto gi_kerns = GeneralIntrinsic::ArchKernelPack::GetKernel(kernel_type); + std::vector valid_kern; + if (kernel_type == KernelPack::KernType::ConvKernel) { + for (auto&& kern : gi_kerns) { + auto kern_sym = kern->GetKernelSymbol(nullptr); + auto if_match = + std::regex_match(kern_sym, + std::regex("^GI.*_winograd_f63.*")) || + std::regex_match(kern_sym, + std::regex("^GI.*_winograd_f43.*")); + if (!if_match) { + valid_kern.push_back(kern); + } + } + } else { + valid_kern = gi_kerns; + } + auto naive_impl = BareMetal::ArchKernelPack::GetKernel(kernel_type); - naive_impl.insert(naive_impl.begin(), gi_kerns.begin(), gi_kerns.end()); + naive_impl.insert(naive_impl.begin(), valid_kern.begin(), + valid_kern.end()); return {naive_impl, deduce_func}; } } diff --git a/immigration/include/marm_neon.h b/immigration/include/marm_neon.h index 375e9617..f8327103 100644 --- a/immigration/include/marm_neon.h +++ b/immigration/include/marm_neon.h @@ -627,7 +627,7 @@ __ai float32x4_t vfmsq_laneq_f32_impl_3(float32x4_t a, float32x4_t b, #undef vfmaq_laneq_f32 #define vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32_impl_##lane(a, b, v) #undef vfmsq_laneq_f32 -#define vfmsq_laneq_f32(a, b, v, lane) Vfmsq_laneq_f32_impl_##lane(a, b, v) +#define vfmsq_laneq_f32(a, b, v, lane) vfmsq_laneq_f32_impl_##lane(a, b, v) #endif From befea7ec9c9d3d17417949a26cbbc82d4fcbca1a Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 10:45:37 +0800 Subject: [PATCH 12/17] feat(compiler): add f43 winograd for arm64 kernel --- .../Winograd/WinogradF43Strategy4x16MK4.cpp | 1066 +++++++++++++++++ .../Winograd/WinogradF43Strategy4x16MK4.h | 36 + 2 files changed, 1102 insertions(+) create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp new file mode 100644 index 00000000..becbb458 --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp @@ -0,0 +1,1066 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp + * + * This file is part of MegCC, a deep learning 
compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ + +#include "WinogradF43Strategy4x16MK4.h" +#include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" +#include "GeneralIntrinsic/Activation.h" +#include "GeneralIntrinsic/ConvKernel/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace megcc; +using namespace KernelGen; +using namespace GeneralIntrinsic; + +std::string WinogradF43Strategy4x16MK4::WeightTrans( + const std::vector& strs) { + auto inptr = strs[0]; + auto outptr = strs[1]; + auto OC = strs[2]; + auto IC = strs[3]; + std::string filter_process = R"( + const uint32_t PACK_C_SIZE= 4; + const uint32_t KERNEL_SIZE = 3; + const uint32_t alpha = 4 + 3 - 1; + size_t OCB = ${OC} / PACK_C_SIZE; + size_t ICB = ${IC} / PACK_C_SIZE; + + for (size_t ocb = 0; ocb < OCB; ocb++) { + for (size_t icb = 0; icb < ICB; icb++) { + for (size_t ic_inner = 0; ic_inner < PACK_C_SIZE; ic_inner++) { + const float* fptr = ${filter} + (ocb * ICB + icb) * KERNEL_SIZE * + KERNEL_SIZE * PACK_C_SIZE * PACK_C_SIZE + + ic_inner * PACK_C_SIZE; + //! read 4OC 1IC filter + GI_FLOAT32_t g00 = GiLoadFloat32(fptr + 0* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g01 = GiLoadFloat32(fptr + 1* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g02 = GiLoadFloat32(fptr + 2* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g10 = GiLoadFloat32(fptr + 3* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g11 = GiLoadFloat32(fptr + 4* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g12 = GiLoadFloat32(fptr + 5* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g20 = GiLoadFloat32(fptr + 6* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g21 = GiLoadFloat32(fptr + 7* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g22 = GiLoadFloat32(fptr + 8* PACK_C_SIZE * PACK_C_SIZE); + + //! twice matmul + GI_FLOAT32_t tmp0, tmp1; + ${FilterTransUnroll(3, midle, g, tmp0, tmp1)} + ${FilterTransUnroll(6, ret, midle, tmp0, tmp1)} + + //! 
write to the dst + float* dst = ${outptr}; + ${StoreRet2D(6, 6, ret)}; + } + } + })"; + auto FilterTransUnroll = [](const std::vector& strs) { + int times = std::stoi(strs[0]); + std::string dst = strs[1]; + std::string src = strs[2]; + std::string tmp0 = strs[3]; + std::string tmp1 = strs[4]; + std::stringstream ss; + for (int i = 0; i < times; i++) { + ss << "GI_FLOAT32_t " << dst << i << "0 = GiMultiplyScalerFloat32(" + << src << "0" << i << ", 0.25f);\n"; + ss << tmp0 << " = GiMultiplyScalerFloat32(GiAddFloat32(" << src + << "0" << i << ", " << src << "2" << i << "), (-1.0/6));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", (-1.0/6));\n"; + ss << "GI_FLOAT32_t " << dst << i << "1 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "2 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 1.0/24), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 1.0/6));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 1.0/12);\n"; + ss << "GI_FLOAT32_t " << dst << i << "3 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "4 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "5 = " << src << "2" << i + << ";\n"; + } + return ss.str(); + }; + + auto StoreRet2D = [](const std::vector& strs) { + int times_out = std::stoi(strs[0]); + int times_inner = std::stoi(strs[1]); + std::string src = strs[2]; + std::stringstream ss; + for (int out = 0; out < times_out; out++) { + for (int inner = 0; inner < times_inner; inner++) { + ss << "GiStoreFloat32(dst + (" << out << " * alpha + " << inner + << ") * OCB * ICB * PACK_C_SIZE * PACK_C_SIZE + ocb * ICB * " + "PACK_C_SIZE *PACK_C_SIZE + icb* PACK_C_SIZE * " + "PACK_C_SIZE + " + "ic_inner*PACK_C_SIZE, " + << src << out << inner << ");\n"; + } + } + return ss.str(); + }; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("StoreRet2D", StoreRet2D) + .add("FilterTransUnroll", FilterTransUnroll) + .add("OC", OC) + .add("IC", IC) + .add("filter", inptr) + .add("outptr", outptr) + .render(filter_process); + return ss.str(); +} + +std::string WinogradF43Strategy4x16MK4::InputFeatureTrans( + const std::vector& strs) { + auto InputTransformF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + size_t ICB = IC_ / PACK_C_SIZE; + size_t icb = ic / PACK_C_SIZE; + + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = input_parameters; + #else + GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters); + #endif + int base_offset= ic * IH_ * IW_ + ih_start * IW_ * 4 + iw_start * 4; + const float* input_ptr_ =source; + + GI_FLOAT32_t zero = GiZeroFloat32(); + GI_FLOAT32_t d00, d01, d02, d03, d04, d05; + GI_FLOAT32_t d10, d11, d12, d13, d14, d15; + GI_FLOAT32_t d20, d21, d22, d23, d24, d25; + GI_FLOAT32_t d30, d31, d32, d33, d34, d35; + +#define cb(i) GI_FLOAT32_t t##i; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 0 -> d00 ... d05 + int offset = base_offset; + const float* line_ptr = input_ptr_+ offset; + if (inner) { +#define cb(i) d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[0] == 1) { +#define cb(i) d0##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d0##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 4 -> d30 ... t35 + offset = base_offset + 4 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d3##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[4] == 1 ) { +#define cb(i) \ + d3##i = iw_valid[i] == 1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d3##i = zero; \ + t##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 2 -> d20 ... d25 + offset = base_offset + 2 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d2##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[2] == 1 ) { +#define cb(i) \ + d2##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d2##i = zero; \ + t##i = MSUB(t##i, d2##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 3 -> d10 ... d15 + offset = base_offset + 3 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d1##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[3] ==1 ) { +#define cb(i) d1##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d1##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + float* buf_ptr = dst + icb * nr_tiles_in_loop_ * PACK_C_SIZE + + tile_idx * PACK_C_SIZE; + + d00 = MADD(t4, t0, v0, 0); + d00 = MSUB(d00, t2, v0, 1); + GiStoreFloat32(buf_ptr, d00); + d00 = MSUB(t3, t1, v0, 0); + d01 = MSUB(t4, t2, v0, 0); + d02 = GiAddFloat32(d00, d01); + GiStoreFloat32(buf_ptr + 1 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d02 = GiSubtractFloat32(d01, d00); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d00 = GiSubtractFloat32(t3, t1); + d01 = GiSubtractFloat32(t4, t2); + d02 = MADD(d01, d00, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d02 = MSUB(d01, d00, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + d01 = GiSubtractFloat32(t5, t3); + d02 = MSUB(d01, d00, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d02); + +// ln4 - ln2 -> t +#define cb(i) t##i = GiSubtractFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 1 -> d00 ... d05 + offset = base_offset + 1 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[1] ==1) { +#define cb(i) d0##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d0##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + +// ln4 - 4 * ln2 -> ln4 +#define cb(i) d3##i = MSUB(d3##i, d2##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// ln3 - 4 * ln1 -> ln2 +#define cb(i) d2##i = MSUB(d1##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// ln3 - ln1 -> ln3 +#define cb(i) d1##i = GiSubtractFloat32(d1##i, d0##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - 4 * ln2)[ln4] + (ln3 - 4 * ln1)[ln2] -> ln1 +#define cb(i) d0##i = GiAddFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - 4 * ln2)[ln4] - (ln3 - 4 * ln1)[ln2] -> ln2 +#define cb(i) d2##i = GiSubtractFloat32(d3##i, d2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // ln4(d30 ... d35) is free until now + buf_ptr = dst + 1 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + d30 = MADD(d04, d00, v0, 0); + d30 = MSUB(d30, d02, v0, 1); + GiStoreFloat32(buf_ptr, d30); + d30 = MSUB(d03, d01, v0, 0); + d32 = MSUB(d04, d02, v0, 0); + d31 = GiAddFloat32(d30, d32); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d31); + d31 = GiSubtractFloat32(d32, d30); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d31); + d30 = GiSubtractFloat32(d03, d01); + d31 = GiSubtractFloat32(d04, d02); + d32 = MADD(d31, d30, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + d32 = MSUB(d31, d30, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + d31 = GiSubtractFloat32(d05, d03); + d32 = MSUB(d31, d30, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d32); + + buf_ptr = dst + 2 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + d33 = MADD(d24, d20, v0, 0); + d33 = MSUB(d33, d22, v0, 1); + GiStoreFloat32(buf_ptr, d33); + d33 = MSUB(d23, d21, v0, 0); + d35 = MSUB(d24, d22, v0, 0); + d34 = GiAddFloat32(d33, d35); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d34); + d34 = GiSubtractFloat32(d35, d33); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d34); + d33 = GiSubtractFloat32(d23, d21); + d34 = GiSubtractFloat32(d24, d22); + d35 = MADD(d34, d33, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + d35 = MSUB(d34, d33, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + d34 = GiSubtractFloat32(d25, d23); + d35 = MSUB(d34, d33, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, d35); + +// (ln4 - ln2)[t] + (ln3 - ln1)[ln3] * 2 -> ln4 +#define cb(i) d3##i = MADD(t##i, d1##i, v0, 2); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + +// (ln4 - ln2)[t] - (ln3 - ln1)[ln3] * 2 -> ln3 +#define cb(i) d1##i = MSUB(t##i, d1##i, v0, 2); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + // t is free + buf_ptr = dst + 3 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t0 = MADD(d34, d30, v0, 0); + t0 = MSUB(t0, d32, v0, 1); + GiStoreFloat32(buf_ptr, t0); + t0 = MSUB(d33, d31, v0, 0); + t2 = MSUB(d34, d32, v0, 0); + t1 = GiAddFloat32(t0, t2); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t1 = GiSubtractFloat32(t2, t0); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ 
* PACK_C_SIZE, t1); + t0 = GiSubtractFloat32(d33, d31); + t1 = GiSubtractFloat32(d34, d32); + t2 = MADD(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t2 = MSUB(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t1 = GiSubtractFloat32(d35, d33); + t2 = MSUB(t1, t0, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + + buf_ptr = dst + 4 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t3 = MADD(d14, d10, v0, 0); + t3 = MSUB(t3, d12, v0, 1); + GiStoreFloat32(buf_ptr, t3); + t3 = MSUB(d13, d11, v0, 0); + t5 = MSUB(d14, d12, v0, 0); + t4 = GiAddFloat32(t3, t5); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t4); + t4 = GiSubtractFloat32(t5, t3); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t4); + t3 = GiSubtractFloat32(d13, d11); + t4 = GiSubtractFloat32(d14, d12); + t5 = MADD(t4, t3, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + t5 = MSUB(t4, t3, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + t4 = GiSubtractFloat32(d15, d13); + t5 = MSUB(t4, t3, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t5); + + // load line 5 -> d30 ... d35 + offset = base_offset + 5 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) d3##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[5] == 1) { +#define cb(i) d3##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) d3##i = zero; + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 1 -> d0 ... d5 + offset = base_offset + 1 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d0##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[1] ==1) { +#define cb(i) \ + d0##i = iw_valid[i] ==1 ? GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d0##i = zero; \ + d3##i = MADD(d3##i, d0##i, v0, 0); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + // load line 3 -> d10 ... d15 + offset = base_offset + 3 * IW_ * 4; + line_ptr = input_ptr_ + offset; + if (inner) { +#define cb(i) \ + d1##i = GiLoadFloat32(line_ptr + i * PACK_C_SIZE); \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { + if (ih_valid[3] == 1 ) { +#define cb(i) \ + d1##i = iw_valid[i] ==1 ? 
GiLoadFloat32(line_ptr + i * PACK_C_SIZE) : zero; \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } else { +#define cb(i) \ + d1##i = zero; \ + d3##i = MSUB(d3##i, d1##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + } + } + + buf_ptr = dst + 5 * Alpha * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE; + t0 = MADD(d34, d30, v0, 0); + t0 = MSUB(t0, d32, v0, 1); + GiStoreFloat32(buf_ptr, t0); + t0 = MSUB(d33, d31, v0, 0); + t2 = MSUB(d34, d32, v0, 0); + t1 = GiAddFloat32(t0, t2); + GiStoreFloat32(buf_ptr + ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t1 = GiSubtractFloat32(t2, t0); + GiStoreFloat32(buf_ptr + 2 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t1); + t0 = GiSubtractFloat32(d33, d31); + t1 = GiSubtractFloat32(d34, d32); + t2 = MADD(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 3 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t2 = MSUB(t1, t0, v0, 2); + GiStoreFloat32(buf_ptr + 4 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + t1 = GiSubtractFloat32(d35, d33); + t2 = MSUB(t1, t0, v0, 0); + GiStoreFloat32(buf_ptr + 5 * ICB * nr_tiles_in_loop_ * PACK_C_SIZE, t2); + + + +)"; + return kernel; + }; + std::string input_process = R"( + const uint32_t OUTPUT_BLOCK_SIZE = 4; + const uint32_t KS = 3; + + float* dst = ${transform_input_ptr}; + const float* source = ${inptr}; + uint32_t IH_ = ${IH}; + uint32_t IW_ = ${IW}; + uint32_t IC_ = ${IC}; + uint32_t PH_ = ${PH}; + uint32_t PW_ = ${PW}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tile_id_ = ${tile_id}; + + const float input_parameters[4] = {4.0f, 5.0f, 2.0f, 0.0f}; + + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use + //! GiMultiplyAddScalarFloat32 + #define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) + #define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) + #else + #define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) + #define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) + #endif + + uint32_t OW = IW_ + 2 * PW_ - KS + 1; + uint32_t tiles_w = (OW + OUTPUT_BLOCK_SIZE -1)/ OUTPUT_BLOCK_SIZE; + int ih_valid[6]={0,0,0,0,0,0}; + int iw_valid[6]={0,0,0,0,0,0}; + + for (uint32_t ic = 0; ic < IC_; ic += 4) { + uint32_t tile_start_id = tile_id_; + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_start_id + tile_idx; + uint32_t nh = index / tiles_w; + uint32_t nw = index % tiles_w; + + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH_; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW_; + int inner = (ih_start >= 0 && iw_start >= 0 && + ih_start + Alpha <= (int)IH_ && + iw_start + Alpha <= (int)IW_)?1:0; + if(!inner){ + for (int iho = 0; iho < Alpha; ++iho) { + ih_valid[iho] = + (iho + ih_start >= 0 && + iho + ih_start < (int)IH_) ? 
1:0; + } + for (int iwo = 0; iwo < Alpha; ++iwo) { + iw_valid[iwo] = + (iwo + iw_start >= 0 && + iwo + iw_start < (int)(IW_))?1:0; + } + } + ${InputTransformF43NCHW44()} + } + })"; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("inptr", strs[0]) + .add("transform_input_ptr", strs[1]) + .add("IH", strs[2]) + .add("IW", strs[3]) + .add("IC", strs[4]) + .add("PH", strs[5]) + .add("PW", strs[6]) + .add("tile_id", strs[7]) + .add("nr_tiles_in_loop", strs[8]) + .add("InputTransformF43NCHW44", InputTransformF43NCHW44) + .render(input_process); + return ss.str(); +} + +std::string WinogradF43Strategy4x16MK4::DependMatmulSymbol() { + return Arm64::MatmulM4N16MK4Kernel().GetKernelSymbol(NULL); +} + +std::string WinogradF43Strategy4x16MK4::BatchedMatMul( + const std::vector& strs) { + std::string matmul_compute = R"( + for(uint32_t i =0; i< Alpha; i++){ + for(uint32_t j=0; j& strs, TContext* ctx) { + std::string ouput_trans = R"( + float* transform_output_ptr_ = ${transform_output_ptr}; + const float output_parameters[4] = {1.0f, 2.0f, 4.0f, 8.0f}; + float* outptr_ = ${outptr}; + const float* bias = ${bias_ptr}; + + uint32_t OH_ = ${OH}; + uint32_t OW_ = ${OW}; + uint32_t OC_ = ${OC}; + + uint32_t tile_id_ = ${tile_id}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + #if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = output_parameters; + #else + GI_FLOAT32_t v0 = GiLoadFloat32(output_parameters); + #endif + uint32_t tiles_w_ = (OW_ + OutputBlockSize -1) / OutputBlockSize; + for (uint32_t oc = 0; oc < OC_; oc += 4) { + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_id_ + tile_idx; + uint32_t nh = index / tiles_w_; + uint32_t nw = index % tiles_w_; + uint32_t oh_start = nh * OutputBlockSize; + uint32_t ow_start = nw * OutputBlockSize; + + size_t num_valid_oh =(OH_ - oh_start) < 4 ?(OH_ - oh_start) : 4; + size_t num_valid_ow = (OW_ - ow_start) < 4 ?(OW_ - ow_start) : 4; + + //! AT * m * A + size_t OCB = (OC_) / PACK_C_SIZE; + size_t ocb = oc / PACK_C_SIZE; + size_t col_step = OCB * nr_tiles_in_loop_ * 4; + size_t row_step = Alpha * col_step; + + GI_FLOAT32_t vbias = GiZeroFloat32(); + GI_FLOAT32_t v00, v01, v02, v03, v04, v05; + GI_FLOAT32_t v10, v11, v12, v13, v14, v15; + GI_FLOAT32_t v20, v21, v22, v23, v24, v25; + GI_FLOAT32_t v30, v31, v32, v33, v34, v35; + GI_FLOAT32_t v40, v41, v42, v43, v44, v45; + + if(num_valid_ow == num_valid_oh && num_valid_ow ==4){ + const float* buf_base = + transform_output_ptr_ + ocb * nr_tiles_in_loop_ * 4 + tile_idx * 4; + const float* buf_ptr = NULL; + + // load line 1 -> v10 ... v15 + buf_ptr = buf_base + row_step; + #define cb(i) v1##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 2 -> v20 ... v25 + buf_ptr = buf_base + 2 * row_step; + #define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v0##i = GiAddFloat32(v1##i, v2##i); \ + v1##i = GiSubtractFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 3 -> v30 ... v35 + buf_ptr = buf_base + 3 * row_step; + #define cb(i) v3##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + // load line 4 -> v40 ... 
v45 + buf_ptr = buf_base + 4 * row_step; + #define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v2##i = GiAddFloat32(v3##i, v4##i); \ + v3##i = GiSubtractFloat32(v3##i, v4##i); \ + v4##i = MADD(v0##i, v2##i, v0, 2); \ + v2##i = GiAddFloat32(v2##i, v0##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + ${nonline_gen_init()} + float* output_base = outptr_ + oc * OH_ * OW_ + oh_start * OW_ * PACK_C_SIZE + + ow_start * PACK_C_SIZE; + float* output_ptr = output_base + 2 * OW_ * PACK_C_SIZE; + if (bias) { + vbias = GiLoadFloat32(bias + oc); + } + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + buf_ptr = buf_base; + #define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v4##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr = output_base; + + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02,v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05,v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + #define cb(i) v4##i = MADD(v1##i, v3##i, v0, 1); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr = output_base + OW_ * PACK_C_SIZE; + + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v41, v42); + v04 = GiSubtractFloat32(v43, v44); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v45); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + + buf_ptr = buf_base + 5 * row_step; + #define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v1##i = MADD(v1##i, v3##i, v0, 3); \ + v2##i = GiAddFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); + #undef cb + + output_ptr 
= output_base + 3 * OW_ * PACK_C_SIZE; + + v00 = GiAddFloat32(v21, v22); + v01 = GiAddFloat32(v23, v24); + v02 = GiAddFloat32(v20, v00); + v02 = GiAddFloat32(v02, v01); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr, v02); + + v03 = GiSubtractFloat32(v21, v22); + v04 = GiSubtractFloat32(v23, v24); + v05 = MADD(v03, v04, v0, 1); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + PACK_C_SIZE, v05); + + v02 = MADD(v00, v01, v0, 2); + + v02 = GiAddFloat32(v02, vbias); + ${nonline_gen_func(v02, v02)}; + GiStoreFloat32(output_ptr + 2 * PACK_C_SIZE, v02); + + v05 = MADD(v03, v04, v0, 3); + v05 = GiAddFloat32(v05, v25); + + v05 = GiAddFloat32(v05, vbias); + ${nonline_gen_func(v05, v05)}; + GiStoreFloat32(output_ptr + 3 * PACK_C_SIZE, v05); + }else{ + + const float* buf_base = + transform_output_ptr_ + ocb * nr_tiles_in_loop_ * 4 + tile_idx * 4; + const float* buf_ptr = NULL; + + // load line 1 -> v10 ... v15 + buf_ptr = buf_base + row_step; +#define cb(i) v1##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 2 -> v20 ... v25 + buf_ptr = buf_base + 2 * row_step; +#define cb(i) \ + v2##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v0##i = GiAddFloat32(v1##i, v2##i); \ + v1##i = GiSubtractFloat32(v1##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 3 -> v30 ... v35 + buf_ptr = buf_base + 3 * row_step; +#define cb(i) v3##i = GiLoadFloat32(buf_ptr + i * col_step); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // load line 4 -> v40 ... v45 + buf_ptr = buf_base + 4 * row_step; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v2##i = GiAddFloat32(v3##i, v4##i); \ + v3##i = GiSubtractFloat32(v3##i, v4##i); \ + v4##i = MADD(v0##i, v2##i, v0, 2); \ + v2##i = GiAddFloat32(v2##i, v0##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 2, v40 ... v45 -> v02 ... v05 + // v40 ... v45 is free. + v00 = GiAddFloat32(v41, v42); + v01 = GiAddFloat32(v43, v44); + v02 = GiAddFloat32(v40, v00); + v02 = GiAddFloat32(v02, v01); + + v04 = MADD(v00, v01, v0, 2); + + v00 = GiSubtractFloat32(v41, v42); + v01 = GiSubtractFloat32(v43, v44); + v03 = MADD(v00, v01, v0, 1); + + v05 = MADD(v00, v01, v0, 3); + v05 = GiAddFloat32(v05, v45); + + buf_ptr = buf_base; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v4##i, v2##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 0 + // v40 ... v45 -> v22 ... v25 + v20 = GiAddFloat32(v41, v42); + v21 = GiAddFloat32(v43, v44); + v22 = GiAddFloat32(v40, v20); + v22 = GiAddFloat32(v22, v21); + + v24 = MADD(v20, v21, v0, 2); + + v20 = GiSubtractFloat32(v41, v42); + v21 = GiSubtractFloat32(v43, v44); + v23 = MADD(v20, v21, v0, 1); + + v25 = MADD(v20, v21, v0, 3); + v25 = GiAddFloat32(v25, v45); + +#define cb(i) \ + v4##i = MADD(v1##i, v3##i, v0, 1); \ + v3##i = MADD(v1##i, v3##i, v0, 3); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 1 + // v40 ... v45 -> v12 ... 
v15 + v10 = GiAddFloat32(v41, v42); + v11 = GiAddFloat32(v43, v44); + v12 = GiAddFloat32(v40, v10); + v12 = GiAddFloat32(v12, v11); + + v14 = MADD(v10, v11, v0, 2); + + v10 = GiSubtractFloat32(v41, v42); + v11 = GiSubtractFloat32(v43, v44); + v13 = MADD(v10, v11, v0, 1); + + v15 = MADD(v10, v11, v0, 3); + v15 = GiAddFloat32(v15, v45); + + buf_ptr = buf_base + 5 * row_step; +#define cb(i) \ + v4##i = GiLoadFloat32(buf_ptr + i * col_step); \ + v4##i = GiAddFloat32(v3##i, v4##i); + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + // result line 3 + // v40 ... v45 -> v32 ... v35 + v30 = GiAddFloat32(v41, v42); + v31 = GiAddFloat32(v43, v44); + v32 = GiAddFloat32(v40, v30); + v32 = GiAddFloat32(v32, v31); + + v34 = MADD(v30, v31, v0, 2); + + v30 = GiSubtractFloat32(v41, v42); + v31 = GiSubtractFloat32(v43, v44); + v33 = MADD(v30, v31, v0, 1); + + v35 = MADD(v30, v31, v0, 3); + v35 = GiAddFloat32(v35, v45); + + float* output_base = outptr_ + oc * OH_ * OW_ + oh_start * OW_ * PACK_C_SIZE + + ow_start * PACK_C_SIZE; + float* output_ptr = NULL; + + + ${nonline_gen_init()} + if (bias) { + vbias = GiLoadFloat32(bias + oc); + } +# define BIAS_LINE(i, j, k) \ + v##j##k = GiAddFloat32(v##j##k, vbias); + +#define BIAS(m) \ + BIAS_LINE(3, m, 5) \ + BIAS_LINE(2, m, 4) \ + BIAS_LINE(1, m, 3) \ + BIAS_LINE(0, m, 2) + + +// add_bias +if(bias){ + BIAS(0) + BIAS(1) + BIAS(2) + BIAS(3) +} +#undef BIAS_LINE +#undef BIAS + +// activate + +${nonline_gen_func(v35, vbias)};v35=vbias; +${nonline_gen_func(v34, vbias)};v34=vbias; +${nonline_gen_func(v33, vbias)};v33=vbias; +${nonline_gen_func(v32, vbias)};v32=vbias; + +${nonline_gen_func(v25, vbias)};v25=vbias; +${nonline_gen_func(v24, vbias)};v24=vbias; +${nonline_gen_func(v23, vbias)};v23=vbias; +${nonline_gen_func(v22, vbias)};v22=vbias; + +${nonline_gen_func(v15, vbias)};v15=vbias; +${nonline_gen_func(v14, vbias)};v14=vbias; +${nonline_gen_func(v13, vbias)};v13=vbias; +${nonline_gen_func(v12, vbias)};v12=vbias; + +${nonline_gen_func(v05, vbias)};v05=vbias; +${nonline_gen_func(v04, vbias)};v04=vbias; +${nonline_gen_func(v03, vbias)};v03=vbias; +${nonline_gen_func(v02, vbias)};v02=vbias; + + +// store +# define STORE_LINE(i, j, k) \ +if(num_valid_ow >i){ \ + GiStoreFloat32(output_ptr + i * PACK_C_SIZE, v##j##k); \ +} +#define STORE(m, l) \ +if(num_valid_oh >m){ \ + output_ptr = output_base + m * OW_ * PACK_C_SIZE; \ + STORE_LINE(3, l, 5) \ + STORE_LINE(2, l, 4) \ + STORE_LINE(1, l, 3) \ + STORE_LINE(0, l, 2) \ +} + STORE(3, 3) + STORE(2, 0) + STORE(1, 1) + STORE(0, 2) + } + +#undef MSUB +#undef MADD + } + })"; + std::string nonline_mode = ctx->haveAttr("nonlineMode") + ? 
ctx->getAttrStr("nonlineMode") + : "IDENTITY"; + auto nonline_gen = create_activation_gener_instrinsic(nonline_mode); + auto nonline_gen_func = [&](std::vector str) -> std::string { + return nonline_gen->GenIntrinsicFloat(str[0], str[1]); + }; + auto nonline_gen_init = [&]() -> std::string { + return nonline_gen->GenIntrinsicInitFloat(); + }; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("nonline_gen_func", nonline_gen_func) + .add("nonline_gen_init", nonline_gen_init) + .add("transform_output_ptr", strs[0]) + .add("outptr", strs[1]) + .add("bias_ptr", strs[2]) + .add("OH", strs[3]) + .add("OW", strs[4]) + .add("OC", strs[5]) + .add("tile_id", strs[6]) + .add("nr_tiles_in_loop", strs[7]) + .render(ouput_trans); + return ss.str(); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h new file mode 100644 index 00000000..13dbccec --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h @@ -0,0 +1,36 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include "Common/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "WinogradCommon.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace GeneralIntrinsic { + +class WinogradF43Strategy4x16MK4 : public WinogradStrategyBase { +public: + uint32_t GetKernelSize() override { return 3; } + uint32_t GetOutputBlockSize() override { return 4; } + std::string DependMatmulSymbol() override; + std::string WeightTrans(const std::vector& strs) override; + std::string InputFeatureTrans( + const std::vector& strs) override; + std::string BatchedMatMul(const std::vector& strs) override; + std::string OutputFeatureTrans(const std::vector& strs, + TContext*) override; +}; + +} // namespace GeneralIntrinsic +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen From 4f3b5ddb15cc5610315d02b8dd56a23b3cf0a3dd Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 11:11:38 +0800 Subject: [PATCH 13/17] feat(compiler): add f63 winograd for arm64 kernel --- .../Winograd/WinogradF63Strategy4x16MK4.cpp | 667 ++++++++++++++++++ .../Winograd/WinogradF63Strategy4x16MK4.h | 36 + 2 files changed, 703 insertions(+) create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp create mode 100644 compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp new file mode 100644 index 00000000..c63bb9f9 --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp @@ -0,0 +1,667 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ + +#include "WinogradF63Strategy4x16MK4.h" +#include +#include "Arm/Arm64/InternalKernel/InternalKernel.h" +#include "GeneralIntrinsic/Activation.h" +#include "GeneralIntrinsic/ConvKernel/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "compiler/KernelGen/KernelGen.h" + +using namespace megcc; +using namespace KernelGen; +using namespace GeneralIntrinsic; + +std::string WinogradF63Strategy4x16MK4::WeightTrans( + const std::vector& strs) { + auto inptr = strs[0]; + auto outptr = strs[1]; + auto OC = strs[2]; + auto IC = strs[3]; + std::string filter_process = R"( + const uint32_t PACK_C_SIZE= 4; + const uint32_t KERNEL_SIZE = 3; + size_t OCB = ${OC} / PACK_C_SIZE; + size_t ICB = ${IC} / PACK_C_SIZE; + + for (size_t ocb = 0; ocb < OCB; ocb++) { + for (size_t icb = 0; icb < ICB; icb++) { + for (size_t ic_inner = 0; ic_inner < PACK_C_SIZE; ic_inner++) { + const float* fptr = ${filter} + (ocb * ICB + icb) * KERNEL_SIZE * + KERNEL_SIZE * PACK_C_SIZE * PACK_C_SIZE + + ic_inner * PACK_C_SIZE; + //! read 4OC 1IC filter + GI_FLOAT32_t g00 = GiLoadFloat32(fptr + 0* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g01 = GiLoadFloat32(fptr + 1* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g02 = GiLoadFloat32(fptr + 2* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g10 = GiLoadFloat32(fptr + 3* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g11 = GiLoadFloat32(fptr + 4* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g12 = GiLoadFloat32(fptr + 5* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g20 = GiLoadFloat32(fptr + 6* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g21 = GiLoadFloat32(fptr + 7* PACK_C_SIZE * PACK_C_SIZE); + GI_FLOAT32_t g22 = GiLoadFloat32(fptr + 8* PACK_C_SIZE * PACK_C_SIZE); + + //! twice matmul + GI_FLOAT32_t tmp0, tmp1; + ${FilterTransUnroll(3, midle, g, tmp0, tmp1)} + ${FilterTransUnroll(8, ret, midle, tmp0, tmp1)} + + //! 
write to the dst + float* dst = ${outptr}; + ${StoreRet2D(8, 8, ret)}; + } + } + })"; + auto FilterTransUnroll = [](const std::vector& strs) { + int times = std::stoi(strs[0]); + std::string dst = strs[1]; + std::string src = strs[2]; + std::string tmp0 = strs[3]; + std::string tmp1 = strs[4]; + std::stringstream ss; + for (int i = 0; i < times; i++) { + ss << "GI_FLOAT32_t " << dst << i << "0 = " << src << "0" << i + << ";\n"; + ss << tmp0 << " = GiMultiplyScalerFloat32(GiAddFloat32(" << src + << "0" << i << ", " << src << "2" << i << "), (-2.0/9));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", (-2.0/9));\n"; + ss << "GI_FLOAT32_t " << dst << i << "1 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "2 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 1.0/90), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 2.0/45));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 2.0/90);\n"; + ss << "GI_FLOAT32_t " << dst << i << "3 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "4 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << tmp0 << " = GiAddFloat32(GiMultiplyScalerFloat32(" << src + << "0" << i << ", 32.0/45), GiMultiplyScalerFloat32(" << src + << "2" << i << ", 8.0/45));\n"; + ss << tmp1 << " = GiMultiplyScalerFloat32(" << src << "1" << i + << ", 16.0/45);\n"; + ss << "GI_FLOAT32_t " << dst << i << "5 = GiAddFloat32(" << tmp0 + << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "6 = GiSubtractFloat32(" + << tmp0 << ", " << tmp1 << ");\n"; + ss << "GI_FLOAT32_t " << dst << i << "7 = " << src << "2" << i + << ";\n"; + } + return ss.str(); + }; + + auto StoreRet2D = [](const std::vector& strs) { + int times_out = std::stoi(strs[0]); + int times_inner = std::stoi(strs[1]); + std::string src = strs[2]; + std::stringstream ss; + for (int out = 0; out < times_out; out++) { + for (int inner = 0; inner < times_inner; inner++) { + ss << "GiStoreFloat32(dst + (" << out << " * Alpha + " << inner + << ") * OCB * ICB * PACK_C_SIZE * PACK_C_SIZE + ocb * ICB * " + "PACK_C_SIZE *PACK_C_SIZE + icb* PACK_C_SIZE * " + "PACK_C_SIZE + " + "ic_inner*PACK_C_SIZE, " + << src << out << inner << ");\n"; + } + } + return ss.str(); + }; + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("StoreRet2D", StoreRet2D) + .add("FilterTransUnroll", FilterTransUnroll) + .add("OC", OC) + .add("IC", IC) + .add("filter", inptr) + .add("outptr", outptr) + .render(filter_process); + return ss.str(); +} + +std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( + const std::vector& strs) { + auto InputPrepareF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + size_t IW4 = IW_ * PACK_C_SIZE; + size_t iw4_start = iw_start * PACK_C_SIZE; + size_t icb = ic / PACK_C_SIZE; + memset(patchT, 0, sizeof(float) * PACK_C_SIZE * Alpha * Alpha); + if (inner) { + const float* input_ptr = + source + icb * IH_ * IW4 + ih_start * IW4 + iw4_start; + for (size_t ih = 0; ih < Alpha; ih++) { +#define cb(i) GI_FLOAT32_t v##i = GiLoadFloat32(input_ptr + PACK_C_SIZE * i); + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + +#define cb(i) GiStoreFloat32(patchT + ih * PACK_C_SIZE * Alpha + i * PACK_C_SIZE, v##i); + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + input_ptr += IW4; + } + } else { + int ih0_act = ih_start >0 ? 
ih_start:0, + ih1_act = (ih_start + Alpha)< IH_?(ih_start + Alpha):IH_, + iw0_act = iw_start > 0 ? iw_start : 0, + iw1_act =(iw_start + Alpha)< IW_?(iw_start + Alpha):IW_; + const float* input_ptr = source + icb * IH_ * IW4; + // partial copy + for (int ih = ih0_act; ih < ih1_act; ++ih) { + for (int iw = iw0_act; iw < iw1_act; ++iw) { + size_t iho = ih - ih_start, iwo = iw - iw_start; + GI_FLOAT32_t src = GiLoadFloat32(input_ptr + ih * IW4 + iw * PACK_C_SIZE); + GiStoreFloat32( + patchT + iho * PACK_C_SIZE * Alpha + iwo * PACK_C_SIZE, src); + } + } + } + + +)"; + return kernel; + }; + auto InputTransformF43NCHW44 = [](std::vector) { + std::stringstream ss; + std::string kernel = R"( + // BT * d * B + + size_t ICB = IC_ / PACK_C_SIZE; + + GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7; +#if defined(GI_TARGET_X86) || defined(GI_RVV_INTRINSICS) + const float* v0 = input_parameters + 0; + const float* v1 = input_parameters + 4; + const float* v2 = input_parameters + 8; +#else + GI_FLOAT32_t v0 = GiLoadFloat32(input_parameters + 0); + GI_FLOAT32_t v1 = GiLoadFloat32(input_parameters + 4); + GI_FLOAT32_t v2 = GiLoadFloat32(input_parameters + 8); +#endif + + //! B + //! 1 0 0 0 0 0 0 0 + //! 0 1 -1 0.5 -0.5 2 -2 -1 + //! -5.25 1 1 0.25 0.25 4 4 0 + //! 0 -4.25 4.25 -2.5 2.5 -2.5 2.5 5.25 + //! 5.25 -4.25 -4.25 -1.25 -1.25 -5 -5 0 + //! 0 1 -1 2 -2 0.5 -0.5 -5.25 + //! -1 1 1 1 1 1 1 0 + //! 0 0 0 0 0 0 0 1 + +#define cb(i) \ + d1 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 1 * PACK_C_SIZE); \ + d2 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 2 * PACK_C_SIZE); \ + d3 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 3 * PACK_C_SIZE); \ + d4 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 4 * PACK_C_SIZE); \ + d5 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 5 * PACK_C_SIZE); \ + d6 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 6 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##0 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 0 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##7 = GiLoadFloat32(patchT + i * Alpha * PACK_C_SIZE + 7 * PACK_C_SIZE); \ + GI_FLOAT32_t t##i##1 = d6; \ + GI_FLOAT32_t t##i##2 = d6; \ + GI_FLOAT32_t t##i##3 = d6; \ + GI_FLOAT32_t t##i##4 = d6; \ + GI_FLOAT32_t t##i##5 = d6; \ + GI_FLOAT32_t t##i##6 = d6; \ + t##i##0 = GiSubtractFloat32(t##i##0, d6); \ + t##i##1 = GiAddFloat32(t##i##1, d1); \ + t##i##2 = GiSubtractFloat32(t##i##2, d1); \ + t##i##3 = MADD(t##i##3, d1, v0, 2); \ + t##i##4 = MSUB(t##i##4, d1, v0, 2); \ + t##i##5 = MADD(t##i##5, d1, v1, 2); \ + t##i##6 = MSUB(t##i##6, d1, v1, 2); \ + t##i##7 = GiSubtractFloat32(t##i##7, d1); \ + t##i##0 = MSUB(t##i##0, d2, v0, 0); \ + t##i##1 = GiAddFloat32(t##i##1, d2); \ + t##i##2 = GiAddFloat32(t##i##2, d2); \ + t##i##3 = MADD(t##i##3, d2, v0, 3); \ + t##i##4 = MADD(t##i##4, d2, v0, 3); \ + t##i##5 = MADD(t##i##5, d2, v1, 3); \ + t##i##6 = MADD(t##i##6, d2, v1, 3); \ + t##i##1 = MSUB(t##i##1, d3, v0, 1); \ + t##i##2 = MADD(t##i##2, d3, v0, 1); \ + t##i##3 = MSUB(t##i##3, d3, v1, 0); \ + t##i##4 = MADD(t##i##4, d3, v1, 0); \ + t##i##5 = MSUB(t##i##5, d3, v1, 0); \ + t##i##6 = MADD(t##i##6, d3, v1, 0); \ + t##i##7 = MADD(t##i##7, d3, v0, 0); \ + t##i##0 = MADD(t##i##0, d4, v0, 0); \ + t##i##1 = MSUB(t##i##1, d4, v0, 1); \ + t##i##2 = MSUB(t##i##2, d4, v0, 1); \ + t##i##3 = MSUB(t##i##3, d4, v1, 1); \ + t##i##4 = MSUB(t##i##4, d4, v1, 1); \ + t##i##5 = MSUB(t##i##5, d4, v2, 0); \ + t##i##6 = MSUB(t##i##6, d4, v2, 0); \ + t##i##1 = GiAddFloat32(t##i##1, d5); \ + t##i##2 = GiSubtractFloat32(t##i##2, d5); \ + t##i##3 = 
MADD(t##i##3, d5, v1, 2); \ + t##i##4 = MSUB(t##i##4, d5, v1, 2); \ + t##i##5 = MADD(t##i##5, d5, v0, 2); \ + t##i##6 = MSUB(t##i##6, d5, v0, 2); \ + t##i##7 = MSUB(t##i##7, d5, v0, 0); + UNROLL_CALL_RAW(8, cb); +#undef cb + +#define cb(i) \ + d0 = t0##i; \ + d1 = t6##i; \ + d2 = t6##i; \ + d3 = t6##i; \ + d4 = t6##i; \ + d5 = t6##i; \ + d6 = t6##i; \ + d7 = t7##i; \ + d0 = GiSubtractFloat32(d0, t6##i); \ + d1 = GiAddFloat32(d1, t1##i); \ + d2 = GiSubtractFloat32(d2, t1##i); \ + d3 = MADD(d3, t1##i, v0, 2); \ + d4 = MSUB(d4, t1##i, v0, 2); \ + d5 = MADD(d5, t1##i, v1, 2); \ + d6 = MSUB(d6, t1##i, v1, 2); \ + d7 = GiSubtractFloat32(d7, t1##i); \ + d0 = MSUB(d0, t2##i, v0, 0); \ + d1 = GiAddFloat32(d1, t2##i); \ + d2 = GiAddFloat32(d2, t2##i); \ + d3 = MADD(d3, t2##i, v0, 3); \ + d4 = MADD(d4, t2##i, v0, 3); \ + d5 = MADD(d5, t2##i, v1, 3); \ + d6 = MADD(d6, t2##i, v1, 3); \ + d1 = MSUB(d1, t3##i, v0, 1); \ + d2 = MADD(d2, t3##i, v0, 1); \ + d3 = MSUB(d3, t3##i, v1, 0); \ + d4 = MADD(d4, t3##i, v1, 0); \ + d5 = MSUB(d5, t3##i, v1, 0); \ + d6 = MADD(d6, t3##i, v1, 0); \ + d7 = MADD(d7, t3##i, v0, 0); \ + d0 = MADD(d0, t4##i, v0, 0); \ + d1 = MSUB(d1, t4##i, v0, 1); \ + d2 = MSUB(d2, t4##i, v0, 1); \ + d3 = MSUB(d3, t4##i, v1, 1); \ + d4 = MSUB(d4, t4##i, v1, 1); \ + d5 = MSUB(d5, t4##i, v2, 0); \ + d6 = MSUB(d6, t4##i, v2, 0); \ + d1 = GiAddFloat32(d1, t5##i); \ + d2 = GiSubtractFloat32(d2, t5##i); \ + d3 = MADD(d3, t5##i, v1, 2); \ + d4 = MSUB(d4, t5##i, v1, 2); \ + d5 = MADD(d5, t5##i, v0, 2); \ + d6 = MSUB(d6, t5##i, v0, 2); \ + d7 = MSUB(d7, t5##i, v0, 0); \ + GiStoreFloat32( \ + dst + \ + (0 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d0); \ + GiStoreFloat32( \ + dst + \ + (1 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d1); \ + GiStoreFloat32( \ + dst + \ + (2 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d2); \ + GiStoreFloat32( \ + dst + \ + (3 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d3); \ + GiStoreFloat32( \ + dst + \ + (4 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d4); \ + GiStoreFloat32( \ + dst + \ + (5 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d5); \ + GiStoreFloat32( \ + dst + \ + (6 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d6); \ + GiStoreFloat32( \ + dst + \ + (7 * Alpha + i) * ICB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + icb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE, \ + d7); + UNROLL_CALL_RAW(8, cb); +#undef cb + + +)"; + return kernel; + }; + + std::string input_process = R"( + const uint32_t OUTPUT_BLOCK_SIZE = 6; + const uint32_t KS = 3; + + float* dst = ${transform_input_ptr}; + const float* source = ${inptr}; + uint32_t IH_ = ${IH}; + uint32_t IW_ = ${IW}; + uint32_t IC_ = ${IC}; + uint32_t PH_ = ${PH}; + uint32_t PW_ = ${PW}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tile_id_ = ${tile_id}; + + + const float input_parameters[12] = {5.25f, 4.25f, 0.5f, 0.25f, 2.5f, 1.25f, + 2.0f, 4.0f, 5.0f, 0.0f, 0.0f, 0.0f}; + + #if defined(GI_TARGET_X86) || 
defined(GI_RVV_INTRINSICS) + //! x86 and rvv GiSimdFmaLane API is slowly, as an alternate, use + //! GiMultiplyAddScalarFloat32 + #define MADD(a, b, c, d) GiMultiplyAddScalarFloat32(a, b, *(c + d)) + #define MSUB(a, b, c, d) GiMultiplySubScalarFloat32(a, b, *(c + d)) + #else + #define MADD(a, b, c, d) GiSimdFmaLane(a, b, c, d) + #define MSUB(a, b, c, d) GiFmsqLaneQFloat32(a, b, c, d) + #endif + + uint32_t OW = IW_ + 2 * PW_ - KS + 1; + uint32_t tiles_w = (OW + OUTPUT_BLOCK_SIZE -1)/ OUTPUT_BLOCK_SIZE; + float* patch = transform_mid_ptr; + float* patchT = transform_mid_ptr + PACK_C_SIZE * Alpha * Alpha; + + for (uint32_t ic = 0; ic < IC_; ic += 4) { + uint32_t tile_start_id = tile_id_; + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_start_id + tile_idx; + uint32_t nh = index / tiles_w; + uint32_t nw = index % tiles_w; + + int ih_start = nh * OUTPUT_BLOCK_SIZE - PH_; + int iw_start = nw * OUTPUT_BLOCK_SIZE - PW_; + int inner = (ih_start >= 0 && iw_start >= 0 && + ih_start + Alpha <= (int)IH_ && + iw_start + Alpha <= (int)IW_)?1:0; + + + ${InputPrepareF43NCHW44()} + ${InputTransformF43NCHW44()} + } + })"; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("inptr", strs[0]) + .add("transform_input_ptr", strs[1]) + .add("IH", strs[2]) + .add("IW", strs[3]) + .add("IC", strs[4]) + .add("PH", strs[5]) + .add("PW", strs[6]) + .add("tile_id", strs[7]) + .add("nr_tiles_in_loop", strs[8]) + .add("InputTransformF43NCHW44", InputTransformF43NCHW44) + .add("InputPrepareF43NCHW44", InputPrepareF43NCHW44) + .render(input_process); + return ss.str(); +} + +std::string WinogradF63Strategy4x16MK4::DependMatmulSymbol() { + return Arm64::MatmulM4N16MK4Kernel().GetKernelSymbol(NULL); +} + +std::string WinogradF63Strategy4x16MK4::BatchedMatMul( + const std::vector& strs) { + std::string matmul_compute = R"( + for(uint32_t i =0; i< Alpha; i++){ + for(uint32_t j=0; j& strs, TContext* ctx) { + std::string ouput_trans = R"( + float* transform_output_ptr_ = ${transform_output_ptr}; + float* outptr_ = ${outptr}; + const float* bias = ${bias_ptr}; + + uint32_t OH_ = ${OH}; + uint32_t OW_ = ${OW}; + uint32_t OC_ = ${OC}; + uint32_t tile_id_ = ${tile_id}; + uint32_t nr_tiles_in_loop_ = ${nr_tiles_in_loop}; + uint32_t tiles_w_ = (OW_ + OutputBlockSize -1) / OutputBlockSize; + for (uint32_t oc = 0; oc < OC_; oc += 4) { + for(uint32_t tile_idx = 0; tile_idx < nr_tiles_in_loop_; tile_idx++) { + uint32_t index = tile_id_ + tile_idx; + uint32_t nh = index / tiles_w_; + uint32_t nw = index % tiles_w_; + uint32_t oh_start = nh * OutputBlockSize; + uint32_t ow_start = nw * OutputBlockSize; + //! 
AT * m * A + + size_t OCB = OC_ / PACK_C_SIZE; + size_t ocb = oc / PACK_C_SIZE; + +#define cb(m, n) \ + GI_FLOAT32_t v##m##n = GiLoadFloat32( \ + transform_output_ptr_ + \ + (m * Alpha + n) * OCB * nr_tiles_in_loop_ * PACK_C_SIZE + \ + ocb * nr_tiles_in_loop_ * PACK_C_SIZE + tile_idx * PACK_C_SIZE); + UNROLL_CALL_NOWRAPPER_D2(8, 8, cb); +#undef cb + + /** + * A + * + * 1 0 0 0 0 0 + * 1 1 1 1 1 1 + * 1 -1 1 -1 1 -1 + * 1 2 4 8 16 32 + * 1 -2 4 -8 16 -32 + * 1 0.5 0.25 0.125 0.0625 0.03125 + * 1 -0.5 0.25 -0.125 0.0625 -0.03125 + * 0 0 0 0 0 1 + */ + + /* + * v1addv2 = v1##m + v2##m; + * v1subv2 = v1##m - v2##m; + * v3addv4 = v3##m + v4##m; + * v3subv4 = v3##m - v4##m; + * v5addv6 = v5##m + v6##m; + * v5subv6 = v5##m - v6##m; + * t0##m = v0##m + v1addv2 + v3addv4 + v5addv6; + * t1##m = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f; + * t2##m = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f; + * t3##m = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f; + * t4##m = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f; + * t5##m = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + v7##m; + */ + GI_FLOAT32_t v1addv2, v1subv2, v3addv4, v3subv4, v5addv6, v5subv6; +#define cb(m) \ + v1addv2 = GiAddFloat32(v1##m, v2##m); \ + v1subv2 = GiSubtractFloat32(v1##m, v2##m); \ + v3addv4 = GiAddFloat32(v3##m, v4##m); \ + v3subv4 = GiSubtractFloat32(v3##m, v4##m); \ + v5addv6 = GiAddFloat32(v5##m, v6##m); \ + v5subv6 = GiSubtractFloat32(v5##m, v6##m); \ + GI_FLOAT32_t t0##m = GiAddFloat32(GiAddFloat32(GiAddFloat32(v0##m, v1addv2), v3addv4), v5addv6); \ + GI_FLOAT32_t t1##m = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 2.f)), GiMultiplyScalerFloat32(v5subv6, 0.5f)); \ + GI_FLOAT32_t t2##m = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 4.f)), GiMultiplyScalerFloat32(v5addv6, 0.25f)); \ + GI_FLOAT32_t t3##m = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 8.f)), GiMultiplyScalerFloat32(v5subv6, 0.125f)); \ + GI_FLOAT32_t t4##m = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 16.f)), GiMultiplyScalerFloat32(v5addv6, 0.0625f)); \ + GI_FLOAT32_t t5##m = \ + GiAddFloat32(GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 32.f)), GiMultiplyScalerFloat32(v5subv6, 0.03125f)), \ + v7##m); + + UNROLL_CALL_NOWRAPPER(8, cb); +#undef cb + + /* + * v1addv2 = t##m##1 + t##m##2; + * v1subv2 = t##m##1 - t##m##2; + * v3addv4 = t##m##3 + t##m##4; + * v3subv4 = t##m##3 - t##m##4; + * v5addv6 = t##m##5 + t##m##6; + * v5subv6 = t##m##5 - t##m##6; + * v##m##0 = t##m##0 + v1addv2 + v3addv4 + v5addv6; + * v##m##1 = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f; + * v##m##2 = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f; + * v##m##3 = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f; + * v##m##4 = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f; + * v##m##5 = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + t##m##7; + */ +#define cb(m) \ + v1addv2 = GiAddFloat32(t##m##1, t##m##2); \ + v1subv2 = GiSubtractFloat32(t##m##1, t##m##2); \ + v3addv4 = GiAddFloat32(t##m##3, t##m##4); \ + v3subv4 = GiSubtractFloat32(t##m##3, t##m##4); \ + v5addv6 = GiAddFloat32(t##m##5, t##m##6); \ + v5subv6 = GiSubtractFloat32(t##m##5, t##m##6); \ + v##m##0 = GiAddFloat32(GiAddFloat32(GiAddFloat32(t##m##0, v1addv2), v3addv4), v5addv6); \ + v##m##1 = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 2.f)), GiMultiplyScalerFloat32(v5subv6, 0.5f)); \ + v##m##2 = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 4.f)), GiMultiplyScalerFloat32(v5addv6, 0.25f)); \ + 
v##m##3 = GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 8.f)), GiMultiplyScalerFloat32(v5subv6, 0.125f)); \ + v##m##4 = GiAddFloat32(GiAddFloat32(v1addv2, GiMultiplyScalerFloat32(v3addv4, 16.f)), GiMultiplyScalerFloat32(v5addv6, 0.0625f)); \ + v##m##5 = \ + GiAddFloat32(GiAddFloat32(GiAddFloat32(v1subv2, GiMultiplyScalerFloat32(v3subv4, 32.f)), GiMultiplyScalerFloat32(v5subv6, 0.03125f)), \ + t##m##7); + + UNROLL_CALL_NOWRAPPER(6, cb); +#undef cb + + GI_FLOAT32_t vbias; + if (bias) { + vbias = GiLoadFloat32(bias + oc); + +#define cb(m, n) v##m##n = GiAddFloat32(v##m##n, vbias); + UNROLL_CALL_RAW_D2(6, 6, cb); +#undef cb + } +${nonline_gen_init()} +${nonline_gen_func(v00, vbias)};v00=vbias; +${nonline_gen_func(v01, vbias)};v01=vbias; +${nonline_gen_func(v02, vbias)};v02=vbias; +${nonline_gen_func(v03, vbias)};v03=vbias; +${nonline_gen_func(v04, vbias)};v04=vbias; +${nonline_gen_func(v05, vbias)};v05=vbias; + +${nonline_gen_func(v10, vbias)};v10=vbias; +${nonline_gen_func(v11, vbias)};v11=vbias; +${nonline_gen_func(v12, vbias)};v12=vbias; +${nonline_gen_func(v13, vbias)};v13=vbias; +${nonline_gen_func(v14, vbias)};v14=vbias; +${nonline_gen_func(v15, vbias)};v15=vbias; + +${nonline_gen_func(v20, vbias)};v20=vbias; +${nonline_gen_func(v21, vbias)};v21=vbias; +${nonline_gen_func(v22, vbias)};v22=vbias; +${nonline_gen_func(v23, vbias)};v23=vbias; +${nonline_gen_func(v24, vbias)};v24=vbias; +${nonline_gen_func(v25, vbias)};v25=vbias; + +${nonline_gen_func(v30, vbias)};v30=vbias; +${nonline_gen_func(v31, vbias)};v31=vbias; +${nonline_gen_func(v32, vbias)};v32=vbias; +${nonline_gen_func(v33, vbias)};v33=vbias; +${nonline_gen_func(v34, vbias)};v34=vbias; +${nonline_gen_func(v35, vbias)};v35=vbias; + +${nonline_gen_func(v40, vbias)};v40=vbias; +${nonline_gen_func(v41, vbias)};v41=vbias; +${nonline_gen_func(v42, vbias)};v42=vbias; +${nonline_gen_func(v43, vbias)};v43=vbias; +${nonline_gen_func(v44, vbias)};v44=vbias; +${nonline_gen_func(v45, vbias)};v45=vbias; + +${nonline_gen_func(v50, vbias)};v50=vbias; +${nonline_gen_func(v51, vbias)};v51=vbias; +${nonline_gen_func(v52, vbias)};v52=vbias; +${nonline_gen_func(v53, vbias)};v53=vbias; +${nonline_gen_func(v54, vbias)};v54=vbias; +${nonline_gen_func(v55, vbias)};v55=vbias; + + +#define out_save(oho, owo) \ + do { \ + size_t oh = oh_start + oho; \ + size_t ow = ow_start + owo; \ + if (oh < OH && ow < OW) { \ + GiStoreFloat32( \ + outptr_ + oc * OH * OW + oh * OW * PACK_C_SIZE + ow * PACK_C_SIZE, \ + v##oho##owo); \ + } \ + } while (0); + UNROLL_CALL_RAW_D2(6, 6, out_save); + +#undef out_save + +#undef MSUB +#undef MADD + } + })"; + std::string nonline_mode = ctx->haveAttr("nonlineMode") + ? 
ctx->getAttrStr("nonlineMode") + : "IDENTITY"; + auto nonline_gen = create_activation_gener_instrinsic(nonline_mode); + auto nonline_gen_func = [&](std::vector str) -> std::string { + return nonline_gen->GenIntrinsicFloat(str[0], str[1]); + }; + auto nonline_gen_init = [&]() -> std::string { + return nonline_gen->GenIntrinsicInitFloat(); + }; + + std::stringstream ss; + ss << StringTemplate::StringTemplateArgs() + .add("nonline_gen_func", nonline_gen_func) + .add("nonline_gen_init", nonline_gen_init) + .add("transform_output_ptr", strs[0]) + .add("outptr", strs[1]) + .add("bias_ptr", strs[2]) + .add("OH", strs[3]) + .add("OW", strs[4]) + .add("OC", strs[5]) + .add("tile_id", strs[6]) + .add("nr_tiles_in_loop", strs[7]) + .render(ouput_trans); + return ss.str(); +} + +// vim: syntax=cpp.doxygen diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h new file mode 100644 index 00000000..dfd83b9b --- /dev/null +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h @@ -0,0 +1,36 @@ +/** + * \file + * compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include "Common/ConvKernel.h" +#include "Utils/StringTemplate.h" +#include "WinogradCommon.h" +#include "compiler/KernelGen/KernelGen.h" +namespace megcc { +namespace KernelGen { +namespace GeneralIntrinsic { + +class WinogradF63Strategy4x16MK4 : public WinogradStrategyBase { +public: + uint32_t GetKernelSize() override { return 3; } + uint32_t GetOutputBlockSize() override { return 6; } + std::string DependMatmulSymbol() override; + std::string WeightTrans(const std::vector& strs) override; + std::string InputFeatureTrans( + const std::vector& strs) override; + std::string BatchedMatMul(const std::vector& strs) override; + std::string OutputFeatureTrans(const std::vector& strs, + TContext*) override; +}; + +} // namespace GeneralIntrinsic +} // namespace KernelGen +} // namespace megcc + +// vim: syntax=cpp.doxygen From 5d345c3ddf5878f0bdcdfbbe678dff9840fa4a36 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 12:17:41 +0800 Subject: [PATCH 14/17] feat(compiler): add f63 f43 kernel test and benchmark --- .../Transforms/KernelMaterialization.cpp | 2 +- .../ConvKernel/Winograd/WinogradCommon.cpp | 4 +- .../ConvKernel/Winograd/WinogradCommon.h | 3 +- .../Winograd/WinogradF43Strategy4x16MK4.cpp | 11 ++- .../Winograd/WinogradF43Strategy4x16MK4.h | 1 + .../Winograd/WinogradF63Strategy4x16MK4.cpp | 12 +-- .../Winograd/WinogradF63Strategy4x16MK4.h | 1 + compiler/lib/KernelGen/KernelGen.cpp | 6 +- .../test/kernel/opr/arm/benchmark_conv.cpp | 48 ++++++----- compiler/test/kernel/opr/arm/conv.cpp | 80 ++++++++++--------- 10 files changed, 88 insertions(+), 80 deletions(-) diff --git a/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp b/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp index 574590bb..9ad16439 100644 --- a/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp +++ b/compiler/lib/Dialect/Kernel/Transforms/KernelMaterialization.cpp @@ -291,7 +291,7 @@ class KernelMaterialization final void populateKernelMaterializationPatterns(RewritePatternSet& patterns) { if (target_arch == 
megcc::KernelGen::ARM64V7) { auto a64_registry = std::make_unique(); - Kernel::addBuiltinTemplates(*a64_registry, megcc::KernelGen::ARM64); + Kernel::addBuiltinTemplates(*a64_registry, megcc::KernelGen::ARM64V7); //! a32_registry and a64_registry shared the same map to avoid //! generating redundant armcommon kernel auto a32_registry = std::make_unique( diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp index dae4bd6a..8adec61f 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.cpp @@ -65,7 +65,7 @@ std::string WinogradFrameNchw44::GenGetWorkSpaceCode( return TinyNN_SUCCESS; })"; ss << StringTemplate::StringTemplateArgs() - .add("tile_per_loop", m_tile_per_loop) + .add("tile_per_loop", strategy->GetTileSize()) .add("KernelSize", strategy->GetKernelSize()) .add("OutputBlockSize", strategy->GetOutputBlockSize()) .render(workspace_temp); @@ -246,7 +246,7 @@ std::string WinogradFrameNchw44::GenKernelBodyCode( writer << StringTemplate::StringTemplateArgs(ctx) .add("KernelSize", strategy->GetKernelSize()) .add("OutputBlockSize", strategy->GetOutputBlockSize()) - .add("nr_tiles_per_loop", m_tile_per_loop) + .add("nr_tiles_per_loop", strategy->GetTileSize()) .add("BiasPtr", bias_ptr) .add_ctx_int("pad_h") .add_ctx_int("pad_w") diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h index 488010d3..00807dc0 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradCommon.h @@ -20,6 +20,7 @@ class WinogradStrategyBase { public: virtual uint32_t GetKernelSize() = 0; virtual uint32_t GetOutputBlockSize() = 0; + virtual uint32_t GetTileSize() { return 32; }; //! transform the weight to winograd space, input strings are: //! 0: inptr, the start pointer of the convolution weight @@ -63,8 +64,6 @@ class WinogradStrategyBase { }; class WinogradFrameNchw44 { - uint32_t m_tile_per_loop = 32; - public: //! 
gen init code std::string GenInitCode(TContext*, WinogradStrategyBase*); diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp index becbb458..d20335e5 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.cpp @@ -970,15 +970,14 @@ std::string WinogradF43Strategy4x16MK4::OutputFeatureTrans( if (bias) { vbias = GiLoadFloat32(bias + oc); } -# define BIAS_LINE(i, j, k) \ +# define BIAS_LINE(j, k) \ v##j##k = GiAddFloat32(v##j##k, vbias); #define BIAS(m) \ - BIAS_LINE(3, m, 5) \ - BIAS_LINE(2, m, 4) \ - BIAS_LINE(1, m, 3) \ - BIAS_LINE(0, m, 2) - + BIAS_LINE(m, 5) \ + BIAS_LINE(m, 4) \ + BIAS_LINE(m, 3) \ + BIAS_LINE(m, 2) // add_bias if(bias){ diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h index 13dbccec..6df9fb57 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF43Strategy4x16MK4.h @@ -20,6 +20,7 @@ class WinogradF43Strategy4x16MK4 : public WinogradStrategyBase { public: uint32_t GetKernelSize() override { return 3; } uint32_t GetOutputBlockSize() override { return 4; } + uint32_t GetTileSize() override { return 68; }; std::string DependMatmulSymbol() override; std::string WeightTrans(const std::vector& strs) override; std::string InputFeatureTrans( diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp index c63bb9f9..7e212a42 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.cpp @@ -132,7 +132,7 @@ std::string WinogradF63Strategy4x16MK4::WeightTrans( std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( const std::vector& strs) { - auto InputPrepareF43NCHW44 = [](std::vector) { + auto InputPrepareF63NCHW44 = [](std::vector) { std::stringstream ss; std::string kernel = R"( size_t IW4 = IW_ * PACK_C_SIZE; @@ -173,7 +173,7 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( )"; return kernel; }; - auto InputTransformF43NCHW44 = [](std::vector) { + auto InputTransformF63NCHW44 = [](std::vector) { std::stringstream ss; std::string kernel = R"( // BT * d * B @@ -395,8 +395,8 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( iw_start + Alpha <= (int)IW_)?1:0; - ${InputPrepareF43NCHW44()} - ${InputTransformF43NCHW44()} + ${InputPrepareF63NCHW44()} + ${InputTransformF63NCHW44()} } })"; @@ -411,8 +411,8 @@ std::string WinogradF63Strategy4x16MK4::InputFeatureTrans( .add("PW", strs[6]) .add("tile_id", strs[7]) .add("nr_tiles_in_loop", strs[8]) - .add("InputTransformF43NCHW44", InputTransformF43NCHW44) - .add("InputPrepareF43NCHW44", InputPrepareF43NCHW44) + .add("InputTransformF63NCHW44", InputTransformF63NCHW44) + .add("InputPrepareF63NCHW44", InputPrepareF63NCHW44) .render(input_process); return ss.str(); } diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h 
b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h index dfd83b9b..9187bfaa 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ConvKernel/Winograd/WinogradF63Strategy4x16MK4.h @@ -20,6 +20,7 @@ class WinogradF63Strategy4x16MK4 : public WinogradStrategyBase { public: uint32_t GetKernelSize() override { return 3; } uint32_t GetOutputBlockSize() override { return 6; } + uint32_t GetTileSize() override { return 16; }; std::string DependMatmulSymbol() override; std::string WeightTrans(const std::vector& strs) override; std::string InputFeatureTrans( diff --git a/compiler/lib/KernelGen/KernelGen.cpp b/compiler/lib/KernelGen/KernelGen.cpp index cd7978b9..221831f0 100644 --- a/compiler/lib/KernelGen/KernelGen.cpp +++ b/compiler/lib/KernelGen/KernelGen.cpp @@ -66,8 +66,10 @@ KernelPack::GetKernel(KernelPack::KernType kernel_type, Arch arch) { } } //! WARNING: the f63 and f43 must exist in GI kernel - a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), - sorted_kern.end()); + if (arch == Arch::ARM64) { + a64_kerns.insert(a64_kerns.begin(), sorted_kern.begin(), + sorted_kern.end()); + } } else { valid_kern = gi_kerns; } diff --git a/compiler/test/kernel/opr/arm/benchmark_conv.cpp b/compiler/test/kernel/opr/arm/benchmark_conv.cpp index 29c5a63e..560d229c 100644 --- a/compiler/test/kernel/opr/arm/benchmark_conv.cpp +++ b/compiler/test/kernel/opr/arm/benchmark_conv.cpp @@ -124,13 +124,13 @@ TEST(AARCH64, BenchmarkConvNCHWNCHW44) { param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; benchmarker.set_param(param); - benchmarker.execs( - {{1, 3, 224, 224}, {8, 3, 3, 3, 4}, {1, 8, 1, 1, 4}, {}, {}}).print(); + benchmarker + .execs({{1, 3, 224, 224}, {8, 3, 3, 3, 4}, {1, 8, 1, 1, 4}, {}, {}}) + .print(); } TEST(AARCH64, BenchmarkConvF32Winograd) { Benchmarker benchmarker(Arch::ARM64); - benchmarker.set_kernel_symbol(".*_winograd_f23"); ConvBiasForward::Param param; param.pad_h = 1; @@ -140,25 +140,29 @@ TEST(AARCH64, BenchmarkConvF32Winograd) { param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; benchmarker.set_param(param); - benchmarker.set_before_exec_callback( - megdnn::test::AlgoChecker( - "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:24")); - for (size_t Channel : {32, 256}) { - for (size_t HW : {56, 28, 14}) { - auto result = - benchmarker.execs({{1, Channel / 4, HW, HW, 4}, - {Channel / 4, Channel / 4, 3, 3, 4, 4}, - {1, Channel / 4, 1, 1, 4}, - {}, - {}}); - printf("megcc result time = %f, throughput %f Gops, %f mbps\n", - result.megcc_performance.kernel_time_ms, - result.megcc_performance.compute_throughput_gops, - result.megcc_performance.memory_throughput_mbps); - printf("dnn result time = %f, throughput %f Gops, %f mbps\n", - result.dnn_performance.kernel_time_ms, - result.dnn_performance.compute_throughput_gops, - result.dnn_performance.memory_throughput_mbps); + std::vector> algo_pairs = { + {".*_winograd_f23", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:24"}, + {".*_winograd_f43", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:4:68"}, + {".*_winograd_f63", "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:6:16"}}; + + for (auto algo : algo_pairs) { + printf("megcc algo: %s VS megdnn algo: %s\n", algo[0].c_str(), + algo[1].c_str()); + for (size_t Channel : {32, 256}) { + for (size_t HW : {56, 28, 14}) { + 
benchmarker.set_kernel_symbol(algo[0]); + benchmarker.set_before_exec_callback( + megdnn::test::AlgoChecker( + algo[1].c_str())); + + auto result = benchmarker.execs( + {{1, Channel / 4, HW, HW, 4}, + {Channel / 4, Channel / 4, 3, 3, 4, 4}, + {1, Channel / 4, 1, 1, 4}, + {}, + {}}); + result.print(); + } } } } diff --git a/compiler/test/kernel/opr/arm/conv.cpp b/compiler/test/kernel/opr/arm/conv.cpp index ad1d4c9e..fad7aefd 100644 --- a/compiler/test/kernel/opr/arm/conv.cpp +++ b/compiler/test/kernel/opr/arm/conv.cpp @@ -343,53 +343,55 @@ TEST(AARCH64, ConvBiasNCHWNCHW44) { TEST(AARCH64, ConvWinogradNCHW44) { Checker checker(Arch::ARM64); - checker.set_kernel_symbol(".*_winograd_f23"); - checker.set_epsilon(1e-3); + checker.set_epsilon(1e-2); ConvBiasForward::Param param; param.stride_h = 1; param.stride_w = 1; param.compute_mode = ConvBiasForward::Param::ComputeMode::DEFAULT; param.format = ConvBiasForward::Param::Format::NCHW44; param.sparse = ConvBiasForward::Param::Sparse::DENSE; - - for (size_t Channel : {32, 64, 256}) { - for (size_t HW : {28, 14}) { - param.pad_h = 1; - param.pad_w = 1; - checker.set_param(param); - checker.execs({{1, Channel / 4, HW, HW, 4}, - {Channel / 4, Channel / 4, 3, 3, 4, 4}, - {1, Channel / 4, 1, 1, 4}, - {}, - {}}); + for (auto name : + {".*_winograd_f23", "^GI.*_winograd_f43.*", "^GI.*_winograd_f63.*"}) { + checker.set_kernel_symbol(name); + for (size_t Channel : {32, 64, 256}) { + for (size_t HW : {28, 14}) { + param.pad_h = 1; + param.pad_w = 1; + checker.set_param(param); + checker.execs({{1, Channel / 4, HW, HW, 4}, + {Channel / 4, Channel / 4, 3, 3, 4, 4}, + {1, Channel / 4, 1, 1, 4}, + {}, + {}}); + } } + // clang-format off + for(size_t P:{0, 1}) + for(size_t IC : {1, 3, 8}) + for(size_t OC : {1, 4}) + for(size_t IH: {3, 5, 22, 32}) + for(size_t IW : {22, 56}) + for(auto mode : {ConvBiasForward::Param::NonlineMode::IDENTITY, + ConvBiasForward::Param::NonlineMode::RELU, + ConvBiasForward::Param::NonlineMode::H_SWISH}) + // clang-format on + { + param.pad_h = P; + param.pad_w = P; + param.nonlineMode = mode; + checker.set_param(param); + checker.execs({{1, IC, IH, IW, 4}, + {OC, IC, 3, 3, 4, 4}, + {}, + {}, + {}}); + checker.execs({{2, IC, IH, IW, 4}, + {OC, IC, 3, 3, 4, 4}, + {1, OC, 1, 1, 4}, + {}, + {}}); + } } - - // clang-format off - for(size_t P:{0, 1}) - for(size_t IC : {1, 3, 8}) - for(size_t OC : {1, 4}) - for(size_t IH: {3, 5, 22, 32}) - for(size_t IW : {22, 56}) - for(auto mode : {ConvBiasForward::Param::NonlineMode::IDENTITY, - ConvBiasForward::Param::NonlineMode::RELU}) - // clang-format on - { - param.pad_h = P; - param.pad_w = P; - param.nonlineMode = mode; - checker.set_param(param); - checker.execs({{1, IC, IH, IW, 4}, - {OC, IC, 3, 3, 4, 4}, - {}, - {}, - {}}); - checker.execs({{2, IC, IH, IW, 4}, - {OC, IC, 3, 3, 4, 4}, - {1, OC, 1, 1, 4}, - {}, - {}}); - } } TEST(AARCH64, ConvBiasIm2col) { From 8b66bd4aab65700da6caff9860c147377cc2085a Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 19:05:26 +0800 Subject: [PATCH 15/17] feat(compiler): add gi max and min kernel --- .../GeneralIntrinsic/Elemwise/Elemwise.cpp | 3 +- .../ElemwiseHelper/BinaryHelper.cpp | 70 +++++++++++++++++++ .../ElemwiseHelper/ElemwiseHelper.cpp | 4 ++ .../ElemwiseHelper/ElemwiseHelper.h | 2 + .../generalIntrinsic/benchmark_elemwise.cpp | 7 +- .../opr/generalIntrinsic/benchmark_reduce.cpp | 3 +- .../test/kernel/opr/generalIntrinsic/cv.cpp | 5 +- .../opr/generalIntrinsic/elementwise.cpp | 2 +- runtime/include/lite-c/common_enum_c.h | 2 +- 
runtime/src/lite/network.c | 10 +-- 10 files changed, 94 insertions(+), 14 deletions(-) diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp index 5ffdd2c2..4f1390c1 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/Elemwise/Elemwise.cpp @@ -28,7 +28,8 @@ bool ElemwiseKernel::IsAvailable(TContext* ctx) const { bool mode_ok = mode == "RELU" || mode == "EXP" || mode == "SIGMOID" || mode == "H_SWISH" || mode == "ADD" || mode == "SUB" || mode == "MUL" || mode == "TRUE_DIV" || - mode == "FUSE_ADD_RELU" || mode == "FUSE_MUL_ADD3"; + mode == "FUSE_ADD_RELU" || mode == "FUSE_MUL_ADD3" || + mode == "MAX" || mode == "MIN"; if (mode == "FUSE_MUL_ADD3") { auto bcast_type = ElemwiseGenTernary::GetBcastType( ctx->getAttrOprand("operand:0"), diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp index 891474b6..547a1340 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/BinaryHelper.cpp @@ -573,6 +573,76 @@ std::string ElemwiseGenBinaryFuseAddRelu::GenKernelNaiveUnroll( return writer.str(); } +std::string ElemwiseGenBinaryMax::GenKernelSimdInit( + std::vector) const { + return ""; +} + +std::string ElemwiseGenBinaryMax::GenKernelSimdUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n GiStoreFloat32((" << dst << ") + 4 * " << i + << ", GiMaximumFloat32(" << strs[str_id] << "," << strs[str_id + 1] + << "));"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMax::GenKernelNaiveUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n(" << dst << ")[" << i << "] = (" << strs[str_id] << ")[" + << i << "] > (" << strs[str_id + 1] << ")[" << i << "] ?(" << strs[str_id] << ")[" << i <<"]:(" << strs[str_id + 1] << ")[" + << i << "] ;"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMin::GenKernelSimdInit( + std::vector) const { + return ""; +} + +std::string ElemwiseGenBinaryMin::GenKernelSimdUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n GiStoreFloat32((" << dst << ") + 4 * " << i + << ", GiMinimumFloat32(" << strs[str_id] << "," << strs[str_id + 1] + << "));"; + str_id += 2; + } + return writer.str(); +} + +std::string ElemwiseGenBinaryMin::GenKernelNaiveUnroll( + std::vector strs) const { + int unroll = std::stoi(strs[0]); + auto dst = strs[1]; + std::stringstream writer; + int str_id = 2; + for (int i = 0; i < unroll; i++) { + writer << "\n(" << dst << ")[" << i << "] = (" << strs[str_id] << ")[" + << i << "] < (" << strs[str_id + 1] << ")[" << i << "] ?(" << strs[str_id] << ")[" << i <<"]:(" << strs[str_id + 1] << ")[" + << i << "] ;"; + str_id += 2; + } + return writer.str(); +} + std::string ElemwiseGenBinary::GenCodeBody( std::vector strs) const { auto input0 = strs[0]; diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp 
b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp index 27dff585..32216d16 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.cpp @@ -44,6 +44,10 @@ std::shared_ptr ElemwiseHelperFunc::CreateGenHelper( operands[1]); CASE_DISPATCH_ARG("FUSE_ADD_RELU", ElemwiseGenBinaryFuseAddRelu, operands[0], operands[1]); + CASE_DISPATCH_ARG("MAX", ElemwiseGenBinaryMax, + operands[0], operands[1]); + CASE_DISPATCH_ARG("MIN", ElemwiseGenBinaryMin, + operands[0], operands[1]); CC_ABORT << "Binary mode: " << mode << " not Implement now\n"; } else if (nr_operands == 4) { CASE_DISPATCH_ARG("FUSE_MUL_ADD3", ElemwiseGenTernaryFuseMulAdd3, diff --git a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h index ee65ad94..8fe654d8 100644 --- a/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h +++ b/compiler/lib/KernelGen/GeneralIntrinsic/ElemwiseHelper/ElemwiseHelper.h @@ -157,6 +157,8 @@ DEFINE_BINARY_OP(ElemwiseGenBinarySub) DEFINE_BINARY_OP(ElemwiseGenBinaryMul) DEFINE_BINARY_OP(ElemwiseGenBinaryTrueDiv) DEFINE_BINARY_OP(ElemwiseGenBinaryFuseAddRelu) +DEFINE_BINARY_OP(ElemwiseGenBinaryMax) +DEFINE_BINARY_OP(ElemwiseGenBinaryMin) #undef DEFINE_BINARY_OP //! TODO: add more binary elemwise here /************************************Ternary***********************************/ diff --git a/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp b/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp index db2b6c16..2f57bf1e 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/benchmark_elemwise.cpp @@ -8,6 +8,7 @@ */ #include "test/kernel/common/benchmark.h" +#include "megbrain/reflection.h" using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; @@ -18,7 +19,7 @@ TEST(GI, ElementwiseUnique_BMK) { benchmarker.set_kernel_symbol("GI_kernel_elementwise.+"); ElemwiseForward::Param param; for (auto mode : {MODE::RELU, MODE::SIGMOID, MODE::EXP, MODE::H_SWISH}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); benchmarker.execs({{10000}, {}}).print(); @@ -36,7 +37,7 @@ TEST(GI, ElementwiseBinary_BMK) { ElemwiseForward::Param param; for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU, MODE::TRUE_DIV}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); benchmarker.execs({{10000}, {10000}, {}}).print(); @@ -53,7 +54,7 @@ TEST(GI, ElementwiseTernary_BMK) { ElemwiseForward::Param param; benchmarker.set_kernel_symbol("GI_kernel_elementwise.+"); for (auto mode : {MODE::FUSE_MUL_ADD3}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; benchmarker.set_param(param); //! 
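The MAX and MIN helpers added above are code generators: GenKernelSimdUnroll emits GiMaximumFloat32/GiMinimumFloat32 stores, while GenKernelNaiveUnroll emits a plain compare-and-select per element. As a reading aid, here is a small sketch, not MegCC code, that mirrors what the naive MAX path writes out; the function and argument names are invented.

```python
# Hypothetical mirror of GenKernelNaiveUnroll for MAX: given an unroll factor, a
# destination name and two source names, emit one C statement per element.
def gen_naive_max_unroll(unroll, dst, src0, src1):
    lines = []
    for i in range(unroll):
        lines.append(
            "({d})[{i}] = ({a})[{i}] > ({b})[{i}] ? ({a})[{i}] : ({b})[{i}];".format(
                d=dst, a=src0, b=src1, i=i))
    return "\n".join(lines)

print(gen_naive_max_unroll(4, "dst", "val0", "val1"))
```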
vec_vec diff --git a/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp b/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp index 760d18a1..28f8b2ea 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/benchmark_reduce.cpp @@ -8,6 +8,7 @@ */ #include "test/kernel/common/benchmark.h" +#include "megbrain/reflection.h" using namespace megdnn; using namespace megcc::test; using namespace megcc::KernelGen; @@ -17,7 +18,7 @@ TEST(GI, BENCHMARK_Reduce) { Benchmarker benchmarker(Arch::BAREMETAL); benchmarker.set_kernel_symbol("GI_kernel_reduce.*"); for (auto mode : {Mode::MIN, Mode::MAX, Mode::SUM, Mode::SUM_SQR, Mode::MEAN, Mode::PRODUCT}){ - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); for (auto src : {TensorShape{200, 300}, TensorShape{3, 200, 300}, TensorShape{1, 3, 200, 300}}){ for (size_t axis = 0; axis < 4; ++axis) { if (axis < src.ndim) { diff --git a/compiler/test/kernel/opr/generalIntrinsic/cv.cpp b/compiler/test/kernel/opr/generalIntrinsic/cv.cpp index 935cf9e7..7e354ecf 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/cv.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/cv.cpp @@ -9,6 +9,7 @@ #include "test/kernel/common/checker.h" #include "test/kernel/common/cv_opr.h" +#include "megbrain/reflection.h" using namespace megcc::test; using namespace megdnn; using namespace megcc::KernelGen; @@ -134,13 +135,13 @@ TEST(GI, CVcvtcolor) { checker.set_dtype(1, dtype::Uint8()); for (auto mode : {CvtMode::RGB2YUV, CvtMode::RGB2BGR}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; checker.set_param(param); checker.exec({{1, 17, 31, 3}, {}}); } for (auto mode : {CvtMode::YUV2BGR_NV21}) { - printf("mode=%d\n", mode); + printf("mode=%s\n", mgb::reflection::nameOfEnumValue(mode).c_str()); param.mode = mode; checker.set_param(param); checker.exec({{1, 3, 18, 1}, {}}); diff --git a/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp b/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp index 5bc3f59d..c9337640 100644 --- a/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp +++ b/compiler/test/kernel/opr/generalIntrinsic/elementwise.cpp @@ -33,7 +33,7 @@ TEST(GI, ElementwiseBinary) { checker.set_kernel_symbol("GI_kernel_elementwise.+"); ElemwiseForward::Param param; - for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU}) { + for (auto mode : {MODE::ADD, MODE::SUB, MODE::MUL, MODE::FUSE_ADD_RELU, MODE::MAX, MODE::MIN}) { param.mode = mode; checker.set_param(param); checker.execs({{1}, {1}, {}}); diff --git a/runtime/include/lite-c/common_enum_c.h b/runtime/include/lite-c/common_enum_c.h index 361b52d2..9178aee3 100644 --- a/runtime/include/lite-c/common_enum_c.h +++ b/runtime/include/lite-c/common_enum_c.h @@ -17,7 +17,7 @@ */ typedef enum LiteLogLevel { DEBUG = 0, /*!< The lowest level and most verbose */ - INFO = 1, /*!< The lowest level and most verbose */ + INFO = 1, /*!< print infos, warns and errors message */ WARN = 2, /*!< Print only warning and errors */ ERROR = 3, /*!< Print only errors */ } LiteLogLevel; diff --git a/runtime/src/lite/network.c b/runtime/src/lite/network.c index 35b6f6e0..b4523a49 100644 --- a/runtime/src/lite/network.c +++ b/runtime/src/lite/network.c @@ -200,7 +200,7 @@ int LITE_forward(const LiteNetwork network) { Layout in_layout = opr->inputs[0]->layout; Layout out_layout = opr->outputs[0]->layout; - 
LOG_ERROR( + LOG_INFO( " instruction: %s \nuse %fms \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)] \t" "[%d(%d), %d(%d), %d(%d), %d(%d), %d(%d)]\n", @@ -216,9 +216,9 @@ int LITE_forward(const LiteNetwork network) { out_layout.stride[4]); } else { - LOG_ERROR("execute used time %f ms of instruction %s.\n", - inst->time_ms / inst->time_count, - instruction_type_name(inst->tag)); + LOG_INFO("execute used time %f ms of instruction %s.\n", + inst->time_ms / inst->time_count, + instruction_type_name(inst->tag)); } #endif } @@ -361,7 +361,7 @@ int LITE_destroy_network(LiteNetwork network) { } FREE(cb_model->device_models); - //! free combine model struce + //! free combine model struct FREE(cb_model); return TinyNN_SUCCESS; } From b43b6e1f5483c982d3ac73e20a311c3b3955caa2 Mon Sep 17 00:00:00 2001 From: yuxiongxiong Date: Mon, 9 Jan 2023 19:17:43 +0800 Subject: [PATCH 16/17] feat(benchmark): add megcc benchmark --- README.md | 1 + benchmark/.gitignore | 4 + benchmark/CMakeLists.txt | 97 +++++++++++++++++ benchmark/README.md | 80 ++++++++++++++ benchmark/clean.sh | 3 + benchmark/main.cpp | 64 +++++++++++ benchmark/model/model_arm.json | 47 ++++++++ benchmark/model/model_riscv.json | 40 +++++++ benchmark/model/model_x86.json | 40 +++++++ benchmark/python/example.py | 157 +++++++++++++++++++++++++++ benchmark/python/format.sh | 22 ++++ benchmark/python/src/benchmark.py | 102 +++++++++++++++++ benchmark/python/src/models.py | 163 ++++++++++++++++++++++++++++ benchmark/src/CCbenchmark.cpp | 97 +++++++++++++++++ benchmark/src/CCbenchmark.h | 33 ++++++ benchmark/src/MGEbenchmark.cpp | 100 +++++++++++++++++ benchmark/src/MGEbenchmark.h | 38 +++++++ benchmark/src/benchmark.h | 25 +++++ benchmark/src/build_config.h.in | 10 ++ benchmark/tools/cc_analysis.py | 89 +++++++++++++++ benchmark/tools/inference_visual.py | 91 ++++++++++++++++ 21 files changed, 1303 insertions(+) create mode 100644 benchmark/.gitignore create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/README.md create mode 100755 benchmark/clean.sh create mode 100644 benchmark/main.cpp create mode 100644 benchmark/model/model_arm.json create mode 100644 benchmark/model/model_riscv.json create mode 100644 benchmark/model/model_x86.json create mode 100644 benchmark/python/example.py create mode 100755 benchmark/python/format.sh create mode 100644 benchmark/python/src/benchmark.py create mode 100644 benchmark/python/src/models.py create mode 100644 benchmark/src/CCbenchmark.cpp create mode 100644 benchmark/src/CCbenchmark.h create mode 100644 benchmark/src/MGEbenchmark.cpp create mode 100644 benchmark/src/MGEbenchmark.h create mode 100644 benchmark/src/benchmark.h create mode 100644 benchmark/src/build_config.h.in create mode 100644 benchmark/tools/cc_analysis.py create mode 100644 benchmark/tools/inference_visual.py diff --git a/README.md b/README.md index 5e055dd6..ce22036f 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ MegCC supports Arm64/ArmV7/X86/BareMatal backend. 
You may want to check [support * Download release compiler suit from [release page](https://github.com/MegEngine/MegCC/releases) * Compiler from source, please fellow the [compiler doc](compiler/README.md) * Build the release tar, please fellow the [release doc](doc/how-to-release.md) +* Get benchmark of different model please reference [benchmark](benchmark/README.md) #### How to use MegCC diff --git a/benchmark/.gitignore b/benchmark/.gitignore new file mode 100644 index 00000000..17e80af5 --- /dev/null +++ b/benchmark/.gitignore @@ -0,0 +1,4 @@ +model/benchmark_* +model/generated_models +config +output \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 00000000..797bb5b6 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,97 @@ +cmake_minimum_required(VERSION 3.15.2) +set(CMAKE_EXPORT_COMPILE_COMMANDS + ON + CACHE INTERNAL "") + +project(Benchmarker) + +option(ENABLE_MEGENGINE_FRAMEWORK "build benchmark for megengine" OFF) +configure_file(src/build_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/genfiles/build_config.h) +# set megcc lib +if(NOT DEFINED RUNTIME_KERNEL_DIR) + message(FATAL_ERROR "build MegCC runtime kernel dir RUNTIME_KERNEL_DIR is empty, use -DRUNTIME_KERNEL_DIR=your_model_kernel_dir to set") +else() + message(STATUS "build MegCC runtime with kernel dir ${RUNTIME_KERNEL_DIR}") +endif() + +add_library(TinyNN STATIC IMPORTED) +set_target_properties( + TinyNN PROPERTIES IMPORTED_LOCATION + "${RUNTIME_KERNEL_DIR}/runtime/install/lib/libTinyNN.a") +if(ENABLE_MEGENGINE_FRAMEWORK) + message(STATUS "build benchmark with megengine ${ENABLE_MEGENGINE_FRAMEWORK}") + option(X86_BACKEND "Build bechmarker with X86 megengine lib" ON) + # set megengine lib + if(NOT DEFINED MEGENGINE_INSTALL_DIR) + message(FATAL_ERROR "MEGENGINE_INSTALL_DIR is empty use -DMEGENGINE_INSTALL_DIR=your_megengine_install_dir to set") + else() + message(STATUS "MEGENGINE_INSTALL_DIR is ${MEGENGINE_INSTALL_DIR}") + endif() + add_library(mgb_imported INTERFACE IMPORTED) + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/aarch64/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/armv7/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/riscv64/liblite_static_all_in_one.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + else() + if(X86_BACKEND) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") + set(MKL_LIBS + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_core.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_sequential.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_intel_ilp64.a + ) + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/x86_64/liblite_static_all_in_one.a) + target_compile_definitions(mgb_imported INTERFACE -DMKL_ILP64) + # WARNING: i386 is not test locally + elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386" OR ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i686") + set(MKL_LIBS + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/i386/lib/libmkl_core.a + 
${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_sequential.a + ${PROJECT_SOURCE_DIR}/../third_party/MegEngine/third_party/mkl/x86_64/lib/libmkl_intel_32.a + ) + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lite/lib/i386/liblite_static_all_in_one.a) + endif() + set(MKL_DNN_LIBS + ${MEGENGINE_INSTALL_DIR}/lib/libdnnl.a + ${MEGENGINE_INSTALL_DIR}/lib/libmkldnn.a + ) + + if(UNIX AND NOT APPLE) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS} ${MKL_DNN_LIBS} -Wl,--start-group -ldl ${MKL_LIBS} -Wl,--end-group) + else() + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS} ${MKL_DNN_LIBS} ${MKL_LIBS}) + endif() + else() + set(MGE_INSTALL_LIBS ${MEGENGINE_INSTALL_DIR}/lib/libmegengine.a ${MEGENGINE_INSTALL_DIR}/lib/libflatbuffers.a) + target_link_libraries(mgb_imported INTERFACE ${MGE_INSTALL_LIBS}) + endif() + + endif() + + target_include_directories(mgb_imported INTERFACE ${MEGENGINE_INSTALL_DIR}/include) +endif() +# benchmarker config +file(GLOB_RECURSE SOURCES main.cpp src/*.cpp src/*.h) +add_executable(benchmarker ${SOURCES}) +target_include_directories( + benchmarker PUBLIC $ $) +if(ENABLE_MEGENGINE_FRAMEWORK) + target_link_libraries(benchmarker -pthread TinyNN mgb_imported) +else() +target_link_libraries(benchmarker -pthread TinyNN) +endif() +message(STATUS "${CMAKE_TOOLCHAIN_FILE}") +if(CMAKE_TOOLCHAIN_FILE) + if(ANDROID) + target_link_libraries(benchmarker log) + endif() +endif() + +install(TARGETS benchmarker LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}) diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000..192569de --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,80 @@ +# How to use megcc benchmark + +## introduction +megcc benchmark is a easy tool to get the benchmark result of different model in megcc +the file struction is shown bellow: +``` +├── clean.sh +├── CMakeLists.txt +├── main.cpp +├── model +│ ├── model_arm.json +│ ├── model_riscv.json +│ ├── model_x86.json +│ └── request.txt +├── python +│ ├── example.py +│ ├── format.sh +│ └── src +│ ├── benchmark.py +│ └── models.py +├── README.md +├── src +│ ├── benchmark.h +│ ├── build_config.h.in +│ ├── CCbenchmark.cpp +│ ├── CCbenchmark.h +│ ├── MGEbenchmark.cpp +│ └── MGEbenchmark.h +└── tools + ├── cc_analysis.py + └── inference_visual.py +``` + +in src, it is a c++ application to run benchmark result on different platform. +in python, the model convertion, other related preparing work and the benchmarker example is given +the tools contains some usable scripts to analysis benchmark results +## supported model +mobilenetv2, resnet18, efficientnetb0 shufflenetv2 vgg16 +## request +```bash +mgeconvert > v.1.0.2 +onnx==1.11.0 +torch==1.10.0 +# or +git clone https://github.com/MegEngine/mgeconvert.git +cd mgeconvert +git checkout master +python3 -m pip install . --user --install-option="--targets=onnx" + +``` +the mgeconvert can be install by following command: +```bash +git clone https://github.com/MegEngine/mgeconvert.git +cd mgeconvert +git checkout master +python3 -m pip install . 
--user --install-option="--targets=onnx" + +``` +## get model and run benchmark example +``` bash +cd megcc/benchmark +export MEGCC_MGB_TO_TINYNN_PATH= +python3 python/example.py +``` +if you want to run in other platform, please reference the example add your new run_platform_xxx function in BenchmarkRunner, +the example given a ssh remote device test template + +## analysis megcc log + +the `output` directory is generated by `example.py` + +### visualize the inference result of different model +```bash +python3 benchmark/tools/inference_visual.py benchmark/output -o figure_dir +``` + +### visualize the profile result of different kernel in different model +```bash +python3 benchmark/tools/cc_analysis.py benchmark/output -o figure_dir +``` \ No newline at end of file diff --git a/benchmark/clean.sh b/benchmark/clean.sh new file mode 100755 index 00000000..61110f8f --- /dev/null +++ b/benchmark/clean.sh @@ -0,0 +1,3 @@ +# /bin/bash -e +set -x +rm -rf ./build* ./output ./config ./model/benchmark* ./model/generate* diff --git a/benchmark/main.cpp b/benchmark/main.cpp new file mode 100644 index 00000000..e124fa2f --- /dev/null +++ b/benchmark/main.cpp @@ -0,0 +1,64 @@ +/** + * \file benchmark/main.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include +#include +#include +#include "src/CCbenchmark.h" +#include "src/MGEbenchmark.h" + +using namespace megcc; +using namespace Benchmark; +int main(int argc, char** argv) { + if (argc < 2 && argc > 4) { + fprintf(stderr, "cmdline error, please run with:\n"); + fprintf(stderr, "benchmarker [options] ... \n"); + fprintf(stderr, + "tips:\n\t you can use --profile and --mge to profile model " + "and enable megengine framework (\"megcc\" is default)\n"); + return -1; + } + int log_level = 3; + std::string framework = "megcc"; + std::string model_path = argv[1]; + int idx = 2; + while (idx < argc) { + std::string args = argv[idx]; + if (args == "--profile") { + log_level = 0; + } else if (args == "--mge") { + framework = "mge"; + } else { + fprintf(stderr, "invalid option: %s\n", argv[idx]); + } + ++idx; + } + std::vector> benchmarkers; + if (framework == "megcc") { + benchmarkers.push_back( + std::make_shared(model_path, log_level)); + } +#if ENABLE_MEGENGINE_FRAMEWORK + else if (framework == "mge") { + benchmarkers.push_back( + std::make_shared(model_path, log_level)); + } +#endif + else { + fprintf(stderr, + "unsupport framework: %s, megcc, mge(export " + "ENABLE_MEGENGINE_FRAMEWORK=ON) is supported\n", + framework.c_str()); + } + + for (size_t i = 0; i < benchmarkers.size(); ++i) { + benchmarkers[i]->load_model(); + benchmarkers[i]->profile(); + } + + return 0; +} \ No newline at end of file diff --git a/benchmark/model/model_arm.json b/benchmark/model/model_arm.json new file mode 100644 index 00000000..34103980 --- /dev/null +++ b/benchmark/model/model_arm.json @@ -0,0 +1,47 @@ +{ + "dump_dir": "./benchmark_kernel_arm/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": 
"efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)", + "enable_nchw44": true + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)", + "enable_nchw44": true + } + ] +} \ No newline at end of file diff --git a/benchmark/model/model_riscv.json b/benchmark/model/model_riscv.json new file mode 100644 index 00000000..6d00992f --- /dev/null +++ b/benchmark/model/model_riscv.json @@ -0,0 +1,40 @@ +{ + "dump_dir": "./benchmark_kernel_riscv/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)" + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)" + } + ] +} \ No newline at end of file diff --git a/benchmark/model/model_x86.json b/benchmark/model/model_x86.json new file mode 100644 index 00000000..a93077e3 --- /dev/null +++ b/benchmark/model/model_x86.json @@ -0,0 +1,40 @@ +{ + "dump_dir": "./benchmark_kernel_x86/", + "models": [ + { + "model_name": "mobilenetv2", + "model_path": "./generated_models/mobilenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet18", + "model_path": "./generated_models/resnet18.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "resnet50", + "model_path": "./generated_models/resnet50.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "efficientnetb0", + "model_path": "./generated_models/efficientnetb0.mge", + "input_shape_str": "data=(1,3,256,256)" + }, + { + "model_name": "shufflenetv2", + "model_path": "./generated_models/shufflenetv2.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg11", + "model_path": "./generated_models/vgg11.mge", + "input_shape_str": "data=(1,3,224,224)" + }, + { + "model_name": "vgg16", + "model_path": "./generated_models/vgg16.mge", + "input_shape_str": "data=(1,3,224,224)" + } + ] +} \ No newline at end of file diff --git a/benchmark/python/example.py b/benchmark/python/example.py new file mode 100644 index 00000000..bed2fcd6 --- /dev/null +++ b/benchmark/python/example.py @@ -0,0 +1,157 @@ +#! 
/usr/bin/env python3 +import os + +import numpy as np +import yaml +from src.benchmark import BenchMarkRunnerBase, ValidModel, ValidOutputDir +from src.models import * + +all_models = AllModel() +arch_str=["x86", "arm64", "armv7"] +arch_str=["x86"] +framework_str = ["megcc"] +models_dir = "{}/benchmark/model/generated_models".format(megcc_path) +bechmarkers = {} +kernel_build_dirs = {} +# set as your own ssh device host and workdir(make sure install sshd and rsync on your device) +ssh_device_info=[ +{"name":"","host": "", "workdir": ""} +] + + +class BenchmarkRunner(BenchMarkRunnerBase): + remote_config = None + remote_config_file = "{}/benchmark/config/cofnig.yaml".format(megcc_path) + + def __init__(self, benchmark_build_dir="", benchmark_arch="x86"): + super().__init__(benchmark_build_dir, benchmark_arch) + + def run_ssh_device(self, ssh_name, ssh_host, ssh_workdir): + if not os.path.exists(self.output_dir.local_path) or os.path.isfile( + self.output_dir.local_path + ): + os.makedirs(self.output_dir.local_path) + logfile = open( + "{}/{}-{}-{}-{}-log-{}.txt".format( + self.output_dir.local_path, + self.benchmark_framework, + self.benchmark_arch, + self.model.name, + self.log_level, + ssh_name, + ), + "w", + ) + run_options = "" + if self.log_level == 0: + run_options += " --profile" + if self.benchmark_framework == "mge": + run_options += " --mge" + config_name = "benchmark-{}-{}-{}".format( + self.benchmark_framework, self.benchmark_arch, self.model.name + ) + for file_ in [self.benchmark_exec_func, self.model.path]: + cmd = "rsync -aP -zz {} {}:{}/".format( + file_, ssh_host, ssh_workdir + ) + subprocess.check_call(cmd, shell=True) + cmd = ' ssh -t {} "unset LD_PRELOAD && cd {} && LD_LIBRARY_PATH=./ && chmod +x ./benchmarker && ./benchmarker {}.{} {}" '.format( + ssh_host, ssh_workdir, self.model.name, self.model.exten, run_options + ) + subprocess.check_call(cmd, shell=True, stdout=logfile, stderr=subprocess.STDOUT) + + +def build_model_and_megcc_lib(): + #! dump all models from onnx to megengine + all_models.make(models_dir) + #! prepare megcc compiler + prepare_megcc() + #! build megcc model lib + for arch_desc in arch_str: + build_megcc_lib(arch_desc, model_config_json="", kernel_build_dir="") + + +#! 
build benchmarker +def gen_benchmarker(): + for arch_desc in arch_str: + benchmark_build_dir = "{}/benchmark/build/{}".format(megcc_path, arch_desc) + kernel_build_dirs[arch_desc] = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + benchmarker = BenchmarkRunner( + benchmark_build_dir=benchmark_build_dir, benchmark_arch=arch_desc + ) + bechmarkers[arch_desc] = benchmarker + + +def build_benchmarker(x86_target="fallback"): + for arch_desc in arch_str: + benchmark_build_dir = "{}/benchmark/build/{}".format(megcc_path, arch_desc) + if arch_desc == "x86": + build_option = "-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PWD/install -DRUNTIME_KERNEL_DIR={}".format( + kernel_build_dirs[arch_desc] + ) + else: + if arch_desc == "arm64": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="$NDK_ROOT/build/cmake/android.toolchain.cmake" -DANDROID_NDK="$NDK_ROOT" -DANDROID_ABI=arm64-v8a -DANDROID_NATIVE_API_LEVEL=21' + elif arch_desc == "armv7": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="$NDK_ROOT/build/cmake/android.toolchain.cmake" -DANDROID_NDK="$NDK_ROOT" -DANDROID_ABI=armeabi-v7a -DANDROID_NATIVE_API_LEVEL=21' + elif arch_desc == "riscv": + TOOLCHAIN_OPTION = '-DCMAKE_TOOLCHAIN_FILE="{}/runtime/toolchains/riscv64-linux-gnu.toolchain.cmake"'.format( + megcc_path + ) + build_option = "{} -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PWD/install -DRUNTIME_KERNEL_DIR={}".format( + TOOLCHAIN_OPTION, kernel_build_dirs[arch_desc] + ) + + bechmarkers[arch_desc].build(build_options=build_option) + + +# !set test config and run +def set_config_and_run(): + for arch_desc in arch_str: + kernel_build_dir = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + for model in all_models.models: + for framework in framework_str: + for log_level in [False, True]: + if framework == "megcc": + exten = "tiny" + model_path = "{}/{}.tiny".format(kernel_build_dir, model.name) + elif framework == "mge": + model_path = "{}/{}.mge".format(models_dir, model.name) + exten = "mge" + model_ = ValidModel(model_path, model.name, exten) + output_dir_ = ValidOutputDir( + "{}/benchmark/output".format(megcc_path), "output" + ) + bechmarkers[arch_desc].set_config( + profile_kernel=log_level, + benchmark_framework=framework, + model=model_, + output_dir=output_dir_, + ) + if arch_desc == "x86": + bechmarkers[arch_desc].run_local() + elif arch_desc != "riscv": + # run for different device may avoid the effection of device heat radiation + for ssh_device in ssh_device_info: + ssh_name=ssh_device["name"] + ssh_host=ssh_device["host"] + ssh_workdir=ssh_device["workdir"] + bechmarkers[arch_desc].run_ssh_device(ssh_name, ssh_host, ssh_workdir) + else: + print("unsupported arch type in megcc") + return + + +def main(): + build_model_and_megcc_lib() + gen_benchmarker() + build_benchmarker() + set_config_and_run() + + +if __name__ == "__main__": + main() diff --git a/benchmark/python/format.sh b/benchmark/python/format.sh new file mode 100755 index 00000000..49e1b340 --- /dev/null +++ b/benchmark/python/format.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -e +cd $(dirname $0) + +ISORT_ARG="" +BLACK_ARG="" + +while getopts 'd' OPT; do + case $OPT in + d) + ISORT_ARG="--diff --check-only" + BLACK_ARG="--diff --check" + ;; + ?) + echo "Usage: `basename $0` [-d]" + esac +done + +isort $ISORT_ARG -j $(nproc) . +black $BLACK_ARG --target-version=py35 . 
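The benchmark README suggests adding a run_platform_xxx method alongside run_ssh_device for devices that are not reachable over ssh/rsync. Below is a hedged sketch of an adb-based variant; the benchmarker command line (model path plus optional --profile/--mge) follows main.cpp in this patch, while the function name, serial handling and on-device working directory are assumptions.

```python
# Standalone sketch, assuming an Android device reachable via adb.
import subprocess

def run_adb_device(serial, benchmarker_path, model_path,
                   workdir="/data/local/tmp/megcc_bench",
                   profile=False, use_mge=False):
    opts = (" --profile" if profile else "") + (" --mge" if use_mge else "")
    subprocess.check_call(["adb", "-s", serial, "shell", "mkdir", "-p", workdir])
    for f in (benchmarker_path, model_path):
        subprocess.check_call(["adb", "-s", serial, "push", f, workdir + "/"])
    model_name = model_path.rsplit("/", 1)[-1]
    cmd = "cd {} && chmod +x ./benchmarker && ./benchmarker ./{}{}".format(
        workdir, model_name, opts)
    subprocess.check_call(["adb", "-s", serial, "shell", cmd])
```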
+isort $ISORT_ARG -j $(nproc) ../tools +black $BLACK_ARG --target-version=py35 ../tools diff --git a/benchmark/python/src/benchmark.py b/benchmark/python/src/benchmark.py new file mode 100644 index 00000000..66b52a97 --- /dev/null +++ b/benchmark/python/src/benchmark.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +import os +import subprocess +from pathlib import Path + +import numpy as np + +megcc_path = Path( + os.path.split(os.path.realpath(__file__))[0] +).parent.parent.parent.absolute() + + +class ValidModel: + path = "" + name = "" + exten = "" + + def __init__(self, model_path="", model_name="", exten=""): + self.path = model_path + self.name = model_name + self.exten = exten + + +class ValidOutputDir: + local_path = "" + remote_path = "" + tag = "" + + def __init__(self, local_path="", remote_path=""): + self.local_path = local_path + self.remote_path = remote_path + + +class BenchMarkRunnerBase: + model = None + benchmark_build_dir = "" + benchmark_arch = "" + benchmark_framework = "" + output_dir = None + log_level = -1 + benchmark_exec_func = "" + + def __init__(self, benchmark_build_dir="", benchmark_arch="x86"): + if benchmark_build_dir == "": + benchmark_build_dir = "{}/benchmark/build_{}".format( + megcc_path, benchmark_arch + ) + self.benchmark_build_dir = benchmark_build_dir + self.benchmark_arch = benchmark_arch + + def build(self, x86_target="fallback", build_options=""): + # build prepare + if not os.path.exists(self.benchmark_build_dir) or os.path.isfile( + self.benchmark_build_dir + ): + os.makedirs(self.benchmark_build_dir) + # build megengine lib and set cmake build options + cmd = "cd {} && cmake {}/benchmark {} -G Ninja && ninja install/strip".format( + self.benchmark_build_dir, megcc_path, build_options + ) + subprocess.check_call(cmd, shell=True) + + def set_config( + self, + profile_kernel=False, + benchmark_framework="megcc", + model=None, + output_dir=None, + ): + if profile_kernel: + self.log_level = 0 + else: + self.log_level = 3 + self.benchmark_framework = benchmark_framework + self.output_dir = output_dir + self.model = model + self.benchmark_exec_func = "{}/install/bin/benchmarker".format( + self.benchmark_build_dir + ) + + def run_local(self): + if not os.path.exists(self.output_dir.local_path) or os.path.isfile( + self.output_dir.local_path + ): + os.makedirs(self.output_dir.local_path) + logfile = open( + "{}/{}-{}-{}-{}-log-local.txt".format( + self.output_dir.local_path, + self.benchmark_framework, + self.benchmark_arch, + self.model.name, + self.log_level, + ), + "w", + ) + run_options = "" + if self.log_level == 0: + run_options += " --profile" + if self.benchmark_framework == "mge": + run_options += " --mge" + cmd = "{} {} {}".format(self.benchmark_exec_func, self.model.path, run_options) + subprocess.check_call(cmd, shell=True, stdout=logfile, stderr=subprocess.STDOUT) diff --git a/benchmark/python/src/models.py b/benchmark/python/src/models.py new file mode 100644 index 00000000..6547ad01 --- /dev/null +++ b/benchmark/python/src/models.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +import logging +import os +import subprocess +from pathlib import Path + +import numpy as np +import torch.onnx +import torchvision +from mgeconvert.converters.onnx_to_mge import * + +megcc_path = Path( + os.path.split(os.path.realpath(__file__))[0] +).parent.parent.parent.absolute() +default_gen_path = "{}/benchmark/model/generated_models".format(megcc_path) + + +class Model: + name = None + torch_model = None + input_shape = [] + + def __init__(self, name, 
torch_model, input_shape): + self.name = name + self.torch_model = torch_model + self.input_shape = input_shape + + +class AllModel: + models = [] + # model src from onnx + def __init__(self): + # pytorch model + self.models.append( + Model( + "mobilenetv2", + torchvision.models.mobilenetv2.mobilenet_v2(), + [1, 3, 224, 224], + ) + ) + self.models.append( + Model( + "efficientnetb0", + torchvision.models.efficientnet.efficientnet_b0(), + [1, 3, 256, 256], + ) + ) + self.models.append( + Model( + "shufflenetv2", + torchvision.models.shufflenetv2.shufflenet_v2_x0_5(), + [1, 3, 224, 224], + ) + ) + self.models.append( + Model("resnet18", torchvision.models.resnet.resnet18(), [1, 3, 224, 224]) + ) + self.models.append( + Model("resnet50", torchvision.models.resnet.resnet50(), [1, 3, 224, 224]) + ) + self.models.append( + Model("vgg11", torchvision.models.vgg.vgg11(), [1, 3, 224, 224]) + ) + self.models.append( + Model("vgg16", torchvision.models.vgg.vgg16(), [1, 3, 224, 224]) + ) + + def get_all_onnx_models(self, output_dir=default_gen_path): + if not os.path.exists(output_dir) or os.path.isfile(output_dir): + os.makedirs(output_dir) + for model in self.models: + output = "{}/{}.onnx".format(output_dir, model.name) + logging.debug("get model file from torchvision to: {}".format(output)) + net = model.torch_model + net.eval() + input_data = torch.randn(model.input_shape) + torch.onnx.export( + net, + input_data, + output, + export_params=True, + opset_version=12, + input_names=["data"], + output_names=["ret"], + ) + + def convert_to_mge(self, output_dir=default_gen_path): + for model in self.models: + input = "{}/{}.onnx".format(output_dir, model.name) + output = "{}/{}.mge".format(output_dir, model.name) + onnx_to_mge(input, output) + + def make(self, model_dir=""): + if model_dir != "": + self.get_all_onnx_models(model_dir) + self.convert_to_mge(model_dir) + else: + self.get_all_onnx_models() + self.convert_to_mge() + + +def prepare_megcc(): + # build prepare + MEGCC_MGB_TO_TINYNN_PATH = os.environ.get("MEGCC_MGB_TO_TINYNN_PATH") + assert ( + len(MEGCC_MGB_TO_TINYNN_PATH) != 0 + ), "MEGCC_MGB_TO_TINYNN_PATH is not valid, please export MEGCC_MGB_TO_TINYNN_PATH to your path of mgb_to_tinynn" + + +def build_megcc_lib(arch_desc="x86", model_config_json="", kernel_build_dir=""): + MEGCC_MGB_TO_TINYNN_PATH = os.environ.get("MEGCC_MGB_TO_TINYNN_PATH") + # build prepare + change_dir = "" + if model_config_json == "": + arch_ = arch_desc + if arch_desc == "arm64" or arch_desc == "armv7": + arch_ = "arm" + model_config_json = "{}/benchmark/model/model_{}.json".format(megcc_path, arch_) + if kernel_build_dir == "": + # WARNING: the dir path should be the same with path set in model_config_json file + kernel_build_dir = "{}/benchmark/model/benchmark_kernel_{}".format( + megcc_path, arch_desc + ) + change_dir = "cd {}/benchmark/model".format(megcc_path) + if not os.path.exists(kernel_build_dir) or os.path.isfile(kernel_build_dir): + os.makedirs(kernel_build_dir) + # set runtime build options + if arch_desc == "x86": + arch = "--baremetal" + runtime_flag = "" + elif arch_desc == "arm64": + arch = "--arm64" + runtime_flag = "--cross_build --cross_build_target_arch aarch64 --cross_build_target_os ANDROID" + elif arch_desc == "armv7": + arch = "--armv7" + runtime_flag = "--cross_build --cross_build_target_arch armv7-a --cross_build_target_os ANDROID " + elif arch_desc == "riscv": + arch = "--baremetal" + runtime_flag = "--cross_build --cross_build_target_arch rv64gcv0p7 --cross_build_target_os LINUX" + + # 
convert model + if len(change_dir) != 0: + cmd = "{} && {}/mgb-to-tinynn -json={} {} --dump {}".format( + change_dir, + MEGCC_MGB_TO_TINYNN_PATH, + model_config_json, + arch, + kernel_build_dir, + ) + else: + cmd = "{}/mgb-to-tinynn -json={} {} --dump {}".format( + change_dir, + MEGCC_MGB_TO_TINYNN_PATH, + model_config_json, + arch, + kernel_build_dir, + ) + subprocess.check_call(cmd, shell=True) + # build runtime + cmd = "python3 {}/runtime/scripts/runtime_build.py --build_with_profile --kernel_dir {}/ --remove_old_build {}".format( + megcc_path, kernel_build_dir, runtime_flag + ) + subprocess.check_call(cmd, shell=True) diff --git a/benchmark/src/CCbenchmark.cpp b/benchmark/src/CCbenchmark.cpp new file mode 100644 index 00000000..23b8e4c1 --- /dev/null +++ b/benchmark/src/CCbenchmark.cpp @@ -0,0 +1,97 @@ +/** + * \file benchmark/src/CCbenchmark.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "CCbenchmark.h" +#include +#include +#include +#include +#include +#include "lite-c/common_enum_c.h" +#include "lite-c/global_c.h" +#include "lite-c/tensor_c.h" +const int number = 50; +const int warmup = 10; + +#define LITE_CAPI_CHECK(error_, msg_) \ + if (error_) { \ + printf(msg_); \ + LITE_destroy_network(m_model); \ + __builtin_trap(); \ + } + +#define EXAMPLE_ASSERT(exp_, ...) \ + if (!(exp_)) { \ + printf("" __VA_ARGS__); \ + __builtin_trap(); \ + } + +using namespace megcc; +using namespace Benchmark; + +#if TINYNN_CALLBACK_ENABLE +#include +#include "tinynn_callback.h" +static void test_timeimp(int32_t* sec, int32_t* usec) { + struct timeval t; + gettimeofday(&t, NULL); + *sec = t.tv_sec; + *usec = t.tv_usec; +} +static TinyNnCallBack g_cb = { + .tinynn_log_cb = printf, + .tinynn_gettime_cb = test_timeimp, + .tinynn_malloc_cb = malloc, + .tinynn_free_cb = free, + .tinynn_fopen_cb = fopen, + .tinynn_ftell_cb = ftell, + .tinynn_fseek_cb = fseek, + .tinynn_fclose_cb = fclose, + .tinynn_fwrite_cb = fwrite, + .tinynn_fread_cb = fread, +}; +#endif + +/////////////////// CCBenchmarker //////////////// +void CCBenchmarker::load_model() { +#if TINYNN_CALLBACK_ENABLE + register_tinynn_cb(TINYNN_CB_VERSION, g_cb); +#endif + LITE_CAPI_CHECK(LITE_make_network(&m_model, *default_config(), + *default_network_io()), + "create model error. \n"); + + LITE_CAPI_CHECK(LITE_load_model_from_path(m_model, m_model_path.c_str()), + "load model error. 
\n"); +} + +void CCBenchmarker::profile() { + for (int i = 0; i < warmup; i++) { + LITE_CAPI_CHECK(LITE_forward(m_model), "run model failed\n"); + LITE_CAPI_CHECK(LITE_wait(m_model), "wait model failed\n"); + } + + struct timeval start; + struct timeval end; + gettimeofday(&start, NULL); + for (int i = 0; i < number; i++) { + LITE_CAPI_CHECK(LITE_forward(m_model), "run model failed\n"); + LITE_CAPI_CHECK(LITE_wait(m_model), "wait model failed\n"); + } + gettimeofday(&end, NULL); + + unsigned long diff = + 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + float average_time = ((float)diff) / number / 1000; + if (m_log_level == 3) { + printf("the inference average time=%.3f ms\n", average_time); + } +} + +CCBenchmarker::~CCBenchmarker() { + LITE_CAPI_CHECK(LITE_destroy_network(m_model), "delete model failed\n"); +} \ No newline at end of file diff --git a/benchmark/src/CCbenchmark.h b/benchmark/src/CCbenchmark.h new file mode 100644 index 00000000..a0e81792 --- /dev/null +++ b/benchmark/src/CCbenchmark.h @@ -0,0 +1,33 @@ +/** + * \file benchmark/src/CCbenchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include +#include +#include "benchmark.h" +#include "lite-c/global_c.h" +#include "lite-c/network_c.h" +namespace megcc { +namespace Benchmark { +class CCBenchmarker final : public Benchmarker { +public: + CCBenchmarker(std::string model, int log_level) + : m_model_path(model), m_log_level(log_level) { + LITE_set_log_level(static_cast(log_level)); + }; + virtual void load_model() override; + virtual void profile() override; + ~CCBenchmarker(); + +private: + int m_log_level; + std::string m_model_path; + LiteNetwork m_model; +}; +} // namespace Benchmark + +} // namespace megcc diff --git a/benchmark/src/MGEbenchmark.cpp b/benchmark/src/MGEbenchmark.cpp new file mode 100644 index 00000000..6ba26b35 --- /dev/null +++ b/benchmark/src/MGEbenchmark.cpp @@ -0,0 +1,100 @@ +/** + * \file benchmark/src/MGEbenchmark.cpp + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#include "MGEbenchmark.h" +#if ENABLE_MEGENGINE_FRAMEWORK +#include +#include +#include +#include +#include +#include "megbrain/gopt/inference.h" +using namespace mgb; +using namespace megcc; +using namespace Benchmark; +const int number = 50; +const int warmup = 10; + +void MGEBenchmarker::load_model() { + std::unique_ptr inp_file = + serialization::InputFile::make_fs(m_model_path.c_str()); + auto format = + serialization::GraphLoader::identify_graph_dump_format(*inp_file); + mgb_assert(format.valid(), "invalid model: unknown model format"); + auto loader = + serialization::GraphLoader::make(std::move(inp_file), format.val()); + if (m_log_level == 0) { + m_profiler = std::move(std::make_unique( + m_load_config.comp_graph.get())); + } else { + m_load_config.comp_graph->options().comp_node_seq_record_level = 1; + } + m_load_config.comp_graph->options().var_sanity_check_first_run = false; + m_load_config.comp_graph->options() + .graph_opt.enable_fuse_conv_bias_nonlinearity(); + m_load_config.comp_graph->options().graph_opt.enable_weight_preprocess(); + + m_model = loader->load(m_load_config, false); +} + +void MGEBenchmarker::profile() { + //! 
optimize for inference + auto& output_vars = m_model.output_var_list; + + using Strategy = + mgb::opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + auto strategy = static_cast(0); + strategy = Strategy::PROFILE | Strategy::OPTIMIZED | strategy; + mgb::gopt::modify_opr_algo_strategy_inplace(output_vars, strategy); + mgb::gopt::OptimizeForInferenceOptions opt_for_inference; +#ifdef __ANDROID__ +#if __ARM_FEATURE_DOTPROD + opt_for_inference.enable_nchw44_dot(); +#else + opt_for_inference.enable_nchw44(); +#endif +#else + output_vars = mgb::gopt::layout_transform( + output_vars, mgb::gopt::GraphTuningOptions::Target::CPU); +#endif + size_t nr_output = output_vars.size(); + + output_vars = + mgb::gopt::optimize_for_inference(output_vars, opt_for_inference); + std::vector> input_map_vec; + auto cg = m_model.output_var_list[0].node()->owner_graph(); + for (auto&& i : output_vars) { + mgb::ComputingGraph::Callback cb; + m_output_spec.emplace_back(i, std::move(cb)); + } + m_func = cg->compile(m_output_spec); + struct timeval start; + struct timeval end; + gettimeofday(&start, NULL); + for (int i = 0; i < warmup; ++i) { + m_func->execute().wait(); + } + gettimeofday(&end, NULL); + unsigned long diff = + 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + + gettimeofday(&start, NULL); + for (int i = 0; i < number; ++i) { + m_func->execute().wait(); + } + gettimeofday(&end, NULL); + diff = 1000000 * (end.tv_sec - start.tv_sec) + end.tv_usec - start.tv_usec; + float average_time = ((float)diff) / number / 1000; + if (m_log_level == 0) { + std::string profile_ret; + m_profiler->to_json_full(m_func.get())->writeto(profile_ret, 4); + printf("%s\n", profile_ret.c_str()); + } else { + printf("the inference average time=%.3f ms\n", average_time); + } +} +#endif \ No newline at end of file diff --git a/benchmark/src/MGEbenchmark.h b/benchmark/src/MGEbenchmark.h new file mode 100644 index 00000000..c4caf24d --- /dev/null +++ b/benchmark/src/MGEbenchmark.h @@ -0,0 +1,38 @@ +/** + * \file benchmark/src/MGEbenchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. + */ +#pragma once +#include "build_config.h" +#if ENABLE_MEGENGINE_FRAMEWORK +#include +#include +#include "benchmark.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/serializer.h" +namespace megcc { +namespace Benchmark { +class MGEBenchmarker final : public Benchmarker { +public: + MGEBenchmarker(std::string model, int log_level) + : m_model_path(model), m_log_level(log_level) { + m_load_config.comp_graph = mgb::ComputingGraph::make(); + }; + virtual void load_model() override; + virtual void profile() override; + +private: + int m_log_level; + std::string m_model_path; + mgb::serialization::GraphLoadConfig m_load_config; + mgb::serialization::GraphLoader::LoadResult m_model; + std::unique_ptr m_func; + std::unique_ptr m_profiler; + mgb::cg::ComputingGraph::OutputSpec m_output_spec; +}; +} // namespace Benchmark +} // namespace megcc +#endif \ No newline at end of file diff --git a/benchmark/src/benchmark.h b/benchmark/src/benchmark.h new file mode 100644 index 00000000..1a6172fe --- /dev/null +++ b/benchmark/src/benchmark.h @@ -0,0 +1,25 @@ +/** + * \file benchmark/src/benchmark.h + * + * This file is part of MegCC, a deep learning compiler developed by Megvii. + * + * \copyright Copyright (c) 2021-2022 Megvii Inc. All rights reserved. 
+ */ +#pragma once +#include +#include +namespace megcc { +namespace Benchmark { +/** + * Benchmarker interface + * + */ +class Benchmarker { +public: + virtual void load_model() = 0; + virtual void profile() = 0; + virtual ~Benchmarker() = default; +}; +} // namespace Benchmark + +} // namespace megcc diff --git a/benchmark/src/build_config.h.in b/benchmark/src/build_config.h.in new file mode 100644 index 00000000..da9180e9 --- /dev/null +++ b/benchmark/src/build_config.h.in @@ -0,0 +1,10 @@ +#ifndef _HEADER_BUILD_CONFIG +#define _HEADER_BUILD_CONFIG + +#cmakedefine01 ENABLE_MEGENGINE_FRAMEWORK + +#ifndef ENABLE_MEGENGINE_FRAMEWORK +#define ENABLE_MEGENGINE_FRAMEWORK 0 +#endif + +#endif // _HEADER_BUILD_CONFIG diff --git a/benchmark/tools/cc_analysis.py b/benchmark/tools/cc_analysis.py new file mode 100644 index 00000000..b07248a8 --- /dev/null +++ b/benchmark/tools/cc_analysis.py @@ -0,0 +1,89 @@ +#! /usr/bin/env python3 +import argparse +import os +import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def main(passed_args=None): + parser = argparse.ArgumentParser( + description="analyze profile result", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("data") + parser.add_argument("--output", "-o", default=".", type=str) + args = parser.parse_args(passed_args) + if not os.path.exists(args.output) or os.path.isfile(args.output): + os.makedirs(args.output) + files0 = set() + if os.path.isdir(args.data): + for i in os.listdir(args.data): + files0.add(str(Path(args.data) / i)) + else: + files0.add(args.data) + data_map = {} + data_info = [] + model_set = set() + for i in files0: + path = i.split("/") + file_name = path[len(path) - 1].split(".") + info = file_name[0].split("-") + if info[0] == "megcc" and info[3] == "0": + text_file = open(i, "r") + data = text_file.read() + text_file.close() + pattern = re.compile(r"\s\w+\s[\r\n]+use\s\d*\.\d+") + results = pattern.findall(data) + analyze_data = [] + op_totoal_nums = len(results) + op_per_test = int(op_totoal_nums / 60) + iter_num = 0 + total = 0.0 + for i in results: + kernel_name_pattern = re.compile(r"\s\w+\s") + kernel_time_pattern = re.compile(r"\d*\.\d+") + kernel_name = kernel_name_pattern.search(i).group() + kernel_time = float(kernel_time_pattern.search(i).group()) + if iter_num < op_per_test: + total = total + kernel_time + analyze_data.append([kernel_name, kernel_time]) + else: + total = total + kernel_time + analyze_data[iter_num % op_per_test][1] += kernel_time + + iter_num = iter_num + 1 + diff_kernel_data = {} + for i in analyze_data: + if not i[0] in diff_kernel_data: + diff_kernel_data[i[0]] = [i[1], i[1] / total] + else: + diff_kernel_data[i[0]][0] += i[1] + diff_kernel_data[i[0]][1] += i[1] / total + kernel_name = [] + kernel_rate = [] + for k, v in sorted( + diff_kernel_data.items(), key=lambda item: item[1][1], reverse=True + ): + kernel_name.append(k) + kernel_rate.append(v[1] * 100) + + barWidth = 0.5 + topK = 10 + kernel_name = kernel_name[0:topK] + kernel_rate = kernel_rate[0:topK] + br1 = np.arange(len(kernel_name)) + plt.figure(figsize=(25, 6)) + plt.title("{}-{}-{}".format(info[1], info[5], info[2]), fontsize=30) + plt.pie(kernel_rate, labels=kernel_name, autopct="%0.1f%%") + plt.savefig( + "{}/{}-{}-{}-profile-top{}.png".format( + args.output, info[1], info[5], info[2], topK + ) + ) + + +if __name__ == "__main__": + main() diff --git a/benchmark/tools/inference_visual.py b/benchmark/tools/inference_visual.py new file mode 100644 index 
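cc_analysis.py above keys off the per-instruction timing lines that LITE_forward prints when profiling is enabled (" instruction: %s \nuse %fms ..." in runtime/src/lite/network.c). A tiny self-contained illustration of that parsing contract, using the same regular expressions on a made-up log excerpt; the kernel name and numbers are invented.

```python
import re

# Invented two-line excerpt shaped like the profile output LITE_forward logs.
sample = (" instruction: conv2d_nchw44_kernel \n"
          "use 1.234ms \t[1(150528), 3(50176), 224(224), 224(1), 1(1)]\n")

for hit in re.findall(r"\s\w+\s[\r\n]+use\s\d*\.\d+", sample):
    name = re.search(r"\s\w+\s", hit).group().strip()
    ms = float(re.search(r"\d*\.\d+", hit).group())
    print(name, ms)  # -> conv2d_nchw44_kernel 1.234
```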
00000000..ca023c66 --- /dev/null +++ b/benchmark/tools/inference_visual.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python3 +import argparse +import os +import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def main(passed_args=None): + parser = argparse.ArgumentParser( + description="visualize inference result", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("data") + parser.add_argument("--output", "-o", default=".", type=str) + args = parser.parse_args(passed_args) + files0 = set() + if not os.path.exists(args.output) or os.path.isfile(args.output): + os.makedirs(args.output) + if os.path.isdir(args.data): + for i in os.listdir(args.data): + files0.add(str(Path(args.data) / i)) + else: + files0.add(args.data) + data_map = {} + data_info = [] + model_set = set() + for i in files0: + path = i.split("/") + file_name = path[len(path) - 1].split(".") + info = file_name[0].split("-") + + if "{}-{}".format(info[1], info[5]) not in data_map: + data_map["{}-{}".format(info[1], info[5])] = {} + if info[0] not in data_map["{}-{}".format(info[1], info[5])]: + data_map["{}-{}".format(info[1], info[5])][info[0]] = [] + if info[3] == "3": + text_file = open(i, "r") + data = text_file.read() + text_file.close() + pattern = re.compile(r"\d*\.\d+") + result = float(pattern.search(data).group()) + # for excel + data_info.append([info[1], info[5], info[0], info[2], result]) + data_map["{}-{}".format(info[1], info[5])][info[0]].append( + [info[2], result] + ) + model_set.add(info[2]) + + model_list = [] + for model in model_set: + model_list.append(model) + model_list = sorted(model_list) + + for k, v in data_map.items(): + for k0, v0 in v.items(): + v1 = sorted(v0, key=lambda item: item[0]) + v1_val = [] + for i in v1: + v1_val.append(i[1]) + data_map[k][k0] = v1_val + for i in data_info: + print(i[0], i[1], i[2], i[3], i[4]) + print(model_list) + print(data_map) + # generate figure + barWidth = 0.5 + br1 = np.arange(len(model_list)) + br2 = [x + barWidth for x in br1] + for k, v in data_map.items(): + plt.figure(figsize=(10, 6)) + plt.title(k) + # Make the plot + plt.bar(br1, v["megcc"], width=barWidth, edgecolor="grey", label="megcc") + + # Adding Xticks + plt.xlabel("model", fontweight="bold", fontsize=15) + plt.ylabel("inference(ms)", fontweight="bold", fontsize=15) + plt.xticks([r + barWidth for r in range(len(model_list))], model_list) + plt.grid(axis="y") + for a, b in zip(br1, v["megcc"]): + plt.text(a, b + 0.05, "%.2f" % b, ha="center", va="bottom") + + plt.legend() + plt.savefig("{}/{}.png".format(args.output, k)) + + +if __name__ == "__main__": + main() From 16fa9ee784351bcdce9f92142ccd7199c80f9bd7 Mon Sep 17 00:00:00 2001 From: yeasoon <1695924908@qq.com> Date: Tue, 10 Jan 2023 16:25:48 +0800 Subject: [PATCH 17/17] fix(third_party): update megengine to e77cea141387fc8095b8c842547fcd6510f5c41f --- compiler/include/compiler/Common/Version.h.in | 2 +- script/release_megcc.sh | 6 ------ third_party/MegEngine | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/compiler/include/compiler/Common/Version.h.in b/compiler/include/compiler/Common/Version.h.in index ce1f2c8c..7b2a0301 100644 --- a/compiler/include/compiler/Common/Version.h.in +++ b/compiler/include/compiler/Common/Version.h.in @@ -11,7 +11,7 @@ #include #define MEGCC_MAJOR 0 #define MEGCC_MINOR 1 -#define MEGCC_PATCH 2 +#define MEGCC_PATCH 3 namespace megcc { namespace { const std::string git_branch = "@GIT_BRANCH@"; diff --git a/script/release_megcc.sh 
b/script/release_megcc.sh
index eef6c540..6e5ff688 100755
--- a/script/release_megcc.sh
+++ b/script/release_megcc.sh
@@ -32,17 +32,11 @@ pushd ${OUT_DIR}/build_host
cmake ${COMPILER_PATH} -G Ninja
ninja
cp tools/mgb-to-tinynn/mgb-to-tinynn ${OUT_DIR}/bin/
- strip mgb-to-tinynn
cp tools/mgb-runner/mgb-runner ${OUT_DIR}/bin/
- strip mgb-runner
cp tools/mgb-importer/mgb-importer ${OUT_DIR}/bin/
- strip mgb-importer
cp tools/kernel_exporter/kernel_exporter ${OUT_DIR}/bin/
- strip kernel_exporter
cp tools/hako-to-mgb/hako-to-mgb ${OUT_DIR}/bin/
- strip hako-to-mgb
cp tools/megcc-opt/megcc-opt ${OUT_DIR}/bin/
- strip megcc-opt
popd
pushd ${PROJECT_PATH}/compiler
GIT_ID=`git rev-parse --short HEAD`
diff --git a/third_party/MegEngine b/third_party/MegEngine
index 31218a18..e77cea14 160000
--- a/third_party/MegEngine
+++ b/third_party/MegEngine
@@ -1 +1 @@
-Subproject commit 31218a1863edf07be0feed947fad0dc38740fee3
+Subproject commit e77cea141387fc8095b8c842547fcd6510f5c41f
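
The Benchmarker interface and the MGEBenchmarker added in the benchmark patch above are meant to be driven from a small standalone binary: construct a benchmarker for a model file, call load_model(), then profile(). The driver below is a minimal sketch only and is not part of this patch series; it assumes the benchmark/src headers above, a build with ENABLE_MEGENGINE_FRAMEWORK=1, and a placeholder default model path.

// Hypothetical driver sketch; not part of this patch series.
// Assumes benchmark/src/benchmark.h, benchmark/src/MGEbenchmark.h and a
// build_config.h generated with ENABLE_MEGENGINE_FRAMEWORK=1.
#include <memory>
#include <string>
#include "build_config.h"
#include "benchmark.h"
#if ENABLE_MEGENGINE_FRAMEWORK
#include "MGEbenchmark.h"
#endif

int main(int argc, char** argv) {
    std::string model_path = argc > 1 ? argv[1] : "model.mge";  // placeholder default
    // log_level 0 dumps the full JSON profile; any other value prints only the
    // averaged inference time (see MGEBenchmarker::profile above).
    int log_level = 3;
#if ENABLE_MEGENGINE_FRAMEWORK
    std::unique_ptr<megcc::Benchmark::Benchmarker> bench =
            std::make_unique<megcc::Benchmark::MGEBenchmarker>(model_path, log_level);
    bench->load_model();
    bench->profile();
#endif
    return 0;
}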