From 4cf401711b392310e9fcbe5cff1cd73de1a48b90 Mon Sep 17 00:00:00 2001
From: huochenghai
Date: Tue, 26 Sep 2023 10:35:01 +0800
Subject: [PATCH] add conv2d cpu kernel

---
 .../CodeGen/CSourceConvertVisitor.cs          |  2 +-
 modules/cpu/src/runtime/cmodel/include/tdma.h | 84 +++++++++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/modules/Nncase.Modules.CPU/CodeGen/CSourceConvertVisitor.cs b/modules/Nncase.Modules.CPU/CodeGen/CSourceConvertVisitor.cs
index 5a99b23ae2..0a305dea70 100644
--- a/modules/Nncase.Modules.CPU/CodeGen/CSourceConvertVisitor.cs
+++ b/modules/Nncase.Modules.CPU/CodeGen/CSourceConvertVisitor.cs
@@ -412,7 +412,7 @@ protected override CSymbol VisitCall(Call expr)
         }
         else
         {
-            IndentScope.Writer.IndWrite($"auto {ret_name}_tmp = &{ret_name};\n");
+            IndentScope.Writer.IndWrite($"tensor<{args[0].CheckedDataType.ToC()}, loc_t::local>& {ret_name}_tmp = {ret_name};\n");
         }
 
         if (grs.ReducePosition.Count == 2)
diff --git a/modules/cpu/src/runtime/cmodel/include/tdma.h b/modules/cpu/src/runtime/cmodel/include/tdma.h
index a43f790e1c..a618e50774 100644
--- a/modules/cpu/src/runtime/cmodel/include/tdma.h
+++ b/modules/cpu/src/runtime/cmodel/include/tdma.h
@@ -114,6 +114,90 @@ void matmul(tensor<T> &a, tensor<T> &b, tensor<T> &c) {
         c.dimension(), c.strides());
 }
+
+template <typename T>
+T *im2col(tensor<T> &input, dims_t filter, dims_t padding, dims_t stride,
+          [[maybe_unused]] dims_t dilation = {1, 1},
+          [[maybe_unused]] int32_t groups = 1) {
+    // todo: support dilated and group conv2d
+    int32_t N = input.dimension()[0];
+    int32_t C = input.dimension()[1];
+    int32_t H_in = input.dimension()[2];
+    int32_t W_in = input.dimension()[3];
+    int32_t H = H_in + padding[0] + padding[2];
+    int32_t W = W_in + padding[1] + padding[3];
+
+    int32_t OH = (H - filter[0]) / stride[0] + 1;
+    int32_t OW = (W - filter[1]) / stride[1] + 1;
+
+    // column matrix layout: [C * KH * KW, N * OH * OW]
+    T *cols = static_cast<T *>(runtime_util->malloc(
+        (C * filter[0] * filter[1]) * (N * OH * OW) * sizeof(T)));
+    for (auto c = 0; c < C; c++)
+        for (auto e = 0; e < OH; e++)
+            for (auto f = 0; f < OW; f++)
+                for (auto r = 0; r < filter[0]; r++)
+                    for (auto s = 0; s < filter[1]; s++) {
+                        // (r, s) is row-major within the KH x KW window
+                        auto row =
+                            c * filter[0] * filter[1] + r * filter[1] + s;
+                        for (auto b = 0; b < N; b++) {
+                            auto col = e * OW * N + f * N + b;
+                            auto out_index = row * (N * OH * OW) + col;
+                            // map padded coordinates back onto the input
+                            auto ih = stride[0] * e + r - padding[0];
+                            auto iw = stride[1] * f + s - padding[1];
+                            if (ih < 0 || ih >= H_in || iw < 0 ||
+                                iw >= W_in) {
+                                cols[out_index] = 0; // zero padding
+                            } else {
+                                auto in_index =
+                                    b * input.strides()[0] +
+                                    c * input.strides()[1] +
+                                    ih * input.strides()[2] +
+                                    iw * input.strides()[3];
+                                cols[out_index] = input.cdata()[in_index];
+                            }
+                        }
+                    }
+    return cols;
+}
+
+template <typename T>
+void conv2d(tensor<T> &input, tensor<T> &weight, tensor<T> bias,
+            tensor<T> &output, dims_t padding, dims_t stride,
+            dims_t dilation = {1, 1}, int32_t groups = 1) {
+    // todo: support dilated and group conv2d
+    int32_t N = input.dimension()[0];
+    int32_t C = input.dimension()[1];
+    int32_t H = input.dimension()[2] + padding[0] + padding[2];
+    int32_t W = input.dimension()[3] + padding[1] + padding[3];
+
+    int32_t M = weight.dimension()[0];
+    dims_t filter = {weight.dimension()[2], weight.dimension()[3]};
+
+    int32_t OH = (H - filter[0]) / stride[0] + 1;
+    int32_t OW = (W - filter[1]) / stride[1] + 1;
+
+    auto input_cols = im2col(input, filter, padding, stride, dilation, groups);
+    // weight [M, C*KH*KW] x cols [C*KH*KW, N*OH*OW] -> mm [M, N*OH*OW]
+    auto mm =
+        static_cast<T *>(runtime_util->malloc(M * N * OH * OW * sizeof(T)));
+    kernels::matmul(weight.cdata(), input_cols, mm,
+                    {M, C * filter[0] * filter[1]},
+                    {C * filter[0] * filter[1], 1},
+                    {C * filter[0] * filter[1], N * OH * OW}, {N * OH * OW, 1},
+                    {N * OH * OW, 1});
+    // broadcast-add the per-output-channel bias over all N*OH*OW columns
+    kernels::binary(binary_op_t::add, mm, bias.cdata(), mm, {M, N * OH * OW},
+                    {N * OH * OW, 1}, {M, 1}, {1, 1}, {M, N * OH * OW},
+                    {N * OH * OW, 1});
+    // mm is laid out [M, OH, OW, N]; permute axes to NCHW for the output
+    kernels::transpose(mm, output.data(), {M, OH, OW, N}, {3, 0, 1, 2},
+                       {OH * OW * N, OW * N, N, 1}, output.strides());
+    // NOTE(review): assumes runtime_util pairs free() with malloc() — confirm
+    runtime_util->free(mm);
+    runtime_util->free(input_cols);
+}
 
 template <typename T>
 void reduce(tensor<T> &input, tensor<T> &output, reduce_op_t op,
             T init_value, dims_t axis, bool keep_dims) {