diff --git a/nix/pkgs/buddy-mlir.nix b/nix/pkgs/buddy-mlir.nix
index 17b426871..2f520e491 100644
--- a/nix/pkgs/buddy-mlir.nix
+++ b/nix/pkgs/buddy-mlir.nix
@@ -18,8 +18,8 @@ let
   src = fetchFromGitHub {
     owner = "buddy-compiler";
     repo = "buddy-mlir";
-    rev = "d7d90a488ac0d6fc1e700e932f842c7b2bcad816";
-    hash = "sha256-MhykCa6Z7Z8PpAlNh+vMuWYEOZZDyWhtMzMnFlNbGIk=";
+    rev = "802cefe91199c0935122546d463e400bee8635a6";
+    hash = "sha256-d8e/VM5LrsEwsC7NyNy/kdBp0fpY/CWeItrk4adOK0A=";
   };
 
   nativeBuildInputs = [ cmake ninja bintools ];
diff --git a/tests/pytorch/README.md b/tests/pytorch/README.md
index 031d07b93..f6510cc95 100644
--- a/tests/pytorch/README.md
+++ b/tests/pytorch/README.md
@@ -8,11 +8,10 @@ Assuming that the new PyTorch test have project name call `demo`, let's create t
 cd tests/pytorch
 mkdir -p demo
 cd demo
-touch demo.cc demo.py config.nix
+touch demo.cc demo.py build.nix
 ```
 
 Developers should put their PyTorch implementation into ".py" file.
-For each PyTorch tests, developers must write the MLIR model to "forward.mlir" file.
 
 ```python
 # demo.py
@@ -49,57 +48,55 @@ extern "C" int test() {
 }
 ```
 
-After PyTorch model and the C entry is correctly created, developers should declare a "config.nix"
+After the PyTorch model and the C entry point are correctly created, developers should declare a "build.nix"
 file to indicate our build system to find and build the test case:
 
 ```nix
-{
-  # Tell our build system to include the memref.h header.
-  # Developer could add extra headers here.
-  includes = [
-    ../memref.hpp
-  ];
-
-  # Tell the build system to run buddy-opt with three phrase, with arguments to run in each phrase
-  buddyOptArgs = [
-    [
-      "--pass-pipeline"
-      "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)"
-    ]
-    [
-      "--pass-pipeline"
-      "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers))"
-    ]
-    [
-      "--lower-affine"
-      "--convert-math-to-llvm"
-      "--convert-math-to-libm"
-      "--convert-scf-to-cf"
-      "--convert-arith-to-llvm"
-      "--expand-strided-metadata"
-      "--finalize-memref-to-llvm"
-      "--lower-vector-exp"
-      "--lower-rvv=rv32"
-      "--convert-vector-to-llvm"
-      "--convert-func-to-llvm"
-      "--reconcile-unrealized-casts"
-    ]
-  ];
+{ buildBuddyE2ETest }:
+buildBuddyE2ETest {
+  caseName = "demo";
+
+  optPhase = ''
+    echo "Lowering MLIR"
+    python ./demo.py \
+    | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith),\
+        empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
+        func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \
+    | buddy-opt --pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
+        eliminate-empty-tensors, func.func(llvm-request-c-wrappers))" \
+    | buddy-opt --lower-affine \
+        --convert-math-to-llvm \
+        --convert-math-to-libm \
+        --convert-scf-to-cf \
+        --convert-arith-to-llvm \
+        --expand-strided-metadata \
+        --finalize-memref-to-llvm \
+        --lower-vector-exp \
+        --lower-rvv=rv32 \
+        --convert-vector-to-llvm \
+        --convert-func-to-llvm \
+        --reconcile-unrealized-casts \
+        -o forward-lowered.mlir
+
+    optArtifacts+=(
+      "forward-lowered.mlir"
+    )
+  '';
 }
 ```
 
-Our build system accept the below data layout for the "config.nix" file:
+Here you can think of `optPhase` as a bash function. Developers can write their own pass pipeline in this function.
+Each `optPhase` must append its final outputs to the `optArtifacts` array, so that our build system knows which files to hand to the next phase.
 
-```text
-Set {
-  buddyOptArgs: Array<Array<String>>,
+The `caseName` and `optPhase` attributes are always required.
+We also offer the following attributes for you to override:
 
-  includes: Optional<Array<String>>,
-  pythonArgs: Optional<Array<String>>,
-  buddyTranslateArgs: Optional<Array<String>>,
-  buddyLLCArgs: Optional<Array<String>>,
-}
-```
+* `translatePhase`: By default, runs `buddy-translate --buddy-to-llvmir` on each file in the `optArtifacts` array and
+  adds the output to the `translateArtifacts` array.
+* `llcPhase`: By default, runs `buddy-llc` on each file in the `translateArtifacts` array and adds the output to the `llcArtifacts` array.
+* `linkPhase`: By default, links all the `.o` object files in the `llcArtifacts` array together with the `caseName.cc` C++ file.
+
+> Developers can also override other `stdenv.mkDerivation` attributes if they know the risks.
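+
+For example, a hypothetical case that needs a different minimum vector length could
+override `llcPhase`. This is only a sketch: every flag mirrors the defaults in
+"tests/pytorch/default.nix" except the arbitrary 256-bit value.
+
+```nix
+{ buildBuddyE2ETest }:
+buildBuddyE2ETest {
+  caseName = "demo";
+
+  optPhase = ''
+    # ... lower the MLIR as shown above, then:
+    optArtifacts+=( "forward-lowered.mlir" )
+  '';
+
+  llcPhase = ''
+    for llvmir in ''${translateArtifacts[@]}; do
+      buddy-llc "$llvmir" \
+        -mtriple=riscv32 \
+        -target-abi=ilp32f \
+        -mattr=+m,+f,+zve32f \
+        -riscv-v-vector-bits-min=256 \
+        --filetype=obj \
+        -o "$llvmir.o"
+
+      llcArtifacts+=("$llvmir.o")
+    done
+  '';
+}
+```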
 
 After the project have been implemented, developers can run the below commands to build and test the ELF:
diff --git a/tests/pytorch/default.nix b/tests/pytorch/default.nix
index 6b54203d4..cebefa214 100644
--- a/tests/pytorch/default.nix
+++ b/tests/pytorch/default.nix
@@ -5,96 +5,88 @@
 , findAndBuild
 , getTestRequiredFeatures
 , t1main
+, callPackage
 }:
 
 let
-
-  builder = makeBuilder { casePrefix = "mlir"; };
+  builder = makeBuilder { casePrefix = "pytorch"; };
   build = { caseName, sourcePath }:
-    let
-      buddyBuildConfig = import (sourcePath + "/config.nix");
-      defaultBuddyTranslateArgs = [ "--buddy-to-llvmir" ];
-      defaultBuddyLLCArgs = [
-        "-mtriple=riscv32"
-        "-target-abi=ilp32f"
-        "-mattr=+m,+f,+zve32f"
-        "-riscv-v-vector-bits-min=128"
-      ];
-    in
-    builder rec {
-      inherit caseName;
-
-      src = sourcePath;
-
-      featuresRequired = getTestRequiredFeatures sourcePath;
-
-      nativeBuildInputs = [ buddy-mlir.pyenv buddy-mlir ];
-
-      pythonArgs = buddyBuildConfig.pythonArgs or [ ];
-      buddyTranslateArgs = buddyBuildConfig.buddyTranslateArgs or defaultBuddyTranslateArgs;
-      buddyLLCArgs = buddyBuildConfig.buddyLLCArgs or defaultBuddyLLCArgs;
-      buddyIncludes = buddyBuildConfig.includes or [ ];
-
-      postUnpack = ''
-        buddyIncludeDir="."
-        if [ "x$buddyIncludes" != "x" ]; then
-          mkdir -p buddyInclude
-          _buddyHeaderArray=( $buddyIncludes )
-          for h in "''${_buddyHeaderArray}"; do
-            cp -v "$h" buddyInclude/"$(stripHash $h)"
-          done
-
-          buddyIncludeDir=$PWD/buddyInclude
-        fi
-      '';
-
-      buildPhase = ''
-        runHook preBuild
-
-        echo "Running python with args $pythonArgs"
-        python $pythonArgs ${caseName}.py
-
-        # Generate multiple buddy-opt call, each will read input from former pipeline
-        # For example, for buddyOptArgs = [ [ "--arg-a" ], [ "--arg-b" ], [ "--arg-c" ] ]
-        # This will generate
-        #
-        # echo "..."
-        # buddy-opt forward.mlir --arg-a -o forward-1.mlir
-        # echo "..."
-        # buddy-opt forward-1.mlir --arg-b -o forward-2.mlir
-        # echo "..."
-        # buddy-opt forward-2.mlir --arg-c -o forward-3.mlir
-        #
-        ${lib.concatStringsSep "\n" (
-          lib.imap0
-            (idx: args: ''
-              echo "Running buddy-opt with args ${lib.escapeShellArgs args}"
-              buddy-opt \
-                forward${if idx == 0 then "" else "-${toString idx}"}.mlir \
-                ${lib.escapeShellArgs args} \
-                -o forward-${toString (idx+1)}.mlir
-            '')
-            buddyBuildConfig.buddyOptArgs
-        )}
-
-        # Pick up the last optimized MLIR file
-        echo "Running buddy-translate with args $buddyTranslateArgs"
-        buddy-translate forward-${with builtins; toString (length buddyBuildConfig.buddyOptArgs)}.mlir \
-          $buddyTranslateArgs -o forward.ll
-
-        echo "Running buddy-llc with args $buddyLLCArgs"
-        buddy-llc forward.ll $buddyLLCArgs --filetype=obj -o forward.o
-
-        echo "Using include dir $buddyIncludeDir"
-        $CXX -nostdlib -I$buddyIncludeDir -c ${caseName}.cc -o host.o
-        $CC -T${linkerScript} \
-          host.o forward.o ${t1main} \
-          -o $pname.elf
-
-        runHook postBuild
-      '';
-
-      meta.description = "testcase '${caseName}', written in MLIR";
+    callPackage (sourcePath + "/build.nix") {
+      buildBuddyE2ETest = { optPhase, ... }@overrides: builder
+        ({
+          inherit caseName;
+
+          featuresRequired = getTestRequiredFeatures sourcePath;
+
+          nativeBuildInputs = [ buddy-mlir.pyenv buddy-mlir ];
+
+          src = sourcePath;
+
+          configurePhase = ''
+            declare -a optArtifacts translateArtifacts llcArtifacts
+          '';
+
+          translatePhase = ''
+            if [[ -z "$optArtifacts" ]]; then
+              echo "optPhase doesn't produce optArtifacts, abort" >&2
+              exit 1
+            fi
+
+            for mlir in ''${optArtifacts[@]}; do
+              echo "Translating $mlir"
+              buddy-translate --buddy-to-llvmir "$mlir" -o "$mlir.ll"
+
+              translateArtifacts+=("$mlir.ll")
+            done
+          '';
+
+          llcPhase = ''
+            if [[ -z "$translateArtifacts" ]]; then
+              echo "translatePhase doesn't produce translateArtifacts, abort" >&2
+              exit 1
+            fi
+
+            for llvmir in ''${translateArtifacts[@]}; do
+              echo "Compiling $llvmir"
+              buddy-llc "$llvmir" \
+                -mtriple=riscv32 \
+                -target-abi=ilp32f \
+                -mattr=+m,+f,+zve32f \
+                -riscv-v-vector-bits-min=128 \
+                --filetype=obj \
+                -o "$llvmir.o"
+
+              llcArtifacts+=("$llvmir.o")
+            done
+          '';
+
+          linkPhase = ''
+            if [[ -z "$llcArtifacts" ]]; then
+              echo "llcPhase doesn't produce any llcArtifacts" >&2
+              exit 1
+            fi
+
+            echo "Building final binary"
+            mkdir -p _include
+            cp ${./memref.hpp} _include/memref.hpp
+
+            $CXX -nostdlib -I _include -c ${caseName}.cc -o host.o
+            $CC -T${linkerScript} \
+              host.o ''${llcArtifacts[@]} ${t1main} \
+              -o $pname.elf
+          '';
+
+          buildPhase = ''
+            runHook preBuild
+
+            runPhase optPhase
+            runPhase translatePhase
+            runPhase llcPhase
+            runPhase linkPhase
+
+            runHook postBuild
+          '';
+        } // overrides);
     };
 in
 findAndBuild ./. build
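Conceptually, the new phase wiring above reduces to the following shell flow (an illustrative sketch, not code from this diff; Nix escaping, logging, and error checks omitted):

```bash
# The phases communicate through three shell arrays declared in configurePhase.
declare -a optArtifacts translateArtifacts llcArtifacts

# optPhase (supplied by each test's build.nix) lowers MLIR and registers outputs.
optArtifacts+=("forward-lowered.mlir")

# translatePhase (default): MLIR -> LLVM IR.
for mlir in "${optArtifacts[@]}"; do
  buddy-translate --buddy-to-llvmir "$mlir" -o "$mlir.ll"
  translateArtifacts+=("$mlir.ll")
done

# llcPhase (default): LLVM IR -> RISC-V object files.
for ll in "${translateArtifacts[@]}"; do
  buddy-llc "$ll" -mtriple=riscv32 -target-abi=ilp32f -mattr=+m,+f,+zve32f \
    -riscv-v-vector-bits-min=128 --filetype=obj -o "$ll.o"
  llcArtifacts+=("$ll.o")
done

# linkPhase (default) then links host.o, the objects, and t1main into the ELF.
```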
diff --git a/tests/pytorch/demo/build.nix b/tests/pytorch/demo/build.nix
new file mode 100644
index 000000000..f773022fa
--- /dev/null
+++ b/tests/pytorch/demo/build.nix
@@ -0,0 +1,31 @@
+{ buildBuddyE2ETest }:
+buildBuddyE2ETest {
+  caseName = "demo";
+
+  optPhase = ''
+    echo "Lowering MLIR"
+    python ./demo.py \
+    | buddy-opt --pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith),\
+        empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
+        func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \
+    | buddy-opt --pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
+        eliminate-empty-tensors, func.func(llvm-request-c-wrappers))" \
+    | buddy-opt --lower-affine \
+        --convert-math-to-llvm \
+        --convert-math-to-libm \
+        --convert-scf-to-cf \
+        --convert-arith-to-llvm \
+        --expand-strided-metadata \
+        --finalize-memref-to-llvm \
+        --lower-vector-exp \
+        --lower-rvv=rv32 \
+        --convert-vector-to-llvm \
+        --convert-func-to-llvm \
+        --reconcile-unrealized-casts \
+        -o forward-lowered.mlir
+
+    optArtifacts+=(
+      "forward-lowered.mlir"
+    )
+  '';
+}
diff --git a/tests/pytorch/demo/config.nix b/tests/pytorch/demo/config.nix
deleted file mode 100644
index 0a97e1a09..000000000
--- a/tests/pytorch/demo/config.nix
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  includes = [
-    ../memref.hpp
-  ];
-
-  buddyOptArgs = [
-    [
-      "--pass-pipeline"
-      "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)"
-    ]
-    [
-      "--pass-pipeline"
-      "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers))"
-    ]
-    [
-      "--lower-affine"
-      "--convert-math-to-llvm"
-      "--convert-math-to-libm"
-      "--convert-scf-to-cf"
-      "--convert-arith-to-llvm"
-      "--expand-strided-metadata"
-      "--finalize-memref-to-llvm"
-      "--lower-vector-exp"
-      "--lower-rvv=rv32"
-      "--convert-vector-to-llvm"
-      "--convert-func-to-llvm"
-      "--reconcile-unrealized-casts"
-    ]
-  ];
-}
diff --git a/tests/pytorch/demo/demo.py b/tests/pytorch/demo/demo.py
index 650227eca..a0d38c840 100644
--- a/tests/pytorch/demo/demo.py
+++ b/tests/pytorch/demo/demo.py
@@ -9,22 +9,25 @@ def foo(x, y):
     return x * y + x
 
 
-# Define the input data.
-float32_in1 = torch.randn(512).to(torch.float32)
-float32_in2 = torch.randn(512).to(torch.float32)
+def main():
+    # Define the input data.
+    float32_in1 = torch.randn(512).to(torch.float32)
+    float32_in2 = torch.randn(512).to(torch.float32)
 
-# Initialize the dynamo compiler.
-dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=inductor_decomp,
-)
+    # Initialize the dynamo compiler.
+    dynamo_compiler = DynamoCompiler(
+        primary_registry=tosa.ops_registry,
+        aot_autograd_decomposition=inductor_decomp,
+    )
 
-# Pass the function and input data to the dynamo compiler's importer, the
-# importer will first build a graph. Then, lower the graph to top-level IR.
-# (tosa, linalg, etc.). Finally, accepts the generated module and weight parameters.
-graphs = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
-graph = graphs[0]
-graph.lower_to_top_level_ir()
+    # Pass the function and input data to the dynamo compiler's importer, the
+    # importer will first build a graph. Then, lower the graph to top-level IR.
+    # (tosa, linalg, etc.). Finally, accepts the generated module and weight parameters.
+    graphs = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
+    graph = graphs[0]
+    graph.lower_to_top_level_ir()
 
-with open("forward.mlir", "w") as mlir_module:
-    print(graph._imported_module, file = mlir_module)
+    print(graph._imported_module)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/pytorch/lenet/.gitignore b/tests/pytorch/lenet/.gitignore
new file mode 100644
index 000000000..4a44e5828
--- /dev/null
+++ b/tests/pytorch/lenet/.gitignore
@@ -0,0 +1,3 @@
+arg0.data
+forward.mlir
+subgraphs0.mlir
diff --git a/tests/pytorch/lenet/build.nix b/tests/pytorch/lenet/build.nix
new file mode 100644
index 000000000..1bab5a80b
--- /dev/null
+++ b/tests/pytorch/lenet/build.nix
@@ -0,0 +1,60 @@
+{ buildBuddyE2ETest, fetchurl }:
+let
+  lenetModel = fetchurl {
+    url = "https://raw.githubusercontent.com/buddy-compiler/buddy-benchmark/1e166d53faae6d96a209645688cd9ab1d6eb604d/benchmarks/DeepLearning/Models/LeNet/lenet_model.pth";
+    hash = "sha256-OqUzJ9vF1GF6jMVlSm0AYowLk4ypiR/Qs2KD9NMQJfg=";
+  };
+in
+buildBuddyE2ETest {
+  caseName = "lenet";
+
+  optPhase = ''
+    export LENET_MODEL_PATH=${lenetModel}
+    python ./lenet.py
+
+    echo "Lowering forward.mlir"
+    buddy-opt forward.mlir -pass-pipeline \
+      "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \
+      empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
+      func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" \
+    | buddy-opt -pass-pipeline \
+      "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
+      eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \
+      convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \
+      convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \
+      convert-func-to-llvm, reconcile-unrealized-casts)" \
+      > forward-lowered.mlir
+
+    echo "Lowering subgraphs[0]"
+    buddy-opt subgraphs0.mlir -pass-pipeline \
+      "builtin.module(func.func(tosa-to-linalg-named, tosa-to-arith, tosa-to-linalg, tosa-to-tensor))" \
+    | buddy-opt \
+      -convert-elementwise-to-linalg \
+      -func-bufferize-dynamic-offset \
+      -arith-bufferize \
+      -func-bufferize \
+      -tensor-bufferize \
+      -linalg-bufferize \
+      -finalizing-bufferize \
+      -batchmatmul-optimize \
+      -convert-linalg-to-affine-loops \
+      -lower-affine \
+      -convert-vector-to-scf \
+      -convert-scf-to-cf \
+      -llvm-request-c-wrappers \
+      -convert-vector-to-llvm \
+      -convert-math-to-llvm \
+      -convert-math-to-libm \
+      -convert-arith-to-llvm \
+      -convert-func-to-llvm \
+      -expand-strided-metadata \
+      -finalize-memref-to-llvm \
+      -reconcile-unrealized-casts \
+      > subgraphs0-lowered.mlir
+
+    optArtifacts+=(
+      "forward-lowered.mlir"
+      "subgraphs0-lowered.mlir"
+    )
+  '';
+}
diff --git a/tests/pytorch/lenet/lenet.cc b/tests/pytorch/lenet/lenet.cc
new file mode 100644
index 000000000..3cb5d1a88
--- /dev/null
+++ b/tests/pytorch/lenet/lenet.cc
@@ -0,0 +1,34 @@
+#include "memref.hpp"
+
+#define INPUT_N 1
+#define INPUT_C 1
+#define INPUT_H 28
+#define INPUT_W 28
+#define INPUT_TOTAL (INPUT_N * INPUT_C * INPUT_H * INPUT_W)
+#define OUTPUT_N 10
+#define PARAM_N 44426
+
+__attribute((section(".vdata"))) float input_0[INPUT_TOTAL];
+__attribute((section(".vdata"))) float output_0[OUTPUT_N];
+__attribute((section(".vdata"))) float param_0[PARAM_N];
+
+// Define the sizes of the input and output tensors.
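+// The model consumes a single 1x1x28x28 NCHW image and produces 10 class
+// logits; PARAM_N (44426) is the total parameter count of the LeNet defined
+// in model.py: conv1 (156) + conv2 (2416) + fc1 (30840) + fc2 (10164) + fc3 (850).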
+static const int32_t sizesInput[4] = {INPUT_N, INPUT_C, INPUT_H, INPUT_W};
+static const int32_t sizesOutput[2] = {1, OUTPUT_N};
+static const int32_t sizesParams[1] = {PARAM_N};
+
+// Create input and output containers for the image and model output.
+MemRef<float, 4> input(input_0, sizesInput);
+MemRef<float, 2> output(output_0, sizesOutput);
+MemRef<float, 1> params(param_0, 2.0, sizesParams);
+
+// Declare the target model C interface.
+extern "C" {
+void _mlir_ciface_forward(MemRef<float, 2> *output, MemRef<float, 1> *arg0,
+                          MemRef<float, 4> *input);
+}
+
+extern "C" int test() {
+  _mlir_ciface_forward(&output, &params, &input);
+  return 0;
+}
diff --git a/tests/pytorch/lenet/lenet.py b/tests/pytorch/lenet/lenet.py
new file mode 100644
index 000000000..9fd364f68
--- /dev/null
+++ b/tests/pytorch/lenet/lenet.py
@@ -0,0 +1,47 @@
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.graph import GraphDriver
+from buddy.compiler.graph.transform import simply_fuse
+from buddy.compiler.ops import tosa
+from model import LeNet
+
+def main():
+    model_path = os.environ.get("LENET_MODEL_PATH")
+    if model_path is None:
+        sys.exit("Error: No model path was provided. Please set $LENET_MODEL_PATH")
+    model = torch.load(model_path)
+    model = model.eval()
+
+    # Initialize Dynamo Compiler with specific configurations as an importer.
+    dynamo_compiler = DynamoCompiler(
+        primary_registry=tosa.ops_registry,
+        aot_autograd_decomposition=inductor_decomp,
+    )
+
+    data = torch.randn([1, 1, 28, 28])
+    # Import the model into MLIR module and parameters.
+    with torch.no_grad():
+        graphs = dynamo_compiler.importer(model, data)
+
+    assert len(graphs) == 1
+    graph = graphs[0]
+    params = dynamo_compiler.imported_params[graph]
+    pattern_list = [simply_fuse]
+    graphs[0].fuse_ops(pattern_list)
+    driver = GraphDriver(graphs[0])
+    driver.subgraphs[0].lower_to_top_level_ir()
+
+    with open("subgraphs0.mlir", "w") as module_file:
+        print(driver.subgraphs[0]._imported_module, file=module_file)
+    with open("forward.mlir", "w") as module_file:
+        print(driver.construct_main_graph(True), file=module_file)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/pytorch/lenet/model.py b/tests/pytorch/lenet/model.py
new file mode 100644
index 000000000..2f32951a6
--- /dev/null
+++ b/tests/pytorch/lenet/model.py
@@ -0,0 +1,42 @@
+# ===- model.py ----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# LeNet model definition.
+#
+# ===---------------------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+
+
+class LeNet(nn.Module):
+    def __init__(self):
+        super(LeNet, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 4 * 4, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(torch.relu(self.conv1(x)))
+        x = self.pool(torch.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 4 * 4)
+        x = torch.relu(self.fc1(x))
+        x = torch.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
diff --git a/tests/pytorch/memref.hpp b/tests/pytorch/memref.hpp
index 09276cfc6..b37b96181 100644
--- a/tests/pytorch/memref.hpp
+++ b/tests/pytorch/memref.hpp
@@ -7,6 +7,7 @@ template <typename T, size_t N> class MemRef {
 public:
   constexpr MemRef(T *data, const int32_t sizes[N]);
+  constexpr MemRef(T *data, T init, const int32_t sizes[N]);
 
 protected:
   inline void setStrides();
@@ -19,8 +20,8 @@ template <typename T, size_t N> class MemRef {
   int32_t strides[N];
 };
 
-template <typename T, size_t N> constexpr
-MemRef<T, N>::MemRef(T *data, const int32_t sizes[N]) {
+template <typename T, size_t N>
+constexpr MemRef<T, N>::MemRef(T *data, const int32_t sizes[N]) {
   for (size_t i = 0; i < N; i++) {
     this->sizes[i] = sizes[i];
  }
@@ -31,6 +32,20 @@ MemRef<T, N>::MemRef(T *data, const int32_t sizes[N]) {
   aligned = data;
 }
 
+template <typename T, size_t N>
+constexpr MemRef<T, N>::MemRef(T *data, T init, const int32_t sizes[N])
+    : MemRef(data, sizes) {
+
+  // The element count is the product of all dimension sizes.
+  int32_t total_size = 1;
+  for (size_t i = 0; i < N; i++) {
+    total_size *= sizes[i];
+  }
+
+  for (int32_t i = 0; i < total_size; i++) {
+    aligned[i] = init;
+  }
+}
+
 template <typename T, size_t N> inline void MemRef<T, N>::setStrides() {
   strides[N - 1] = 1;
   if (N < 2)
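A minimal host-side sketch of the new fill constructor (illustrative only, not part of the diff; it assumes "memref.hpp" is on the include path and reuses the repo's `test()` entry convention):

```cpp
#include <cstdint>

#include "memref.hpp"

static const int32_t sizes[2] = {2, 3};
static float storage[2 * 3];

extern "C" int test() {
  // The (data, init, sizes) constructor fills all 2 * 3 = 6 elements with 1.0f.
  MemRef<float, 2> filled(storage, 1.0f, sizes);
  return storage[5] == 1.0f ? 0 : 1;
}
```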