diff --git a/.github/workflows/compiler-build.yml b/.github/workflows/compiler-build.yml
index d6c526e4f8..d34934dd55 100644
--- a/.github/workflows/compiler-build.yml
+++ b/.github/workflows/compiler-build.yml
@@ -2,206 +2,161 @@ name: compiler-build
 
 on: [push, pull_request]
 
-env:
-  BUILD_TYPE: Release
-
 jobs:
   build:
-    runs-on: ${{ matrix.os }}
+    name: build-${{matrix.config.name}}
+    runs-on: ${{matrix.config.os}}
     strategy:
       matrix:
-        os: [ubuntu-18.04,windows-2019,macos-10.15]
+        config:
+          - {name: x86_64-macos, os: macos-11, cmakeArgs: '', buildType: Release}
+          - {name: x86_64-linux, os: ubuntu-20.04, cmakeArgs: '', buildType: Release}
+          - {name: x86_64-windows, os: windows-latest, arch: x64, cmakeArgs: '', buildType: Release}
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - uses: seanmiddleditch/gha-setup-ninja@master
 
-    - name: Install System Requirements
-      if: runner.os == 'Macos'
-      shell: bash
-      run: |
-        brew install sunnycase/core/libomp@11.1.0
-
-    - name: Add msbuild to PATH
+    - name: Set up build environment (Windows, Visual Studio)
+      uses: ilammy/msvc-dev-cmd@v1
+      with:
+        arch: ${{matrix.config.arch}}
       if: runner.os == 'Windows'
-      uses: ilammy/msvc-dev-cmd@v1.10.0
+      
+    - name: Set up build environment (macOS)
+      run: |
+        brew install sunnycase/core/libomp@14.0.6
+      if: runner.os == 'macOS'  # Homebrew step: macOS runners only (documented runner.os spelling)
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Conan
-      shell: bash
-      run: |
-        pip install conan
+      run: pip install conan==1.59.0
 
-    - name: Configure Conan
-      if: runner.os == 'Linux'
-      shell: bash
+    - name: Configure Conan (Linux)
       run: |
         conan profile new default --detect
         conan profile update settings.compiler.libcxx=libstdc++11 default
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      env:
-        CC: gcc-10
-        CXX: g++-10
+        echo "CC=gcc-10" >> $GITHUB_ENV
+        echo "CXX=g++-10" >> $GITHUB_ENV
       if: runner.os == 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
-
+      
     - name: Configure CMake
-      if: runner.os != 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
       shell: bash
-      working-directory: ${{github.workspace}}/build
       run: |
-        cmake --build . --config $BUILD_TYPE
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
 
-    - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix ../install
+    - name: Build & Install
+      shell: bash  # same shell on every OS; the Windows default (pwsh) only checks the last command's exit code
+      run: |
+        cmake --build build --config ${{matrix.config.buildType}} && cmake --install build --prefix install
 
     - name: CTest
-      shell: bash
       working-directory: ${{github.workspace}}/build/tests/kernels
-      run: ctest -C $BUILD_TYPE
+      run: ctest -C ${{matrix.config.buildType}}
 
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
+    - name: Upload nncase Build Artifact
+      uses: actions/upload-artifact@v3
       with:
-        name: nncase-${{matrix.os}}-x86_64
+        name: nncase-${{matrix.config.name}}
         path: ${{github.workspace}}/install
         if-no-files-found: error
 
   test-compiler:
     needs: [build]
-    runs-on: ${{ matrix.os }}
+    name: test-${{matrix.config.name}}
+    runs-on: ${{matrix.config.os}}
     strategy:
       matrix:
-        os: [ubuntu-18.04,windows-2019,macos-10.15]
+        config:
+          - {name: x86_64-macos, os: macos-11, shell: bash}
+          - {name: x86_64-linux, os: ubuntu-20.04, shell: bash}
+          - {name: x86_64-windows, os: windows-latest, shell: bash}
 
-    steps:
-    - uses: actions/checkout@v2
+    env:
+      VULKANSDK_VER: 1.2.182.0
 
-    - name: Install System Requirements
-      if: runner.os == 'Macos'
-      shell: bash
+    steps:
+    - uses: actions/checkout@v3
+    
+    - name: Set up test environment (macOS)
       run: |
-        brew install sunnycase/core/libomp@11.1.0
-
-    - name: Install Vulkan SDK
-      if: runner.os == 'Linux'
-      shell: bash
-      env:
-        VULKANSDK_VER: 1.2.182.0
+        brew install sunnycase/core/libomp@14.0.6
+        aria2c --parameterized-uri=true https://{sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/mac,distfiles.macports.org/MoltenVK}/vulkansdk-macos-${VULKANSDK_VER}.dmg
+        hdiutil attach ./vulkansdk-macos-*.dmg
+        sudo /Volumes/vulkansdk-macos-*/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $HOME/VulkanSDK --accept-licenses --default-answer --confirm-command install
+        hdiutil detach /Volumes/vulkansdk-macos-*
+        echo "VULKAN_SDK=$HOME/VulkanSDK/macOS" >> $GITHUB_ENV
+        wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-macos-10.15-x86_64.zip -O swiftshader.zip
+        unzip swiftshader.zip
+        sudo cmake -E make_directory /usr/local/share/vulkan/icd.d
+        sudo cp lib/* /usr/local/share/vulkan/icd.d
+        echo "PYTHONPATH=$GITHUB_WORKSPACE/install/lib:$GITHUB_WORKSPACE/install/python:$GITHUB_WORKSPACE/tests" >> $GITHUB_ENV
+      if: runner.os == 'macOS'
+      
+    - name: Set up test environment (Linux)
       run: |
-        wget https://sdk.lunarg.com/sdk/download/1.2.182.0/linux/vulkansdk-linux-x86_64-${VULKANSDK_VER}.tar.gz -O vulkansdk.tar.gz
+        wget https://sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/linux/vulkansdk-linux-x86_64-${VULKANSDK_VER}.tar.gz -O vulkansdk.tar.gz
         tar xf vulkansdk.tar.gz
         sudo cp -P ${VULKANSDK_VER}/x86_64/lib/libvulkan.so* /usr/local/lib/
-
-    - name: Install Vulkan SDK
-      if: runner.os == 'Windows'
-      shell: pwsh
-      run: |
-        Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.2.182.0/windows/VulkanSDK-1.2.182.0-Installer.exe -O VulkanSDK-Installer.exe
-        .\VulkanSDK-Installer.exe /S
-
-    - name: Install SwiftShader
-      if: runner.os != 'Windows'
-      shell: bash
-      run: |
-        wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-${{matrix.os}}-x86_64.zip -O swiftshader.zip
+        wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-ubuntu-18.04-x86_64.zip -O swiftshader.zip
         unzip swiftshader.zip
         sudo cmake -E make_directory /usr/local/share/vulkan/icd.d
         sudo cp lib/* /usr/local/share/vulkan/icd.d
-
-    - name: Install SwiftShader
-      if: runner.os == 'Windows'
+        echo "PYTHONPATH=$GITHUB_WORKSPACE/install/lib:$GITHUB_WORKSPACE/install/python:$GITHUB_WORKSPACE/tests" >> $GITHUB_ENV
+      if: runner.os == 'Linux'
+      
+    - name: Set up test environment (Windows)
       shell: pwsh
       run: |
-        Invoke-WebRequest -Uri https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-${{matrix.os}}-x86_64.zip -OutFile swiftshader.zip
+        Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/${env:VULKANSDK_VER}/windows/VulkanSDK-${env:VULKANSDK_VER}-Installer.exe -O VulkanSDK-Installer.exe
+        .\VulkanSDK-Installer.exe /S
+        Invoke-WebRequest -Uri https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-windows-2019-x86_64.zip -OutFile swiftshader.zip
         Expand-Archive swiftshader.zip
         Copy-Item swiftshader\lib\vk_swiftshader_icd.json swiftshader\bin\
+        echo "VK_ICD_FILENAMES=${env:GITHUB_WORKSPACE}/swiftshader/bin/vk_swiftshader_icd.json" >> $env:GITHUB_ENV
+        echo "PYTHONPATH=${env:GITHUB_WORKSPACE}/install/lib;${env:GITHUB_WORKSPACE}/install/python;${env:GITHUB_WORKSPACE}/tests" >> $env:GITHUB_ENV
+      if: runner.os == 'Windows'
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Python Packages
-      if: runner.os == 'Linux'
-      shell: bash
-      run: |
-        pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python
-        pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install imageio==2.15.0
-        pip install https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-manylinux_2_24_x86_64.whl
-        pip install pytest
-
-    - name: Install Python Packages
-      if: runner.os == 'Windows'
-      shell: bash
-      run: |
-        pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python
-        pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install imageio==2.15.0
-        pip install https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-win_amd64.whl
-        pip install pytest
-
-    - name: Install Python Packages
-      if: runner.os == 'Macos'
-      shell: bash
-      run: |
-        pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python
-        pip install torch==1.9.0 torchvision==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
-        pip install imageio==2.15.0
-        pip install https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-macosx_10_9_x86_64.whl
-        pip install pytest
+      run: pip install -r requirements.test.txt
 
     - name: Create Test Environment
-      working-directory: ${{github.workspace}}
-      shell: bash
-      run: |
-        mkdir test_results
+      run: mkdir test_results
 
-    - name: Download nncase Artifact
-      uses: actions/download-artifact@v2.0.9
+    - name: Install nncase
+      uses: actions/download-artifact@v3
       with:
-        name: nncase-${{matrix.os}}-x86_64
+        name: nncase-${{matrix.config.name}}
         path: ${{github.workspace}}/install
 
-    - name: Test
-      working-directory: ${{github.workspace}}
-      if: runner.os != 'Windows'
-      shell: bash
+    - name: Generate benchmark kmodels
+      working-directory: ${{github.workspace}}/benchmark
       env:
-        PYTHONPATH: ${{github.workspace}}/install/lib:${{github.workspace}}/install/python:${{github.workspace}}/tests
+        PATH: ${{github.workspace}}/install/bin
       run: |
-        pytest tests/other --doctest-modules --junitxml=test_results/other.xml
-        pytest tests/importer --doctest-modules --junitxml=test_results/importer.xml
-        pytest tests/schedule --doctest-modules --junitxml=test_results/schedule.xml
-        pytest tests/graph_partition --doctest-modules --junitxml=test_results/graph_partition.xml
-        pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml
+        python gen_kmodel.py
+      if: matrix.config.name == 'x86_64-linux'
+
+    - uses: stefanzweifel/git-auto-commit-action@v4  # commits files matching file_pattern back to the branch
+      with:
+        commit_message: Update benchmark kmodels
+        file_pattern: 'benchmark/models/*'
+      if: matrix.config.name == 'x86_64-linux' && github.event_name == 'push'  # pull_request runs check out a detached merge ref
 
     - name: Test
       working-directory: ${{github.workspace}}
-      if: runner.os == 'Windows'
       shell: bash
       env:
         PATH: ${{github.workspace}}/install/bin
-        PYTHONPATH: ${{github.workspace}}/install/lib;${{github.workspace}}/install/python;${{github.workspace}}/tests
-        VK_ICD_FILENAMES: ${{github.workspace}}/swiftshader/bin/vk_swiftshader_icd.json
       run: |
         pytest tests/other --doctest-modules --junitxml=test_results/other.xml
         pytest tests/importer --doctest-modules --junitxml=test_results/importer.xml
@@ -210,7 +165,7 @@ jobs:
         pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml
 
     - name: Publish Test Results
-      uses: EnricoMi/publish-unit-test-result-action@v1
-      if: always() && runner.os == 'Linux'
+      uses: EnricoMi/publish-unit-test-result-action@v2
+      if: always() && matrix.config.name == 'x86_64-linux'
       with:
         files: test_results/*.xml
diff --git a/.github/workflows/compiler-python-release.yml b/.github/workflows/compiler-python-release.yml
index 8b8b7fdaf5..6ac04e6a27 100644
--- a/.github/workflows/compiler-python-release.yml
+++ b/.github/workflows/compiler-python-release.yml
@@ -5,61 +5,58 @@ on:
     tags:
       - '*'
 
-env:
-  BUILD_TYPE: Release
-
 jobs:
   build:
-    runs-on: ${{ matrix.os }}
+    name: ${{matrix.config.name}}
+    runs-on: ${{matrix.config.os}}
     strategy:
       matrix:
-        os: [ubuntu-18.04,windows-2019,macos-10.15]
+        config:
+          - {name: x86_64-macos, os: macos-11}
+          - {name: x86_64-linux, os: ubuntu-20.04}
+          - {name: x86_64-windows, os: windows-latest, arch: x64}
+
+    env:
+      VULKANSDK_VER: 1.2.182.0
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - uses: seanmiddleditch/gha-setup-ninja@master
 
+    - name: Set up build environment (Windows, Visual Studio)
+      uses: ilammy/msvc-dev-cmd@v1
+      with:
+        arch: ${{matrix.config.arch}}
+      if: runner.os == 'Windows'
+      
+    - name: Set up build environment (macOS)
+      run: |
+        brew install sunnycase/core/libomp@14.0.6
+        aria2c --parameterized-uri=true https://{sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/mac,distfiles.macports.org/MoltenVK}/vulkansdk-macos-${VULKANSDK_VER}.dmg
+        hdiutil attach ./vulkansdk-macos-*.dmg
+        sudo /Volumes/vulkansdk-macos-*/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $HOME/VulkanSDK --accept-licenses --default-answer --confirm-command install
+        hdiutil detach /Volumes/vulkansdk-macos-*
+        echo "VULKAN_SDK=$HOME/VulkanSDK/macOS" >> $GITHUB_ENV
+        wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-macos-10.15-x86_64.zip -O swiftshader.zip
+        unzip swiftshader.zip
+        sudo cmake -E make_directory /usr/local/share/vulkan/icd.d
+        sudo cp lib/* /usr/local/share/vulkan/icd.d
+      if: runner.os == 'macOS'  # Homebrew / Vulkan SDK / SwiftShader setup: macOS runners only
+
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install cibuildwheel
       run: pip install cibuildwheel
 
-    - name: Install System Requirements
-      if: runner.os == 'Macos'
-      shell: bash
-      run: |
-        brew install sunnycase/core/libomp@11.1.0
-
-    - name: Add msbuild to PATH
-      if: runner.os == 'Windows'
-      uses: ilammy/msvc-dev-cmd@v1.10.0
-
     - name: Build wheel
       run: python -m cibuildwheel --output-dir wheelhouse
-    
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      if: runner.os == 'Windows'
-      with:
-        name: nncase-python-windows
-        path: ${{github.workspace}}/wheelhouse
-        if-no-files-found: error
 
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      if: runner.os == 'Linux'
-      with:
-        name: nncase-python-linux
-        path: ${{github.workspace}}/wheelhouse
-        if-no-files-found: error
-
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      if: runner.os == 'Macos'
+    - name: Upload nncase-python Build Artifact
+      uses: actions/upload-artifact@v3
       with:
-        name: nncase-python-macos
+        name: nncase-python-${{matrix.config.name}}
         path: ${{github.workspace}}/wheelhouse
         if-no-files-found: error
diff --git a/.github/workflows/compiler-test.yml b/.github/workflows/compiler-test.yml
index 623b778646..45a84fc791 100644
--- a/.github/workflows/compiler-test.yml
+++ b/.github/workflows/compiler-test.yml
@@ -2,112 +2,98 @@ name: compiler-test
 
 on: [push, pull_request]
 
-env:
-  BUILD_TYPE: Release
-
 jobs:
   build:
+    name: build-${{matrix.config.name}}
     runs-on: [self-hosted]
+    strategy:
+      matrix:
+        config:
+          - {name: x86_64-linux, shell: bash, cmakeArgs: '', buildType: Release}
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Conan
-      shell: bash
-      run: |
-        pip install conan
+      run: pip install conan==1.59.0
 
-    - name: Configure Conan
-      if: runner.os == 'Linux'
-      shell: bash
+    - name: Configure Conan (Linux)
       run: |
         conan profile update settings.compiler.libcxx=libstdc++11 default
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      env:
-        CC: gcc-10
-        CXX: g++-10
+        echo "CC=gcc-10" >> $GITHUB_ENV
+        echo "CXX=g++-10" >> $GITHUB_ENV
       if: runner.os == 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Configure CMake
-      if: runner.os != 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
 
     - name: Build
-      shell: bash
-      working-directory: ${{github.workspace}}/build
       run: |
-        cmake --build . --config $BUILD_TYPE
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
+        cmake --build build --config ${{matrix.config.buildType}}
 
     - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix /tmp/nncase
+      run: cmake --install build --prefix /tmp/nncase
 
     - name: CTest
-      shell: bash
       working-directory: ${{github.workspace}}/build/tests/kernels
-      run: ctest -C $BUILD_TYPE
+      run: ctest -C ${{matrix.config.buildType}}
 
   test-compiler:
     needs: [build]
+    name: test-${{matrix.config.name}}
     runs-on: [self-hosted]
+    strategy:
+      matrix:
+        config:
+          - {name: x86_64-linux, shell: bash}
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Python Packages
-      if: runner.os != 'Macos'
-      shell: bash
-      run: |
-        pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python
-        pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install imageio==2.15.0
-        pip install kendryte_caffe
-        pip install pytest
+      run: pip install -r requirements.test.txt
 
     - name: Create Test Environment
-      working-directory: ${{github.workspace}}
-      shell: bash
-      run: |
-        pip install pytest pytest-xdist
-        mkdir test_results
+      run: mkdir test_results
 
     - name: Test
       working-directory: ${{github.workspace}}
-      if: runner.os != 'Windows'
-      shell: bash
       env:
         PYTHONPATH: /tmp/nncase/lib:/tmp/nncase/python:${{github.workspace}}/tests
         ONNX_MODELS_DIR: /compiler/github-runner/onnx-models
         TFLITE_MODELS_DIR: /compiler/github-runner/tflite-models
         DATASET_DIR: /compiler/share
       run: |
-
         pytest -n 50 --dist=load tests/other --doctest-modules --junitxml=test_results/other.xml
         pytest -n 50 --dist=load tests/importer --doctest-modules --junitxml=test_results/importer.xml
         pytest -n 50 --dist=load tests/schedule --doctest-modules --junitxml=test_results/schedule.xml
         pytest -n 50 --dist=load tests/graph_partition --doctest-modules --junitxml=test_results/graph_partition.xml
         pytest -n 50 --dist=load tests/transform --doctest-modules --junitxml=test_results/transform.xml
-        pytest -n 8 tests/models/onnx-model-zoo --doctest-modules --junitxml=test_results/models.xml
+        pytest -n 8 tests/models/onnx-model-zoo --doctest-modules --junitxml=test_results/onnx-models.xml
         pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml
         for dir in `ls dataset_tests_output`; do cat dataset_tests_output/$dir/dataset_test_result.txt; done
-
+      if: runner.os != 'Windows'
+
+    - name: Upload Test Results
+      if: always()  # upload the JUnit reports even when the Test step fails
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncase-test_results-${{matrix.config.name}}
+        path: ${{github.workspace}}/test_results
+        if-no-files-found: error
+
+    - name: Upload Dataset Test Results
+      if: always()  # keep the dataset output available after a failed Test step
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncase-dataset_test_results-${{matrix.config.name}}
+        path: ${{github.workspace}}/dataset_tests_output
+        if-no-files-found: error
diff --git a/.github/workflows/dataset-test.yml b/.github/workflows/dataset-test.yml
index 1ce1cbbbe5..5c81bcee83 100644
--- a/.github/workflows/dataset-test.yml
+++ b/.github/workflows/dataset-test.yml
@@ -5,99 +5,72 @@ on:
     - cron: '0 17 * * 6'
     # 1:00 am
 
-env:
-  BUILD_TYPE: Release
-
 jobs:
   build:
+    name: build-${{matrix.config.name}}
     runs-on: [self-hosted]
+    strategy:
+      matrix:
+        config:
+          - {name: x86_64-linux, shell: bash, cmakeArgs: '', buildType: Release}
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Conan
-      shell: bash
-      run: |
-        pip install conan
+      run: pip install conan==1.59.0
 
-    - name: Configure Conan
-      if: runner.os == 'Linux'
-      shell: bash
+    - name: Configure Conan (Linux)
       run: |
         conan profile update settings.compiler.libcxx=libstdc++11 default
+      if: runner.os == 'Linux'
 
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
+    - name: Build
       env:
         CC: gcc-10
         CXX: g++-10
-      if: runner.os == 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Configure CMake
-      if: runner.os != 'Linux'
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
-      shell: bash
-      working-directory: ${{github.workspace}}/build
       run: |
-        cmake --build . --config $BUILD_TYPE
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation}
+        cmake --build build --config ${{matrix.config.buildType}}
 
     - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix /tmp/nncase
+      run: cmake --install build --prefix /tmp/nncase
 
     - name: CTest
-      shell: bash
       working-directory: ${{github.workspace}}/build/tests/kernels
-      run: ctest -C $BUILD_TYPE
+      run: ctest -C ${{matrix.config.buildType}}
 
   dataset-test:
     needs: [build]
+    name: test-${{matrix.config.name}}
     runs-on: [self-hosted]
     timeout-minutes: 4320
+    strategy:
+      matrix:
+        config:
+          - {name: x86_64-linux, shell: bash}
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
 
     - name: Setup Python
-      uses: actions/setup-python@v2.2.1
+      uses: actions/setup-python@v4
       with:
         python-version: 3.7
 
     - name: Install Python Packages
-      if: runner.os != 'Macos'
-      shell: bash
-      run: |
-        pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python
-        pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
-        pip install imageio==2.15.0
-        pip install kendryte_caffe
-        pip install pytest
+      run: pip install -r requirements.test.txt
 
     - name: Create Test Environment
-      working-directory: ${{github.workspace}}
-      shell: bash
-      run: |
-        pip install pytest pytest-xdist
-        mkdir test_results
+      run: mkdir test_results
 
     - name: Test
       working-directory: ${{github.workspace}}
-      if: runner.os != 'Windows'
       shell: bash
       env:
         PYTHONPATH: /tmp/nncase/lib:/tmp/nncase/python:${{github.workspace}}/tests
@@ -105,6 +78,14 @@ jobs:
         TFLITE_MODELS_DIR: /compiler/github-runner/tflite-models
         DATASET_DIR: /compiler/share
       run: |
-        
-        pytest -n 8 tests/models/tflite-model-zoo --doctest-modules --junitxml=test_results/models-dataset.xml
+        pytest -n 8 tests/models/tflite-model-zoo --doctest-modules --junitxml=test_results/tflite-models.xml
         for dir in `ls dataset_tests_output`; do cat dataset_tests_output/$dir/dataset_test_result.txt; done
+      if: runner.os != 'Windows'
+
+    - name: Upload Dataset Test Results
+      if: always()  # keep the dataset output available after a failed Test step
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncase-dataset_test_results-${{matrix.config.name}}
+        path: ${{github.workspace}}/dataset_tests_output
+        if-no-files-found: error
diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index cb2ccbcaa1..41417d9ffc 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -5,25 +5,12 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
     paths:
-      - '**.h'
-      - '**.c'
-      - '**.cpp'
       - '**.py'
 
 jobs:
-  check_clang_format:
-    name: Check clang-format
-    runs-on: ubuntu-18.04
-    steps:
-      - uses: actions/checkout@v2
-      - uses: DoozyX/clang-format-lint-action@v0.11
-        with:
-          source: 'tests src include modules python targets'
-          extensions: 'h,c,cc,cxx,cpp,hpp,cppm'
-          clangFormatVersion: 11
   check_autopep8_format:
     name: Check autopep8-format
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     steps:
       - name: autopep8
         id: autopep8
@@ -33,4 +20,4 @@ jobs:
 
       - name: Fail if autopep8 made changes
         if: steps.autopep8.outputs.exit-code == 2
-        run: exit 1
\ No newline at end of file
+        run: exit 1
diff --git a/.github/workflows/runtime-build.yml b/.github/workflows/runtime-build.yml
new file mode 100644
index 0000000000..331040e40b
--- /dev/null
+++ b/.github/workflows/runtime-build.yml
@@ -0,0 +1,124 @@
+name: runtime-build
+
+on: [push, pull_request]
+
+jobs:
+  build-native:
+    name: ${{matrix.config.name}}
+    runs-on: ${{matrix.config.os}}
+    strategy:
+      matrix:
+        config:  # cmakeArgs carries per-target extras only; the Ninja generator is already passed by the Configure CMake step
+          - {name: x86_64-macos, os: macos-11, cmakeArgs: '', buildType: Release}
+          - {name: x86_64-linux, os: ubuntu-20.04, cmakeArgs: '', buildType: Release}
+          - {name: x86_64-windows, os: windows-latest, arch: x64, cmakeArgs: '', buildType: Release}
+
+    steps:
+    - uses: actions/checkout@v3
+    - uses: seanmiddleditch/gha-setup-ninja@master
+
+    - name: Set up build environment (Windows, Visual Studio)
+      uses: ilammy/msvc-dev-cmd@v1
+      with:
+        arch: ${{matrix.config.arch}}
+      if: runner.os == 'Windows'
+      
+    - name: Set up build environment (macOS)
+      run: |
+        brew install sunnycase/core/libomp@14.0.6
+      if: runner.os == 'macOS'  # Homebrew step: macOS runners only (documented runner.os spelling)
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.7'  # quoted: a later bump such as 3.10 would otherwise be read as the float 3.1
+
+    - name: Install Conan
+      run: pip install conan==1.59.0
+
+    - name: Configure Conan (Linux)
+      run: |
+        conan profile new default --detect
+        conan profile update settings.compiler.libcxx=libstdc++11 default
+        echo "CC=gcc-10" >> $GITHUB_ENV
+        echo "CXX=g++-10" >> $GITHUB_ENV
+      if: runner.os == 'Linux'
+
+    - name: Configure CMake
+      shell: bash
+      run: |
+        cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DPython3_ROOT_DIR=${pythonLocation}
+
+    - name: Build & Install
+      shell: bash  # same shell on every OS; the Windows default (pwsh) only checks the last command's exit code
+      run: |
+        cmake --build build --config ${{matrix.config.buildType}} && cmake --install build --prefix install
+
+    - name: Benchmark
+      run: |
+        ${{github.workspace}}/install/bin/benchnncase > benchnncase.log
+        cat benchnncase.log
+
+    - name: Upload nncaseruntime Build Artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncaseruntime-${{matrix.config.name}}
+        path: ${{github.workspace}}/install
+        if-no-files-found: error
+
+    - name: Upload nncaseruntime Benchmark
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncaseruntime-benchmark-${{matrix.config.name}}
+        path: ${{github.workspace}}/benchnncase.log
+        if-no-files-found: error
+
+  build-cross:
+    name: ${{matrix.config.name}}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        config:
+          - {name: riscv64-none-k210, shell: bash, toolchain: k210, cmakeArgs: -DK210_SDK_DIR=$GITHUB_WORKSPACE/kendryte-standalone-sdk-develop, buildType: Release}
+
+    steps:
+    - uses: actions/checkout@v3
+    - uses: seanmiddleditch/gha-setup-ninja@master
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.7'  # quoted: a later bump such as 3.10 would otherwise be read as the float 3.1
+        
+    - name: Install K210 Baremetal SDK
+      shell: bash
+      run: |
+        wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz
+        sudo tar xf $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz -C $GITHUB_WORKSPACE
+        wget https://github.com/kendryte/kendryte-standalone-sdk/archive/refs/heads/develop.tar.gz -O $GITHUB_WORKSPACE/k210-sdk.tar.gz
+        sudo tar xf $GITHUB_WORKSPACE/k210-sdk.tar.gz -C $GITHUB_WORKSPACE
+        echo "RISCV_ROOT_PATH=$GITHUB_WORKSPACE/kendryte-toolchain" >> $GITHUB_ENV
+      if: matrix.config.name == 'riscv64-none-k210'
+        
+    - name: Install Conan
+      run: pip install conan==1.59.0
+
+    - name: Configure Conan (Linux)
+      run: |
+        conan profile new default --detect
+        conan profile update settings.compiler.libcxx=libstdc++11 default
+
+    - name: Build
+      run: |
+        cmake -B build -G Ninja -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/${{matrix.config.toolchain}}.toolchain.cmake -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DPython3_ROOT_DIR=${pythonLocation}
+        cmake --build build --config ${{matrix.config.buildType}}
+
+    - name: Install
+      run: cmake --install build --prefix install
+
+    - name: Upload nncaseruntime Build Artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: nncaseruntime-${{matrix.config.name}}
+        path: ${{github.workspace}}/install
+        if-no-files-found: error
diff --git a/.github/workflows/runtime-k210.yml b/.github/workflows/runtime-k210.yml
deleted file mode 100644
index cc98ee8c3b..0000000000
--- a/.github/workflows/runtime-k210.yml
+++ /dev/null
@@ -1,80 +0,0 @@
-name: runtime-k210
-
-on: [push, pull_request]
-
-env:
-  BUILD_TYPE: Release
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-18.04]
-
-    steps:
-    - uses: actions/checkout@v2
-    - uses: seanmiddleditch/gha-setup-ninja@master
-        
-    - name: Download K210 Toolchains
-      if: runner.os == 'Linux'
-      shell: bash
-      run: |
-        wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz
-        
-    - name: Install K210 Toolchains
-      shell: bash
-      run: |
-        sudo tar xf $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz -C $GITHUB_WORKSPACE
-        
-    - name: Download K210 SDK
-      shell: bash
-      run: |
-        wget https://github.com/kendryte/kendryte-standalone-sdk/archive/refs/heads/develop.tar.gz -O $GITHUB_WORKSPACE/k210-sdk.tar.gz
-        
-    - name: Install K210 SDK
-      shell: bash
-      run: |
-        sudo tar xf $GITHUB_WORKSPACE/k210-sdk.tar.gz -C $GITHUB_WORKSPACE
-          
-    - name: Setup Python
-      uses: actions/setup-python@v2.2.1
-      with:
-        python-version: 3.7
-
-    - name: Install Conan
-      run: |
-        pip install conan
-        
-    - name: Configure Conan
-      shell: bash
-      run: |
-        conan profile new default --detect
-        conan profile update settings.compiler.libcxx=libstdc++11 default
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      env:
-        RISCV_ROOT_PATH: ${{github.workspace}}/kendryte-toolchain
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DK210_SDK_DIR=$GITHUB_WORKSPACE/kendryte-standalone-sdk-develop -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/k210.toolchain.cmake -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
-      working-directory: ${{github.workspace}}/build
-      shell: bash
-      run: cmake --build . --config $BUILD_TYPE
-
-    - name: Install
-      working-directory: ${{github.workspace}}/build
-      shell: bash
-      run: cmake --install . --prefix ../install
-      
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncaseruntime-k210
-        path: ${{github.workspace}}/install
-        if-no-files-found: error
diff --git a/.github/workflows/runtime-linux-x64-gcc.yml b/.github/workflows/runtime-linux-x64-gcc.yml
deleted file mode 100644
index fe755cd81a..0000000000
--- a/.github/workflows/runtime-linux-x64-gcc.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: runtime-linux-x64-gcc
-
-on: [push, pull_request]
-
-env:
-  BUILD_TYPE: Release
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-18.04]
-
-    steps:
-    - uses: actions/checkout@v2
-    - uses: seanmiddleditch/gha-setup-ninja@master
-          
-    - name: Setup Python
-      uses: actions/setup-python@v2.2.1
-      with:
-        python-version: 3.7
-
-    - name: Install Conan
-      run: |
-        pip install conan
-        
-    - name: Configure Conan
-      shell: bash
-      run: |
-        conan profile new default --detect
-        conan profile update settings.compiler.libcxx=libstdc++11 default
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      env:
-        CC: gcc-7
-        CXX: g++-7
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --build . --config $BUILD_TYPE
-
-    - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix ../install
-
-    - name: Benchmark
-      shell: bash
-      working-directory: ${{github.workspace}}
-      run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log
-      
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncaseruntime-linux-x64-gcc
-        path: ${{github.workspace}}/install
-        if-no-files-found: error
-      
-    - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncasebenchmark-linux-x64-gcc
-        path: ${{github.workspace}}/benchnncase.log
-        if-no-files-found: error
diff --git a/.github/workflows/runtime-macos-x64-appleclang.yml b/.github/workflows/runtime-macos-x64-appleclang.yml
deleted file mode 100644
index 45b4a489dc..0000000000
--- a/.github/workflows/runtime-macos-x64-appleclang.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-name: runtime-macos-x64-appleclang
-
-on: [push, pull_request]
-
-env:
-  BUILD_TYPE: Release
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [macos-10.15]
-
-    steps:
-    - uses: actions/checkout@v2
-    - uses: seanmiddleditch/gha-setup-ninja@master
-        
-    - name: Install System Requirements
-      shell: bash
-      run: |
-        brew install sunnycase/core/libomp@11.1.0
-          
-    - name: Setup Python
-      uses: actions/setup-python@v2.2.1
-      with:
-        python-version: 3.7
-
-    - name: Install Conan
-      shell: bash
-      run: |
-        pip install conan
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --build . --config $BUILD_TYPE
-
-    - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix ../install
-
-    - name: Benchmark
-      shell: bash
-      working-directory: ${{github.workspace}}
-      run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log
-      
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncaseruntime-macos-x64-appleclang
-        path: ${{github.workspace}}/install
-        if-no-files-found: error
-      
-    - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncasebenchmark-macos-x64-appleclang
-        path: ${{github.workspace}}/benchnncase.log
-        if-no-files-found: error
diff --git a/.github/workflows/runtime-win-x64-msvc.yml b/.github/workflows/runtime-win-x64-msvc.yml
deleted file mode 100644
index 5cef593961..0000000000
--- a/.github/workflows/runtime-win-x64-msvc.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-name: runtime-win-x64-msvc
-
-on: [push, pull_request]
-
-env:
-  BUILD_TYPE: Release
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [windows-2019]
-
-    steps:
-    - uses: actions/checkout@v2
-    - uses: seanmiddleditch/gha-setup-ninja@master
-
-    - name: Add msbuild to PATH
-      if: runner.os == 'Windows'
-      uses: ilammy/msvc-dev-cmd@v1.10.0
-          
-    - name: Setup Python
-      uses: actions/setup-python@v2.2.1
-      with:
-        python-version: 3.7
-
-    - name: Install Conan
-      shell: bash
-      run: |
-        pip install conan
-
-    - name: Create Build Environment
-      run: cmake -E make_directory ${{github.workspace}}/build
-
-    - name: Configure CMake
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation}
-
-    - name: Build
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --build . --config $BUILD_TYPE
-
-    - name: Install
-      shell: bash
-      working-directory: ${{github.workspace}}/build
-      run: cmake --install . --prefix ../install
-
-    - name: Benchmark
-      shell: pwsh
-      working-directory: ${{github.workspace}}
-      run: .\install\bin\benchnncase.exe > benchnncase.log
-      
-    - name: Upload a Build Artifact
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncaseruntime-win-x64-msvc
-        path: ${{github.workspace}}/install
-        if-no-files-found: error
-      
-    - name: Upload Benchmark Result
-      uses: actions/upload-artifact@v2.2.2
-      with:
-        name: nncasebenchmark-win-x64-msvc
-        path: ${{github.workspace}}/benchnncase.log
-        if-no-files-found: error
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e7d77a2090..275db97f00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,31 +1,31 @@
-﻿cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.13)
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules)
 
 if(NOT DEFINED NNCASE_VERSION)
-    set(NNCASE_VERSION "1.0.0")
+    set(NNCASE_VERSION "1.9.0")
 endif()
 
 if(DEFINED ENV{NNCASE_VERSION_SUFFIX})
-    set(NNCASE_VERSION_SUFFIX $ENV{NNCASE_VERSION_SUFFIX})
+  set(NNCASE_VERSION_SUFFIX $ENV{NNCASE_VERSION_SUFFIX})
 endif()
 
 if(NOT DEFINED NNCASE_VERSION_SUFFIX)
-    find_package (Git)
-    execute_process(
-        COMMAND ${GIT_EXECUTABLE} describe --always --dirty
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-        OUTPUT_VARIABLE GIT_DESC
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    set(NNCASE_VERSION_SUFFIX "-${GIT_DESC}")
+  find_package(Git)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} describe --always --dirty
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_DESC
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(NNCASE_VERSION_SUFFIX "-${GIT_DESC}")
 endif()
 
-if (NOT PACKAGE_VERSION)
-    set(PACKAGE_VERSION
-        "${NNCASE_VERSION}${NNCASE_VERSION_SUFFIX}")
+if(NOT PACKAGE_VERSION)
+  set(PACKAGE_VERSION "${NNCASE_VERSION}${NNCASE_VERSION_SUFFIX}")
 endif()
 
-project(nncase
+project(
+  nncase
   VERSION ${NNCASE_VERSION}
   LANGUAGES C CXX ASM)
 
@@ -35,208 +35,259 @@ option(BUILD_PYTHON_BINDING "Build python binding" ON)
 option(BUILD_BENCHMARK "Build benchmark programs" ON)
 option(BUILD_TESTING "Build test programs" OFF)
 option(ENABLE_OP_PROFILE "Profile ops cast time" OFF)
-if (ENABLE_OP_PROFILE)
-    add_definitions(-DENABLE_OP_PROFILE)
+if(ENABLE_OP_PROFILE)
+  add_definitions(-DENABLE_OP_PROFILE)
 endif()
 
-if (BUILDING_RUNTIME)
-    option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" ON)
-    option(ENABLE_K210_RUNTIME "Enable k210 runtime" OFF)
-    option(DEFAULT_BUILTIN_RUNTIMES "Use default builtin runtimes" ON)
-    option(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL "Use default shared memory platform impl" ON)
+if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES # quoted: an empty value must not break if()
+   "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)|(X86_64)")
+  if(NOT TURNOFF_SIMD_OPTIMIZE) # opt-out switch for the x86_64 toolchain settings
+    include(toolchains/x86_64.toolchain.cmake)
+  endif()
+endif()
+
+if(BUILDING_RUNTIME)
+  option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" ON)
+  option(ENABLE_K210_RUNTIME "Enable k210 runtime" OFF)
+  option(DEFAULT_BUILTIN_RUNTIMES "Use default builtin runtimes" ON)
+  option(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL
+         "Use default shared memory platform impl" ON)
 endif()
 
 include(cmake/configure-conan.cmake)
 include(cmake/conan.cmake)
 
-if(NOT CONAN_EXPORTED) 
-    conan_check()
-    conan_add_remote(NAME sunnycase URL https://conan.sunnycase.moe INDEX 0)
+if(NOT CONAN_EXPORTED)
+  conan_check()
+  conan_add_remote(NAME sunnycase URL https://conan.sunnycase.moe INDEX 0)
 endif()
 
 if(CONAN_EXPORTED) # in conan local cache
-    message(STATUS "Standard Conan Installation")
-    include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
-    conan_basic_setup() # NOTE need manmul set cppstd in conanfile.py
+  message(STATUS "Standard Conan Installation")
+  include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
+  conan_basic_setup() # NOTE: cppstd must be set manually in conanfile.py
 else() # in user space
-    message(STATUS "Auto Cmake Conan Installation")
-    include(${CMAKE_SOURCE_DIR}/cmake/conan.cmake)
-    conan_cmake_run(CONANFILE conanfile.py
-                    BASIC_SETUP
-                    OPTIONS ${CONAN_OPTS}
-                    SETTINGS ${CONAN_SETTINGS}
-                    BUILD missing)
+  message(STATUS "Auto Cmake Conan Installation")
+  include(${CMAKE_SOURCE_DIR}/cmake/conan.cmake)
+  conan_cmake_run(
+    CONANFILE
+    conanfile.py
+    BASIC_SETUP
+    OPTIONS
+    ${CONAN_OPTS}
+    SETTINGS
+    ${CONAN_SETTINGS}
+    BUILD
+    missing)
 endif()
 
 include(${CMAKE_BINARY_DIR}/conan_paths.cmake)
 include(cmake/dependencies.cmake)
 
-if (BUILDING_RUNTIME)
-    set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include)
-    set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
-    set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party)
-    set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON)
-    
-    if (MSVC)
-        add_definitions(/D_CRT_SECURE_NO_WARNINGS /DNOMINMAX)
-        add_compile_options(/wd4267 /wd4251 /wd4244 /FC /utf-8 /W3 /WX)
+if(BUILDING_RUNTIME)
+  set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include)
+  set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
+  set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party)
+  set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+  if(MSVC)
+    add_definitions(/D_CRT_SECURE_NO_WARNINGS /DNOMINMAX)
+    add_compile_options(
+      /wd4267
+      /wd4251
+      /wd4244
+      /FC
+      /utf-8
+      /W3
+      /WX)
+  else()
+    add_compile_options(
+      -Wall
+      -Wextra
+      -pedantic
+      -Werror
+      -Wno-multichar
+      -Wno-missing-field-initializers
+      -Wno-unused-function
+      -Wno-type-limits)
+    if(APPLE)
+      add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized)
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      add_compile_options(-Wno-uninitialized -Wno-unused-private-field)
     else()
-        add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits)
-        if (APPLE)
-            add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized)
-        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_options(-Wno-uninitialized -Wno-unused-private-field)
-        else ()
-            add_compile_options(-Wno-maybe-uninitialized -Wno-unused-private-field)
-        endif()
-    endif()
-    
-    include_directories(${NNCASE_MAIN_INCLUDE_DIR})
-    include_directories(${NNCASE_INCLUDE_DIR})
-    
-    add_subdirectory(include/nncase)
-    add_subdirectory(src/kernels)
-    add_subdirectory(src/runtime)
-    add_subdirectory(src/functional)
-    if(BUILD_BENCHMARK)
-        add_subdirectory(benchmark)
+      add_compile_options(-Wno-maybe-uninitialized -Wno-unused-private-field)
     endif()
-    
-    # Python binding
-    if(BUILD_PYTHON_BINDING)
-        add_subdirectory(python/nncaseruntime/native)
-    endif()
-        
-    install(DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase
-        DESTINATION include
-        COMPONENT nncase-headers
-        FILES_MATCHING
-        PATTERN "*version.h"
-        PATTERN "CMakeFiles" EXCLUDE
-        )
-    
-    install(DIRECTORY include/nncase/kernels
-        DESTINATION include/nncase
-        COMPONENT nncase-headers
-        FILES_MATCHING
-        PATTERN "*.def"
-        PATTERN "*.h"
-        PATTERN "*.hpp"
-        PATTERN "*.td"
-        PATTERN "*.inc"
-        PATTERN "LICENSE.TXT"
-        )
-    
-    install(DIRECTORY include/nncase/runtime
-        DESTINATION include/nncase
-        COMPONENT nncase-headers
-        FILES_MATCHING
-        PATTERN "*.def"
-        PATTERN "*.h"
-        PATTERN "*.hpp"
-        PATTERN "*.td"
-        PATTERN "*.inc"
-        PATTERN "LICENSE.TXT"
-        )
+  endif()
+
+  include_directories(${NNCASE_MAIN_INCLUDE_DIR})
+  include_directories(${NNCASE_INCLUDE_DIR})
+
+  add_subdirectory(include/nncase)
+  add_subdirectory(src/kernels)
+  add_subdirectory(src/runtime)
+  add_subdirectory(src/functional)
+  if(BUILD_BENCHMARK)
+    add_subdirectory(benchmark)
+  endif()
+
+  # Python binding
+  if(BUILD_PYTHON_BINDING)
+    add_subdirectory(python/nncaseruntime/native)
+  endif()
+
+  install(
+    DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase
+    DESTINATION include
+    COMPONENT nncase-headers
+    FILES_MATCHING
+    PATTERN "*version.h"
+    PATTERN "CMakeFiles" EXCLUDE)
+
+  install(
+    DIRECTORY include/nncase/kernels
+    DESTINATION include/nncase
+    COMPONENT nncase-headers
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.hpp"
+    PATTERN "*.td"
+    PATTERN "*.inc"
+    PATTERN "LICENSE.TXT")
+
+  install(
+    DIRECTORY include/nncase/runtime
+    DESTINATION include/nncase
+    COMPONENT nncase-headers
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.hpp"
+    PATTERN "*.td"
+    PATTERN "*.inc"
+    PATTERN "LICENSE.TXT")
 else()
-    set(CMAKE_SKIP_RPATH OFF)
-
-    set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include)
-    set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
-    set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party)
-    set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON)
-    if (APPLE)
-        set(CMAKE_MACOSX_RPATH TRUE)
-        set(CMAKE_INSTALL_RPATH "@loader_path")
-        set(CMAKE_INSTALL_NAME_DIR "@rpath")
-    else ()
-        set(CMAKE_INSTALL_RPATH "$ORIGIN")
-    endif()
-    
-    if (MSVC)
-        add_definitions(/D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /D_CRT_SECURE_NO_WARNINGS /DNOMINMAX)
-        add_compile_options(/wd4267 /wd4251 /wd4244 /FC /utf-8 /W3 /WX)
-        set(PYBIND11_CPP_STANDARD "/std:c++latest")
+  set(CMAKE_SKIP_RPATH OFF)
+
+  set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include)
+  set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
+  set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party)
+  set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON)
+  if(APPLE)
+    set(CMAKE_MACOSX_RPATH TRUE)
+    set(CMAKE_INSTALL_RPATH "@loader_path")
+    set(CMAKE_INSTALL_NAME_DIR "@rpath")
+  else()
+    set(CMAKE_INSTALL_RPATH "$ORIGIN")
+  endif()
+
+  if(MSVC)
+    add_definitions(/D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS
+                    /D_CRT_SECURE_NO_WARNINGS /DNOMINMAX)
+    add_compile_options(
+      /wd4267
+      /wd4251
+      /wd4244
+      /FC
+      /utf-8
+      /W3
+      /WX)
+    set(PYBIND11_CPP_STANDARD "/std:c++latest")
+  else()
+    add_compile_options(-fvisibility=hidden)
+    add_compile_options(
+      -Wall
+      -Wextra
+      -pedantic
+      -Werror
+      -Wno-multichar
+      -Wno-missing-field-initializers
+      -Wno-unused-function
+      -Wno-type-limits
+      -Wno-unused-local-typedefs
+      -Wno-sign-compare)
+    if(APPLE)
+      add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized
+                          -Wno-deprecated)
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      add_compile_options(-Wno-uninitialized)
     else()
-        add_compile_options(-fvisibility=hidden)
-        add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare)
-        if (APPLE)
-            add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized -Wno-deprecated)
-        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_options(-Wno-uninitialized)
-        else ()
-            add_compile_options(-Wno-maybe-uninitialized -Wno-deprecated-copy)
-            add_link_options(-Wl,--exclude-libs,ALL)
-        endif()
-    endif()
-    
-    include_directories(${NNCASE_MAIN_INCLUDE_DIR})
-    include_directories(${NNCASE_INCLUDE_DIR})
-    
-    add_subdirectory(include/nncase)
-    add_subdirectory(src/nncase)
-    add_subdirectory(src/data)
-    add_subdirectory(src/ir)
-    add_subdirectory(src/importer)
-    add_subdirectory(src/schedule)
-    add_subdirectory(src/evaluator)
-    add_subdirectory(src/functional)
-    add_subdirectory(src/transforms)
-    add_subdirectory(src/codegen)
-    add_subdirectory(src/kernels)
-    add_subdirectory(src/runtime)
-    add_subdirectory(src/targets)
-    add_subdirectory(src/plugin)
-    add_subdirectory(src/cli)
-
-    if(BUILD_TESTING)
-        add_subdirectory(tests/kernels)
-    endif()
-    
-    # Python binding
-    if(BUILD_PYTHON_BINDING)
-        add_subdirectory(python/nncase/native)
+      add_compile_options(-Wno-maybe-uninitialized -Wno-deprecated-copy)
+      if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "12.0") # component-wise version compare
+        add_compile_options(-Wno-array-bounds -Wno-deprecated-declarations
+                            -Wno-restrict)
+      endif()
+      add_link_options(-Wl,--exclude-libs,ALL)
     endif()
-    
-    # Thrid party
-    add_subdirectory(third_party/onnx)
-    
-    install(DIRECTORY include/nncase
-        DESTINATION include
-        COMPONENT nncase-headers
-        FILES_MATCHING
-        PATTERN "*.def"
-        PATTERN "*.h"
-        PATTERN "*.hpp"
-        PATTERN "*.td"
-        PATTERN "*.inc"
-        PATTERN "LICENSE.TXT"
-        )
-        
-    install(DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase
-        DESTINATION include
-        COMPONENT nncase-headers
-        FILES_MATCHING
-        PATTERN "*.def"
-        PATTERN "*.h"
-        PATTERN "*.hpp"
-        PATTERN "*.td"
-        PATTERN "*.inc"
-        PATTERN "CMakeFiles" EXCLUDE
-        PATTERN "config.h" EXCLUDE
-        )
-    
-    install(DIRECTORY python/nncase
-        DESTINATION python
-        COMPONENT nncase-python
-        FILES_MATCHING
-        PATTERN "*.py"
-        )
-    
-    # Targets
-    add_subdirectory(targets/cpu)
-    add_subdirectory(targets/k210)
-    add_subdirectory(targets/vulkan)
+  endif()
+
+  include_directories(${NNCASE_MAIN_INCLUDE_DIR})
+  include_directories(${NNCASE_INCLUDE_DIR})
+
+  add_subdirectory(include/nncase)
+  add_subdirectory(src/nncase)
+  add_subdirectory(src/data)
+  add_subdirectory(src/ir)
+  add_subdirectory(src/importer)
+  add_subdirectory(src/schedule)
+  add_subdirectory(src/evaluator)
+  add_subdirectory(src/functional)
+  add_subdirectory(src/transforms)
+  add_subdirectory(src/codegen)
+  add_subdirectory(src/kernels)
+  add_subdirectory(src/runtime)
+  add_subdirectory(src/targets)
+  add_subdirectory(src/plugin)
+  add_subdirectory(src/cli)
+
+  if(BUILD_TESTING)
+    add_subdirectory(tests/kernels)
+  endif()
+
+  # Python binding
+  if(BUILD_PYTHON_BINDING)
+    add_subdirectory(python/nncase/native)
+  endif()
+
+  # Third party
+  add_subdirectory(third_party/onnx)
+
+  install(
+    DIRECTORY include/nncase
+    DESTINATION include
+    COMPONENT nncase-headers
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.hpp"
+    PATTERN "*.td"
+    PATTERN "*.inc"
+    PATTERN "LICENSE.TXT")
+
+  install(
+    DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase
+    DESTINATION include
+    COMPONENT nncase-headers
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.hpp"
+    PATTERN "*.td"
+    PATTERN "*.inc"
+    PATTERN "CMakeFiles" EXCLUDE
+    PATTERN "config.h" EXCLUDE)
+
+  install(
+    DIRECTORY python/nncase
+    DESTINATION python
+    COMPONENT nncase-python
+    FILES_MATCHING
+    PATTERN "*.py")
+
+  # Targets
+  add_subdirectory(targets/cpu)
+  add_subdirectory(targets/k210)
+  add_subdirectory(targets/vulkan)
 endif()
 
 # Modules
diff --git a/benchmark/gen_kmodel.py b/benchmark/gen_kmodel.py
index 3f5765b89e..6068ea5563 100644
--- a/benchmark/gen_kmodel.py
+++ b/benchmark/gen_kmodel.py
@@ -31,12 +31,12 @@
 
 MODELS = {
     "mnist": {
-        "url": "https://media.githubusercontent.com/media/onnx/models/master/vision/classification/mnist/model/mnist-8.onnx",
+        "url": "https://github.com/onnx/models/raw/main/vision/classification/mnist/model/mnist-8.onnx",
         "in_shapes": {"Input3": [1, 1, 28, 28]}
     },
     "mobilenet_v2": {
-        "url": "https://github.com/onnx/models/raw/master/vision/classification/mobilenet/model/mobilenetv2-7.onnx",
-        "in_shapes": {"input": [1, 3, 224, 224]}
+        "url": "https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx",
+        "in_shapes": {"data": [1, 3, 224, 224]}
     }
 }
 
@@ -47,7 +47,7 @@ def _download(url, name, in_shapes):
         req = requests.get(url)
         onnx_model, check = onnxsim.simplify(
             onnx.load_model(BytesIO(req.content)), check_n=3, input_shapes=in_shapes)
-        assert check, "Simplified ONNX model could not be validated"
+        # assert check, "Simplified ONNX model could not be validated"
         onnx.save(onnx_model, filename)
 
     with open(filename, "rb") as file:
@@ -65,10 +65,10 @@ def _make_module(name, target):
     compile_options.input_layout = "NCHW"
     compile_options.output_layout = "NCHW"
     compile_options.dump_dir = os.path.join(TEMP_DIR, name)
-    compile_options.dump_ir = True
-    compile_options.dump_asm = True
-    compile_options.dump_quant_error = True
-    compile_options.dump_import_op_range = True
+    compile_options.dump_ir = False
+    compile_options.dump_asm = False
+    compile_options.dump_quant_error = False
+    compile_options.dump_import_op_range = False
     compile_options.use_mse_quant_w = True
     compile_options.split_w_to_act = False
     compile_options.benchmark_only = True
diff --git a/benchmark/models/cpu/mnist.kmodel b/benchmark/models/cpu/mnist.kmodel
index bc3ec30a86..2e7a5e1db9 100644
Binary files a/benchmark/models/cpu/mnist.kmodel and b/benchmark/models/cpu/mnist.kmodel differ
diff --git a/benchmark/models/cpu/mobilenet_v2.kmodel b/benchmark/models/cpu/mobilenet_v2.kmodel
index d070a1c43a..8aec579391 100644
Binary files a/benchmark/models/cpu/mobilenet_v2.kmodel and b/benchmark/models/cpu/mobilenet_v2.kmodel differ
diff --git a/cmake/conan.cmake b/cmake/conan.cmake
index 208ce24855..66b381dbfd 100644
--- a/cmake/conan.cmake
+++ b/cmake/conan.cmake
@@ -33,7 +33,7 @@
 # but it is only necessary on the end-user side. It is not necessary to create conan
 # packages, in fact it shouldn't be use for that. Check the project documentation.
 
-# version: 0.18.0-dev
+# version: 0.19.0-dev
 
 include(CMakeParseArguments)
 
@@ -95,7 +95,7 @@ macro(_conan_check_system_name)
         endif()
         if(${CMAKE_SYSTEM_NAME} STREQUAL "QNX")
             set(CONAN_SYSTEM_NAME Neutrino)
-        endif()        
+        endif()
         set(CONAN_SUPPORTED_PLATFORMS Windows Linux Macos Android iOS FreeBSD WindowsStore WindowsCE watchOS tvOS FreeBSD SunOS AIX Arduino Emscripten Neutrino)
         list (FIND CONAN_SUPPORTED_PLATFORMS "${CONAN_SYSTEM_NAME}" _index)
         if (${_index} GREATER -1)
@@ -132,18 +132,28 @@ macro(_conan_detect_compiler)
         set(_CONAN_SETTING_COMPILER_CPPSTD ${CMAKE_CXX_STANDARD})
     endif()
 
-    if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU)
-        # using GCC
+    if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU OR ${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL QCC)
+        # using GCC or QCC
         # TODO: Handle other params
         string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION})
         list(GET VERSION_LIST 0 MAJOR)
         list(GET VERSION_LIST 1 MINOR)
-        set(COMPILER_VERSION ${MAJOR}.${MINOR})
-        if(${MAJOR} GREATER 4)
-            set(COMPILER_VERSION ${MAJOR})
-        endif()
-        set(_CONAN_SETTING_COMPILER gcc)
+
+        if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU)
+            set(_CONAN_SETTING_COMPILER gcc)
+            # mimic Conan client autodetection
+            if (${MAJOR} GREATER_EQUAL 5)
+                set(COMPILER_VERSION ${MAJOR})
+            else()
+                set(COMPILER_VERSION ${MAJOR}.${MINOR})
+            endif()    
+        elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL QCC)
+            set(_CONAN_SETTING_COMPILER qcc)
+            set(COMPILER_VERSION ${MAJOR}.${MINOR})
+        endif ()
+
         set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION})
+
         if (USING_CXX)
             conan_cmake_detect_unix_libcxx(_LIBCXX)
             set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX})
@@ -152,7 +162,7 @@ macro(_conan_detect_compiler)
         string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION})
         list(GET VERSION_LIST 0 MAJOR)
         list(GET VERSION_LIST 1 MINOR)
-        set(COMPILER_VERSION ${MAJOR}.${MINOR})
+        set(COMPILER_VERSION ${MAJOR})
         set(_CONAN_SETTING_COMPILER intel)
         set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION})
         if (USING_CXX)
@@ -164,18 +174,39 @@ macro(_conan_detect_compiler)
         string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION})
         list(GET VERSION_LIST 0 MAJOR)
         list(GET VERSION_LIST 1 MINOR)
+
+        # mimic Conan client autodetection
+        if (${MAJOR} GREATER_EQUAL 13)
+            set(COMPILER_VERSION ${MAJOR})
+        else()
+            set(COMPILER_VERSION ${MAJOR}.${MINOR})
+        endif() 
+
+        set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION})
+
         set(_CONAN_SETTING_COMPILER apple-clang)
-        set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR}.${MINOR})
         if (USING_CXX)
             conan_cmake_detect_unix_libcxx(_LIBCXX)
             set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX})
         endif ()
-    elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang)
+    elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang
+                AND NOT "${CMAKE_${LANGUAGE}_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC" 
+                AND NOT "${CMAKE_${LANGUAGE}_SIMULATE_ID}" STREQUAL "MSVC")
+
         string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION})
         list(GET VERSION_LIST 0 MAJOR)
         list(GET VERSION_LIST 1 MINOR)
         set(_CONAN_SETTING_COMPILER clang)
-        set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR}.${MINOR})
+
+        # mimic Conan client autodetection
+        if (${MAJOR} GREATER_EQUAL 8)
+            set(COMPILER_VERSION ${MAJOR})
+        else()
+            set(COMPILER_VERSION ${MAJOR}.${MINOR})
+        endif() 
+
+        set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION})
+
         if(APPLE)
             cmake_policy(GET CMP0025 APPLE_CLANG_POLICY)
             if(NOT APPLE_CLANG_POLICY STREQUAL NEW)
@@ -183,14 +214,15 @@ macro(_conan_detect_compiler)
                 set(_CONAN_SETTING_COMPILER apple-clang)
             endif()
         endif()
-        if(${_CONAN_SETTING_COMPILER} STREQUAL clang AND ${MAJOR} GREATER 7)
-            set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR})
-        endif()
         if (USING_CXX)
             conan_cmake_detect_unix_libcxx(_LIBCXX)
             set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX})
         endif ()
-    elseif(${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL MSVC)
+    elseif(${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL MSVC
+                OR (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang 
+                    AND "${CMAKE_${LANGUAGE}_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC" 
+                    AND "${CMAKE_${LANGUAGE}_SIMULATE_ID}" STREQUAL "MSVC"))
+
         set(_VISUAL "Visual Studio")
         _get_msvc_ide_version(_VISUAL_VERSION)
         if("${_VISUAL_VERSION}" STREQUAL "")
@@ -281,7 +313,7 @@ function(conan_cmake_settings result)
         string(REGEX MATCH "[^=]*" MANUAL_SETTING "${ARG}")
         message(STATUS "Conan: ${MANUAL_SETTING} was added as an argument. Not using the autodetected one.")
         list(REMOVE_ITEM ARGUMENTS_PROFILE_AUTO "${MANUAL_SETTING}")
-    endforeach()    
+    endforeach()
 
     # Automatic from CMake
     foreach(ARG ${ARGUMENTS_PROFILE_AUTO})
@@ -398,7 +430,7 @@ function(conan_cmake_detect_vs_runtime result)
 
     if(build_type)
         string(TOUPPER "${build_type}" build_type)
-    endif() 
+    endif()
     set(variables CMAKE_CXX_FLAGS_${build_type} CMAKE_C_FLAGS_${build_type} CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
     foreach(variable ${variables})
         if(NOT "${${variable}}" STREQUAL "")
@@ -443,17 +475,18 @@ function(conan_cmake_autodetect detected_settings)
 endfunction()
 
 macro(conan_parse_arguments)
-  set(options BASIC_SETUP CMAKE_TARGETS UPDATE KEEP_RPATHS NO_LOAD NO_OUTPUT_DIRS OUTPUT_QUIET NO_IMPORTS SKIP_STD)
-  set(oneValueArgs CONANFILE  ARCH BUILD_TYPE INSTALL_FOLDER CONAN_COMMAND)
-  set(multiValueArgs DEBUG_PROFILE RELEASE_PROFILE RELWITHDEBINFO_PROFILE MINSIZEREL_PROFILE
-                     PROFILE REQUIRES OPTIONS IMPORTS SETTINGS BUILD ENV GENERATORS PROFILE_AUTO
-                     INSTALL_ARGS CONFIGURATION_TYPES PROFILE_BUILD BUILD_REQUIRES)
-  cmake_parse_arguments(ARGUMENTS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(options         BASIC_SETUP CMAKE_TARGETS UPDATE KEEP_RPATHS NO_LOAD NO_OUTPUT_DIRS 
+                        OUTPUT_QUIET NO_IMPORTS SKIP_STD)
+    set(oneValueArgs    CONANFILE ARCH BUILD_TYPE INSTALL_FOLDER OUTPUT_FOLDER CONAN_COMMAND)
+    set(multiValueArgs  DEBUG_PROFILE RELEASE_PROFILE RELWITHDEBINFO_PROFILE MINSIZEREL_PROFILE
+                        PROFILE REQUIRES OPTIONS IMPORTS SETTINGS BUILD ENV GENERATORS PROFILE_AUTO
+                        INSTALL_ARGS CONFIGURATION_TYPES PROFILE_BUILD BUILD_REQUIRES)
+    cmake_parse_arguments(ARGUMENTS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 endmacro()
 
 function(old_conan_cmake_install)
     # Calls "conan install"
-    # Argument BUILD is equivalant to --build={missing, PkgName,...} or
+    # Argument BUILD is equivalent to --build={missing, PkgName,...} or
     # --build when argument is 'BUILD all' (which builds all packages from source)
     # Argument CONAN_COMMAND, to specify the conan path, e.g. in case of running from source
     # cmake does not identify conan as command, even if it is +x and it is in the path
@@ -502,6 +535,10 @@ function(old_conan_cmake_install)
     if(ARGUMENTS_INSTALL_FOLDER)
       set(CONAN_INSTALL_FOLDER -if=${ARGUMENTS_INSTALL_FOLDER})
     endif()
+    set(CONAN_OUTPUT_FOLDER "")
+    if(ARGUMENTS_OUTPUT_FOLDER)
+      set(CONAN_OUTPUT_FOLDER -of=${ARGUMENTS_OUTPUT_FOLDER})
+    endif()
     foreach(ARG ${ARGUMENTS_GENERATORS})
         set(CONAN_GENERATORS ${CONAN_GENERATORS} -g=${ARG})
     endforeach()
@@ -539,9 +576,9 @@ function(conan_cmake_install)
     endif()
 
     set(installOptions UPDATE NO_IMPORTS OUTPUT_QUIET ERROR_QUIET)
-    set(installOneValueArgs PATH_OR_REFERENCE REFERENCE REMOTE LOCKFILE LOCKFILE_OUT LOCKFILE_NODE_ID INSTALL_FOLDER)
+    set(installOneValueArgs PATH_OR_REFERENCE REFERENCE REMOTE LOCKFILE LOCKFILE_OUT LOCKFILE_NODE_ID INSTALL_FOLDER OUTPUT_FOLDER)
     set(installMultiValueArgs GENERATOR BUILD ENV ENV_HOST ENV_BUILD OPTIONS_HOST OPTIONS OPTIONS_BUILD PROFILE
-                              PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD)
+                              PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD CONF CONF_HOST CONF_BUILD)
     cmake_parse_arguments(ARGS "${installOptions}" "${installOneValueArgs}" "${installMultiValueArgs}" ${ARGN})
     foreach(arg ${installOptions})
         if(ARGS_${arg})
@@ -560,6 +597,8 @@ function(conan_cmake_install)
                 set(flag "--lockfile-node-id")
             elseif("${arg}" STREQUAL "INSTALL_FOLDER")
                 set(flag "--install-folder")
+            elseif("${arg}" STREQUAL "OUTPUT_FOLDER")
+                set(flag "--output-folder")
             endif()
             set(${arg} ${${arg}} ${flag} ${ARGS_${arg}})
         endif()
@@ -594,6 +633,12 @@ function(conan_cmake_install)
                 set(flag "--settings:host")
             elseif("${arg}" STREQUAL "SETTINGS_BUILD")
                 set(flag "--settings:build")
+            elseif("${arg}" STREQUAL "CONF")
+                set(flag "--conf")
+            elseif("${arg}" STREQUAL "CONF_HOST")
+                set(flag "--conf:host")
+            elseif("${arg}" STREQUAL "CONF_BUILD")
+                set(flag "--conf:build")
             endif()
             list(LENGTH ARGS_${arg} numargs)
             foreach(item ${ARGS_${arg}})
@@ -611,13 +656,16 @@ function(conan_cmake_install)
     if(DEFINED NO_IMPORTS)
         set(NO_IMPORTS --no-imports)
     endif()
-    set(install_args install ${PATH_OR_REFERENCE} ${REFERENCE} ${UPDATE} ${NO_IMPORTS} ${REMOTE} ${LOCKFILE} ${LOCKFILE_OUT} ${LOCKFILE_NODE_ID} ${INSTALL_FOLDER}
-                                ${GENERATOR} ${BUILD} ${ENV} ${ENV_HOST} ${ENV_BUILD} ${OPTIONS} ${OPTIONS_HOST} ${OPTIONS_BUILD} 
-                                ${PROFILE} ${PROFILE_HOST} ${PROFILE_BUILD} ${SETTINGS} ${SETTINGS_HOST} ${SETTINGS_BUILD})
+    set(install_args install  ${PATH_OR_REFERENCE} ${REFERENCE} ${UPDATE} ${NO_IMPORTS} ${REMOTE} 
+                              ${LOCKFILE} ${LOCKFILE_OUT} ${LOCKFILE_NODE_ID} ${INSTALL_FOLDER} 
+                              ${OUTPUT_FOLDER} ${GENERATOR} ${BUILD} ${ENV} ${ENV_HOST} ${ENV_BUILD} 
+                              ${OPTIONS} ${OPTIONS_HOST} ${OPTIONS_BUILD} ${PROFILE} ${PROFILE_HOST} 
+                              ${PROFILE_BUILD} ${SETTINGS} ${SETTINGS_HOST} ${SETTINGS_BUILD} 
+                              ${CONF} ${CONF_HOST} ${CONF_BUILD})
 
     string(REPLACE ";" " " _install_args "${install_args}")
     message(STATUS "Conan executing: ${CONAN_CMD} ${_install_args}")
-    
+
     if(ARGS_OUTPUT_QUIET)
       set(OUTPUT_OPT OUTPUT_QUIET)
     endif()
@@ -641,6 +689,109 @@ function(conan_cmake_install)
 
 endfunction()
 
+function(conan_cmake_lock_create)
+    # Runs "conan lock create" with a command line built from keyword arguments.
+    #
+    #   Options:      UPDATE (--update), BASE (--base), OUTPUT_QUIET, ERROR_QUIET
+    #                 (both forwarded to execute_process; ERROR_QUIET also
+    #                 downgrades a failure from FATAL_ERROR to WARNING).
+    #   One value:    PATH and REFERENCE (positional, no flag), REMOTE, LOCKFILE,
+    #                 LOCKFILE_OUT.
+    #   Multi value:  BUILD, plus ENV, OPTIONS, PROFILE and SETTINGS together
+    #                 with their _HOST and _BUILD variants. 'BUILD all' becomes
+    #                 a bare --build.
+    #
+    # The executable is CONAN_COMMAND when that variable is defined; otherwise
+    # conan_check(REQUIRED) is called to provide CONAN_CMD.
+    if(DEFINED CONAN_COMMAND)
+        set(CONAN_CMD ${CONAN_COMMAND})
+    else()
+        conan_check(REQUIRED)
+    endif()
+
+    set(lockCreateOptions UPDATE BASE OUTPUT_QUIET ERROR_QUIET)
+    set(lockCreateOneValueArgs PATH REFERENCE REMOTE LOCKFILE LOCKFILE_OUT)
+    # Listed in the order in which the flags are placed on the command line.
+    set(lockCreateMultiValueArgs BUILD ENV ENV_HOST ENV_BUILD OPTIONS OPTIONS_HOST OPTIONS_BUILD PROFILE
+                              PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD)
+    cmake_parse_arguments(ARGS "${lockCreateOptions}" "${lockCreateOneValueArgs}" "${lockCreateMultiValueArgs}" ${ARGN})
+
+    # Build the command line in a single local list that only ever reads the
+    # parsed ARGS_* values. Variables of the caller that happen to be named
+    # PATH, BUILD, INSTALL_FOLDER, GENERATOR, flag, ... therefore cannot leak
+    # into the conan invocation.
+    set(lock_create_Args lock create)
+
+    # Positional arguments are passed through without a flag.
+    foreach(arg PATH REFERENCE)
+        if(DEFINED ARGS_${arg})
+            list(APPEND lock_create_Args ${ARGS_${arg}})
+        endif()
+    endforeach()
+
+    if(ARGS_UPDATE)
+        list(APPEND lock_create_Args --update)
+    endif()
+    if(ARGS_BASE)
+        list(APPEND lock_create_Args --base)
+    endif()
+
+    # REMOTE -> --remote, LOCKFILE -> --lockfile, LOCKFILE_OUT -> --lockfile-out
+    foreach(arg REMOTE LOCKFILE LOCKFILE_OUT)
+        if(DEFINED ARGS_${arg})
+            string(TOLOWER "--${arg}" flag)
+            string(REPLACE "_" "-" flag "${flag}")
+            list(APPEND lock_create_Args ${flag} ${ARGS_${arg}})
+        endif()
+    endforeach()
+
+    # BUILD -> --build, ENV_HOST -> --env:host, SETTINGS_BUILD -> --settings:build
+    foreach(arg ${lockCreateMultiValueArgs})
+        if(DEFINED ARGS_${arg})
+            string(TOLOWER "--${arg}" flag)
+            string(REGEX REPLACE "_(host|build)$" ":\\1" flag "${flag}")
+            set(expanded "")
+            foreach(item ${ARGS_${arg}})
+                # 'BUILD all' means build everything from source: a bare
+                # --build replaces any per-package flags collected so far.
+                if("${arg}" STREQUAL "BUILD" AND "${item}" STREQUAL "all")
+                    set(expanded --build)
+                    break()
+                endif()
+                list(APPEND expanded ${flag} ${item})
+            endforeach()
+            list(APPEND lock_create_Args ${expanded})
+        endif()
+    endforeach()
+
+    string(REPLACE ";" " " _lock_create_Args "${lock_create_Args}")
+    message(STATUS "Conan executing: ${CONAN_CMD} ${_lock_create_Args}")
+
+    # Reset first so that same-named variables of the caller are not forwarded.
+    set(OUTPUT_OPT "")
+    set(ERROR_OPT "")
+    if(ARGS_OUTPUT_QUIET)
+      set(OUTPUT_OPT OUTPUT_QUIET)
+    endif()
+    if(ARGS_ERROR_QUIET)
+      set(ERROR_OPT ERROR_QUIET)
+    endif()
+
+    execute_process(COMMAND ${CONAN_CMD} ${lock_create_Args}
+                    RESULT_VARIABLE return_code
+                    ${OUTPUT_OPT}
+                    ${ERROR_OPT}
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+    if(NOT "${return_code}" STREQUAL "0")
+        if (ARGS_ERROR_QUIET)
+            message(WARNING "Conan lock create failed='${return_code}'")
+        else()
+            message(FATAL_ERROR "Conan lock create failed='${return_code}'")
+        endif()
+    endif()
+endfunction()
+
 function(conan_cmake_setup_conanfile)
   conan_parse_arguments(${ARGV})
   if(ARGUMENTS_CONANFILE)
@@ -734,7 +885,7 @@ endmacro()
 
 macro(conan_cmake_run)
     conan_parse_arguments(${ARGV})
-    
+
     if(ARGUMENTS_CONFIGURATION_TYPES AND NOT CMAKE_CONFIGURATION_TYPES)
         message(WARNING "CONFIGURATION_TYPES should only be specified for multi-configuration generators")
     elseif(ARGUMENTS_CONFIGURATION_TYPES AND ARGUMENTS_BUILD_TYPE)
@@ -785,6 +936,30 @@ macro(conan_cmake_run)
     endif()
 endmacro()
 
+function(conan_version result)
+    # Puts the installed Conan version ("X.Y.Z") into <result> in the caller's scope.
+    set(${result} "" PARENT_SCOPE)
+
+    # Look conan up only when CONAN_CMD has not been provided already.
+    if(NOT CONAN_CMD)
+        find_program(CONAN_CMD conan)
+        if(CONAN_REQUIRED AND NOT CONAN_CMD)
+            message(FATAL_ERROR "Conan executable not found! Please install conan.")
+        endif()
+    endif()
+
+    # stdout and stderr are captured into the same variable.
+    execute_process(COMMAND ${CONAN_CMD} --version
+                    OUTPUT_VARIABLE _conan_version_text
+                    ERROR_VARIABLE _conan_version_text
+                    RESULT_VARIABLE _conan_version_status)
+    if(NOT "${_conan_version_status}" STREQUAL "0")
+      message(FATAL_ERROR "Conan --version failed='${_conan_version_status}'")
+    endif()
+    string(REGEX MATCH ".*Conan version ([0-9]+\\.[0-9]+\\.[0-9]+)" _ignored "${_conan_version_text}")
+    set(${result} ${CMAKE_MATCH_1} PARENT_SCOPE)
+endfunction()
+
 macro(conan_check)
     # Checks conan availability in PATH
     # Arguments REQUIRED, DETECT_QUIET and VERSION are optional
@@ -804,25 +979,16 @@ macro(conan_check)
     if(NOT CONAN_DETECT_QUIET)
         message(STATUS "Conan: Found program ${CONAN_CMD}")
     endif()
-    execute_process(COMMAND ${CONAN_CMD} --version
-                    RESULT_VARIABLE return_code
-                    OUTPUT_VARIABLE CONAN_VERSION_OUTPUT
-                    ERROR_VARIABLE CONAN_VERSION_OUTPUT)
 
-    if(NOT "${return_code}" STREQUAL "0")
-      message(FATAL_ERROR "Conan --version failed='${return_code}'")
-    endif()
-              
+    conan_version(CONAN_DETECTED_VERSION)
+
     if(NOT CONAN_DETECT_QUIET)
-        string(STRIP "${CONAN_VERSION_OUTPUT}" _CONAN_VERSION_OUTPUT)
-        message(STATUS "Conan: Version found ${_CONAN_VERSION_OUTPUT}")
+        message(STATUS "Conan: Version found ${CONAN_DETECTED_VERSION}")
     endif()
 
     if(DEFINED CONAN_VERSION)
-        string(REGEX MATCH ".*Conan version ([0-9]+\\.[0-9]+\\.[0-9]+)" FOO
-            "${CONAN_VERSION_OUTPUT}")
-        if(${CMAKE_MATCH_1} VERSION_LESS ${CONAN_VERSION})
-            message(FATAL_ERROR "Conan outdated. Installed: ${CMAKE_MATCH_1}, \
+        if(${CONAN_DETECTED_VERSION} VERSION_LESS ${CONAN_VERSION})
+            message(FATAL_ERROR "Conan outdated. Installed: ${CONAN_DETECTED_VERSION}, \
                 required: ${CONAN_VERSION}. Consider updating via 'pip \
                 install conan==${CONAN_VERSION}'.")
         endif()
@@ -869,9 +1035,10 @@ macro(conan_config_install)
     set(multiValueArgs ARGS)
     cmake_parse_arguments(CONAN "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-    find_program(CONAN_CMD conan)
-    if(NOT CONAN_CMD AND CONAN_REQUIRED)
-        message(FATAL_ERROR "Conan executable not found!")
+    if(DEFINED CONAN_COMMAND)
+        set(CONAN_CMD ${CONAN_COMMAND})
+    else()
+        conan_check(REQUIRED)
     endif()
 
     if(DEFINED CONAN_VERIFY_SSL)
@@ -883,7 +1050,9 @@ macro(conan_config_install)
     endif()
 
     if(DEFINED CONAN_ARGS)
-	set(CONAN_ARGS_ARGS "--args=\"${CONAN_ARGS}\"")
+	# Convert ; separated multi arg list into space separated string
+	string(REPLACE ";" " " l_CONAN_ARGS "${CONAN_ARGS}")
+	set(CONAN_ARGS_ARGS "--args=${l_CONAN_ARGS}")
     endif()
 
     if(DEFINED CONAN_SOURCE)
@@ -907,3 +1076,67 @@ macro(conan_config_install)
     message(FATAL_ERROR "Conan config failed='${return_code}'")
   endif()
 endmacro()
+
+
+function(conan_cmake_profile)
+    # Writes a Conan profile file from keyword arguments.
+    #
+    # Arguments (all optional):
+    #
+    #   FILEPATH <path>         File to create. Defaults to
+    #                           ${CMAKE_CURRENT_BINARY_DIR}/profile.
+    #   SETTINGS <item>...      Lines of the [settings] section.
+    #   OPTIONS <item>...       Lines of the [options] section.
+    #   CONF <item>...          Lines of the [conf] section.
+    #   ENV <item>...           Lines of the [env] section.
+    #   BUILDENV <item>...      Lines of the [buildenv] section.
+    #   RUNENV <item>...        Lines of the [runenv] section.
+    #   TOOL_REQUIRES <item>... Lines of the [tool_requires] section.
+    #
+    # The target file is emptied first. Sections are written in the order
+    # listed above, and a section is written only when its keyword was parsed.
+    # Each item goes verbatim on a line of its own; nothing checks that the
+    # items are well-formed profile entries.
+    #
+    # Example:
+    #
+    #   conan_cmake_profile(FILEPATH "${CMAKE_BINARY_DIR}/profile"
+    #                       SETTINGS os=Linux arch=x86_64
+    #                       OPTIONS  zlib:shared=True)
+    #
+    # writes
+    #
+    #   [settings]
+    #   os=Linux
+    #   arch=x86_64
+    #   [options]
+    #   zlib:shared=True
+
+    set(profileOneValueArgs   FILEPATH)
+    # The keyword order doubles as the order of the sections in the file.
+    set(profileMultiValueArgs SETTINGS OPTIONS CONF ENV BUILDENV RUNENV TOOL_REQUIRES)
+    cmake_parse_arguments(ARGS "" "${profileOneValueArgs}" "${profileMultiValueArgs}" ${ARGN})
+
+    # Default location unless the caller passed FILEPATH.
+    set(_FN "${CMAKE_CURRENT_BINARY_DIR}/profile")
+    if(DEFINED ARGS_FILEPATH)
+        set(_FN "${ARGS_FILEPATH}")
+    endif()
+    message(STATUS "Conan: Creating profile ${_FN}")
+
+    # Start from an empty file, then append one section at a time.
+    file(WRITE ${_FN} "")
+    foreach(_section ${profileMultiValueArgs})
+        if(NOT DEFINED ARGS_${_section})
+            continue()
+        endif()
+
+        # The header is the keyword in lower case, e.g. TOOL_REQUIRES
+        # becomes [tool_requires].
+        string(TOLOWER "${_section}" _header)
+        file(APPEND ${_FN} "[${_header}]\n")
+        foreach(_entry ${ARGS_${_section}})
+            file(APPEND ${_FN} ${_entry} "\n")
+        endforeach()
+    endforeach()
+endfunction()
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index fe6152980b..226089c2f5 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -9,7 +9,7 @@ if ((NOT BUILDING_RUNTIME) OR ENABLE_VULKAN_RUNTIME)
 endif ()
 
 if (NOT BUILDING_RUNTIME)
-    find_package(Flatbuffers REQUIRED)
+    find_package(flatbuffers REQUIRED)
     find_package(libzip REQUIRED)
     if(NOT CONAN_EXPORTED)
         set(FLATBUFFERS_FLATC_EXECUTABLE ${flatbuffers_LIB_DIRS}/../bin/flatc)
diff --git a/conanfile.py b/conanfile.py
index b11adc3851..1a84490650 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -60,7 +60,8 @@ def requirements(self):
             self.requires('protobuf/3.17.1')
             self.requires('xtensor/0.21.5')
             self.requires('spdlog/1.8.2')
-            self.requires('libzippp/4.0')
+            self.requires('zlib/1.2.12')
+            self.requires('libzippp/5.0-1.8.0')
             self.requires('inja/3.2.0')
             self.requires('shaderc/2021.1')
             if self.options.tests:
@@ -82,20 +83,21 @@ def configure(self):
 
         if not self.options.runtime:
             self.options["opencv"].contrib = False
+            self.options["opencv"].with_ade = False
             self.options["opencv"].with_webp = False
             self.options["opencv"].with_openexr = False
             self.options["opencv"].with_eigen = False
             self.options["opencv"].with_quirc = False
+            self.options["opencv"].with_ffmpeg = False
+            self.options["opencv"].with_tiff = False
+            self.options["opencv"].with_jpeg = 'libjpeg-turbo'
             self.options["opencv"].dnn = False
-            self.options["flatbuffers"].options_from_context = False
             self.options["xtensor"].xsimd = False
             self.options["libzip"].with_bzip2 = False
             self.options["libzip"].with_zstd = False
             self.options["libzip"].crypto = False
             if self.settings.os == 'Linux':
                 self.options["opencv"].with_gtk = False
-                self.options["spirv-tools"].link_libcpp = False
-                self.options["shaderc"].link_libcpp = False
 
         if (not self.options.runtime) or self.options.vulkan_runtime:
             if self.settings.os == 'Linux':
diff --git a/docs/USAGE_EN.md b/docs/USAGE_EN.md
index b161c26fa7..b8af24687a 100644
--- a/docs/USAGE_EN.md
+++ b/docs/USAGE_EN.md
@@ -2,32 +2,67 @@
 
 # Overview
 
-nncase provides both python wheel package and ncc client to compile your neural models.
+nncase provides Python wheel packages to compile your neural network models. This documentation applies to nncase v1 only. The available versions are listed below.
 
-- nncase wheel package can be  downloaded at [nncase release](https://github.com/kendryte/nncase/releases),  target wheel package except for both cpu and K210 can be got from nncase sdk for your target.
-- For ncc client, you should git clone nncase repository and then build it by yourself.
+```
+1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322
+```
+
+- nncase wheel package can be downloaded at [nncase release](https://github.com/kendryte/nncase/releases).
 
 # nncase python APIs
 
-nncase provides Python APIs to compile neural network model and inference on your PC.
+nncase provides Python APIs to compile neural network models and run inference on x86_64 and amd64 platforms.
 
 ## Installation
 
+The nncase toolchain compiler consists of nncase and plug-in wheel packages.
+
+- Both nncase and plug-in wheel packages are released at [nncase github](https://github.com/kendryte/nncase/releases)
+- The nncase wheel package supports Python 3.6/3.7/3.8/3.9/3.10. You can download it according to your operating system and Python version.
+- The plug-in wheel package does not depend on Python version, you can install it directly.
+
 You can make use of [nncase docker image](https://github.com/kendryte/nncase/blob/master/docs/build.md)(Ubuntu 20.04 + Python 3.8) if you do not have Ubuntu development.
 
 ```shell
+$ cd /path/to/nncase_sdk
 $ docker pull registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest
 $ docker run -it --rm -v `pwd`:/mnt -w /mnt registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest /bin/bash -c "/bin/bash"
 ```
 
-Take Ubuntu 20.04 + Python 3.8 for example
+### cpu/K210
+
+- Download nncase wheel package and then install it.
+
+```
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl
+
+root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl
+```
+
+### K510
+
+- Download both nncase and nncase_k510 wheel packages and then install them.
 
 ```shell
-root@f74598de4a02:/mnt# pip3 install nncase_github/nncase-1.0.0.20211029-cp38-cp38-manylinux_2_24_x86_64.whl
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl
 
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase_k510-1.8.0.20220930-py2.py3-none-manylinux_2_24_x86_64.whl
+
+root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl
 ```
 
-> You should get and install target wheel package from your nncase sdk if you do not take cpu/K210 as your target
+### Check nncase version
+
+```python
+root@469e6a4a9e71:/mnt# python3
+Python 3.8.10 (default, Jun  2 2021, 10:49:15)
+[GCC 9.4.0] on linux
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import _nncase
+>>> print(_nncase.__version__)
+1.8.0-55be52f
+```
 
 ## nncase compile model APIs
 
@@ -76,13 +111,13 @@ The details of all attributes are following.
 | quant_type       | string    | N            | Specify the quantization type for input data , such as 'uint8', 'int8', 'int16'                                                                                                                         |
 | w_quant_type     | string    | N            | Specify the quantization type for weight , such as 'uint8'(by default), 'int8', 'int16'                                                                                                                 |
 | use_mse_quant_w  | bool      | N            | Specify whether use  mean-square error when quantizing weight                                                                                                                                           |
-| split_w_to_act   | bool      | N            | Specify whether split weight into activation                                                                                                                                            |
+| split_w_to_act   | bool      | N            | Specify whether split weight into activation                                                                                                                                                            |
 | preprocess       | bool      | N            | Whether enable preprocess, False by default                                                                                                                                                             |
 | swapRB           | bool      | N            | Whether swap red and blue channel for RGB data(from RGB to BGR or from BGR to RGB), False by default                                                                                                    |
 | mean             | list      | N            | Normalize mean value for preprocess, [0, 0, 0] by default                                                                                                                                               |
 | std              | list      | N            | Normalize std value for preprocess, [1, 1, 1] by default                                                                                                                                                |
 | input_range      | list      | N            | The float range for dequantized input data, [0，1] by default                                                                                                                                           |
-| output_range | list | N | The float range for quantized output data,  [ ] by default |
+| output_range     | list      | N            | The float range for quantized output data,  [ ] by default                                                                                                                                              |
 | input_shape      | list      | N            | Specify the shape of input data.  input_shape should be consistent with input _layout.  There will be letterbox  operations(Such as resize/pad) if input_shape is not the same as input shape of model. |
 | letterbox_value  | float     | N            | Specify the pad value of letterbox during preprocess.                                                                                                                                                   |
 | input_type       | string    | N            | Specify the data type of input data, 'float32' by default.                                                                                                                                              |
@@ -727,10 +762,50 @@ if __name__ == '__main__':
 
 ## Deploy nncase runtime
 
-### K210
+### Inference on K210 development board
+
+1. Download [SDK](https://github.com/kendryte/kendryte-standalone-sdk)
+
+   ```shell
+   $ git clone https://github.com/kendryte/kendryte-standalone-sdk.git
+   $ cd kendryte-standalone-sdk
+   $ export KENDRYTE_WORKSPACE=`pwd`
+   ```
+2. Download the cross-compile toolchain and extract it
+
+   ```shell
+   $ wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $KENDRYTE_WORKSPACE/kendryte-toolchain.tar.xz
+   $ cd $KENDRYTE_WORKSPACE
+   $ mkdir toolchain
+   $ tar -xf kendryte-toolchain.tar.xz -C ./toolchain
+   ```
+3. Update nncase runtime
+
+   Download `k210-runtime.zip` from [Release](https://github.com/kendryte/nncase/releases) and extract it into [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1`.
+4. Compile App
+
+   ```shell
+   # 1. Copy your program into `$KENDRYTE_WORKSPACE/src`
+   # e.g. copy ($NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example) into PATH_TO_SDK/src.
+   $ cp -r $NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example $KENDRYTE_WORKSPACE/src/
+
+   # 2. compile
+   $ cd $KENDRYTE_WORKSPACE
+   $ mkdir build && cd build
+   $ cmake .. -DPROJ=facedetect_landmark_example -DTOOLCHAIN=$KENDRYTE_WORKSPACE/toolchain/kendryte-toolchain/bin && make
+   ```
+
+   `facedetect_landmark_example` and `facedetect_landmark_example.bin` will be generated.
+5. Write the program to the K210 development board
 
-1. Download `k210-runtime.zip` from [Release](https://github.com/kendryte/nncase/releases) page.
-2. Unzip to your [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1` directory.
+   ```shell
+   # 1. Check available USB ports
+   $ ls /dev/ttyUSB*
+   # /dev/ttyUSB0 /dev/ttyUSB1
+
+   # 2. Write your App by kflash
+   $ kflash -p /dev/ttyUSB0 -t facedetect_landmark_example.bin
+   ```
 
 ## nncase inference APIs
 
@@ -1161,143 +1236,3 @@ N/A
 ```python
 sim.run()
 ```
-
-# ncc
-
-## Comannd line
-
-```shell
-DESCRIPTION
-NNCASE model compiler and inference tool.
-
-SYNOPSIS
-    ncc compile -i <input format> -t <target>
-        <input file> [--input-prototxt <input prototxt>] <output file> [--output-arrays <output arrays>]
-        [--quant-type <quant type>] [--w-quant-type <w quant type>] [--use-mse-quant-w]
-        [--dataset <dataset path>] [--dataset-format <dataset format>] [--calibrate-method <calibrate method>]
-        [--preprocess] [--swapRB] [--mean <normalize mean>] [--std <normalize std>]
-        [--input-range <input range>] [--input-shape <input shape>] [--letterbox-value <letter box value>]
-        [--input-type <input type>] [--output-type <output type>]
-        [--input-layout <input layout>] [--output-layout <output layout>] [--tcu-num <tcu number>]
-        [--is-fpga] [--dump-ir] [--dump-asm] [--dump-quant-error] [--dump-import-op-range] [--dump-dir <dump directory>]
-        [--dump-range-dataset <dataset path>] [--dump-range-dataset-format <dataset format>] [--benchmark-only]
-
-    ncc infer <input file> <output path>
-        --dataset <dataset path> [--dataset-format <dataset format>]
-        [--input-layout <input layout>]
-
-    ncc [-v]
-
-OPTIONS
-  compile
-
-  -i, --input-format <input format>
-                          input format, e.g. tflite|onnx|caffe
-  -t, --target <target>   target architecture, e.g. cpu|k210|k510
-  <input file>            input file
-  --input-prototxt <input prototxt>
-                          input prototxt
-  <output file>           output file
-  --output-arrays <output arrays>
-                          output arrays
-  --quant-type <quant type>
-                          post trainning quantize type, e.g uint8|int8|int16, default is uint8
-  --w-quant-type <w quant type>
-                          post trainning weights quantize type, e.g uint8|int8|int16, default is uint8
-  --use-mse-quant-w       use min mse algorithm to refine weights quantilization or not, default is 0
-  --dataset <dataset path>
-                          calibration dataset, used in post quantization
-  --dataset-format <dataset format>
-                          datset format: e.g. image|raw, default is image
-  --dump-range-dataset <dataset path>
-                          dump import op range dataset
-  --dump-range-dataset-format <dataset format>
-                          datset format: e.g. image|raw, default is image
-  --calibrate-method <calibrate method>
-                          calibrate method: e.g. no_clip|l2|kld_m0|kld_m1|kld_m2|cdf, default is no_clip
-  --preprocess            enable preprocess, default is 0
-  --swapRB                swap red and blue channel, default is 0
-  --mean <normalize mean> normalize mean, default is 0. 0. 0.
-  --std <normalize std>   normalize std, default is 1. 1. 1.
-  --input-range <input range>
-                          float range after preprocess
-  --input-shape <input shape>
-                          shape for input data
-  --letterbox-value <letter box value>
-                          letter box pad value, default is 0.000000
-  --input-type <input type>
-                          input type, e.g float32|uint8|default, default is default
-  --output-type <output type>
-                          output type, e.g float32|uint8, default is float32
-  --input-layout <input layout>
-                          input layout, e.g NCHW|NHWC, default is NCHW
-  --output-layout <output layout>
-                          output layout, e.g NCHW|NHWC, default is NCHW
-  --tcu-num <tcu number>  tcu number, e.g 1|2|3|4, default is 0
-  --is-fpga               use fpga parameters, default is 0
-  --dump-ir               dump ir to .dot, default is 0
-  --dump-asm              dump assembly, default is 0
-  --dump-quant-error      dump quant error, default is 0
-  --dump-import-op-range  dump import op range, default is 0
-  --dump-dir <dump directory>
-                          dump to directory
-  --benchmark-only        compile kmodel only for benchmark use, default is 0
-
-  infer
-
-  <model filename>        kmodel filename
-  <output path>           output path
-  --dataset <dataset path>
-                          dataset path
-  --dataset-format <dataset format>
-                          dataset format, e.g. image|raw, default is image
-  --input-layout <input layout>
-                          input layout, e.g NCHW|NHWC, default is NCHW
-```
-
-## Description
-
-`ncc` is the nncase command line tool. It has two commands: `compile` and `infer`.
-
-`compile` command compile your trained models (`.tflite`, `.caffemodel`, `.onnx`) to `.kmodel`.
-
-- `-i, --input-format` option is used to specify the input model format. nncase supports `tflite`, `caffe` and `onnx` input model currently.
-- `-t, --target` option is used to set your desired target device to run the model. `cpu` is the most general target that almost every platform should support. `k210` is the Kendryte K210 SoC platform. If you set this option to `k210`, this model can only run on K210 or be emulated on your PC.
-- `<input file>` is your input model path.
-- `--input-prototxt` is the prototxt file for caffe model.
-- `<output file>` is the output model path.
-- `--output-arrays` is the names of nodes to output.
-- `--quant-type` is used to specify quantize type, such as `uint8` by default and `int8` and `int16`.
-- `--w-quant-type` is used to specify quantize type for weight, such as `uint8` by default and `int8 `and `int16`.
-- `--use-mse-quant-w ` is used to specify whether use minimize mse(mean-square error, mse) algorithm to quantize weight or not.
-- `--dataset` is to provide your quantization calibration dataset to quantize your models. You should put hundreds or thousands of data in training set to this directory.
-- `--dataset-format` is to set the format of the calibration dataset. Default is `image`, nncase will use `opencv` to read your images and autoscale to the desired input size of your model. If the input has 3 channels, ncc will convert images to RGB float tensors [0,1] in `NCHW` layout. If the input has only 1 channel, ncc will grayscale your images. Set to `raw` if your dataset is not image dataset for example, audio or matrices. In this scenario you should convert your dataset to raw binaries which contains float tensors.
-- `--dump-range-dataset` is to provide your dump range dataset to dump each op data range of your models. You should put hundreds or thousands of data in training set to this directory.
-- `--dump-range-dataset-format` is to set the format of the dump range dataset. Default is `image`, nncase will use `opencv` to read your images and autoscale to the desired input size of your model. If the input has 3 channels, ncc will convert images to RGB float tensors [0,1] in `NCHW` layout. If the input has only 1 channel, ncc will grayscale your images. Set to `raw` if your dataset is not image dataset for example, audio or matrices. In this scenario you should convert your dataset to raw binaries which contains float tensors.
-- `--calibrate-method` is to set your desired calibration method, which is used to select the optimal activation ranges. The default is `no_clip` in that ncc will use the full range of activations. If you want a better quantization result, you can use `l2` but it will take a longer time to find the optimal ranges.
-- `--preprocess ` is used specify whether enable preprocessing or not.
-- `--swapRB ` is used specify whether swap red and blue channel or not. You can use this flag to implement RGB2BGR or BGR2RGB feature.
-- `--mean` is the mean values to be subtracted during preprocessing.
-- `--std` is the std values to be divided during preprocessing.
-- `--input-range` is the input range in float after dequantization.
-- `--input-shape` is used to specify the shape of input data. If the input shape is different from the input shape of your model, the preprocess will add resize/pad ops automatically for the transformation.
-- `--letterbox-value` is used to specify the pad values when pad is added during preprocessing.
-- `--input-type` is to set your desired input data type when do inference. If `--input-type` is `uint8`, for example you should provide RGB888 uint8 tensors when you do inference. If `--input-type` is `float`, you should provide RGB float tensors instead.
-- `--output-type` is the type of output data.
-- `--input-layout` is the layout of input data.
-- `--output-layout` is the layout of output data.
-- `--tcu-num` is used to configure the number of TCU. 0 means do not configure the number of TCU.
-- `--is-fpga` is a debug option. It is used to specify whether the kmodel run on fpga or not.
-- `--dump-ir` is a debug option. It is used to specify whether dump IR or not.
-- `--dump-asm` is a debug option. It is used to specify whether dump asm file or not.
-- `--dump-quant-error` is a debug option. It is used to specify whether dump quantization error information or not.
-- `--dump-import-op-range` is a debug option. It is used to specify whether dump imported op data range or not, need to also specify dump-range-dataset if enabled.
-- `--dump-dir` is used to specify dump directory.
-- `--benchmark-only` is used to specify whether the kmodel is used for benchmark or not.
-
-`infer` command can run your kmodel, and it's often used as debug purpose. ncc will save the model's output tensors to `.bin` files in `NCHW` layout.
-
-- `<input file>` is your kmodel path.
-- `<output path>` is the output directory ncc will produce to.
-- `--dataset` is the test set directory.
-- `--dataset-format` and `--input-layout` have the same meaning as in `compile` command.
diff --git a/docs/USAGE_ZH.md b/docs/USAGE_ZH.md
index 81adc69cea..0293f3eb5c 100644
--- a/docs/USAGE_ZH.md
+++ b/docs/USAGE_ZH.md
@@ -1,31 +1,66 @@
 # 概述
 
-nncase目前提供了python wheel包和ncc客户端两种方法编译模型.
+nncase目前提供了python wheel包编译模型。当前文档仅适用于nncase-v1，适用于以下版本号：
 
-- nncase wheel包需要去[nncase release](https://github.com/kendryte/nncase/releases)获取,  target wheel包除cpu/K210不需要安装外, 其它target需要从nncase sdk离线获取
-- ncc客户端需要用户下载并编译nncase
+```
+1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322
+```
+
+- nncase wheel包需要去[nncase release](https://github.com/kendryte/nncase/releases)获取
 
 # nncase python APIs
 
-nncase提供了Python APIs, 用于在PC上编译/推理深度学习模型.
+nncase提供了Python APIs, 用于在x86_64(amd64)平台上编译/推理深度学习模型.
 
 ## 安装
 
-用户若没有Ubuntu环境, 可使用[nncase docker image](https://github.com/kendryte/nncase/blob/master/docs/build.md)(Ubuntu 20.04 + Python 3.8)
+nncase工具链compiler部分包括nncase和插件包
+
+- nncase 和插件包均在[nncase github](https://github.com/kendryte/nncase/releases)发布
+- nncase wheel包支持Python 3.6/3.7/3.8/3.9/3.10, 用户可根据操作系统和Python选择相应版本下载.
+- 插件包不依赖Python版本, 可直接安装
+
+用户若没有Ubuntu环境, 可使用[nncase docker](https://github.com/kendryte/nncase/blob/master/docs/build.md#docker)(Ubuntu 20.04 + Python 3.8)
 
 ```shell
+$ cd /path/to/nncase_sdk
 $ docker pull registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest
 $ docker run -it --rm -v `pwd`:/mnt -w /mnt registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest /bin/bash -c "/bin/bash"
 ```
 
-下面以Ubuntu 20.04 + Python 3.8平台安装nncase为例
+### cpu/K210
+
+- 下载nncase wheel包, 直接安装即可.
+
+```
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl
+
+root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl
+```
+
+### K510
+
+- 分别下载nncase和nncase_k510插件包，再一起安装
 
 ```shell
-root@f74598de4a02:/mnt# pip3 install nncase_github/nncase-1.0.0.20211029-cp38-cp38-manylinux_2_24_x86_64.whl
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl
 
+root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase_k510-1.8.0.20220930-py2.py3-none-manylinux_2_24_x86_64.whl
+
+root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl
 ```
 
-> 若不使用cpu/K210作为target, 需要从相应target的nncase sdk中获取wheel包并进行安装
+### 查看版本信息
+
+```python
+root@469e6a4a9e71:/mnt# python3
+Python 3.8.10 (default, Jun  2 2021, 10:49:15)
+[GCC 9.4.0] on linux
+Type "help", "copyright", "credits" or "license" for more information.
+>>> import _nncase
+>>> print(_nncase.__version__)
+1.8.0-55be52f
+```
 
 ## nncase 编译模型APIs
 
@@ -68,32 +103,32 @@ py::class_<compile_options>(m, "CompileOptions")
 
 各属性说明如下
 
-| 属性名称         | 类型   | 是否必须 | 描述                                                         |
-| ---------------- | ------ | -------- | ------------------------------------------------------------ |
-| target           | string | 是       | 指定编译目标, 如'k210', 'k510'                               |
-| quant_type       | string | 否       | 指定数据量化类型, 如'uint8', 'int8', 'int16'                 |
-| w_quant_type     | string | 否       | 指定权重量化类型, 如'uint8', 'int8', 'int16', 默认为'uint8'  |
-| use_mse_quant_w  | bool   | 否       | 指定权重量化时是否使用最小化均方误差(mean-square error, MSE)算法优化量化参数 |
-| split_w_to_act   | bool   | 否       | 指定是否将权重数据平衡到激活数据中                           |
-| preprocess       | bool   | 否       | 是否开启前处理，默认为False                                  |
-| swapRB           | bool   | 否       | 是否交换RGB输入数据的红和蓝两个通道(RGB-->BGR或者BGR-->RGB)，默认为False |
-| mean             | list   | 否       | 前处理标准化参数均值，默认为[0, 0, 0]                        |
-| std              | list   | 否       | 前处理标准化参数方差，默认为[1, 1, 1]                        |
-| input_range      | list   | 否       | 输入数据反量化后对应浮点数的范围，默认为[0，1]               |
-| output_range     | list   | 否       | 输出定点数据前对应浮点数的范围，默认为空，使用模型实际浮点输出范围 |
+| 属性名称         | 类型   | 是否必须 | 描述                                                                                                                                                  |
+| ---------------- | ------ | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
+| target           | string | 是       | 指定编译目标, 如'k210', 'k510'                                                                                                                        |
+| quant_type       | string | 否       | 指定数据量化类型, 如'uint8', 'int8', 'int16'                                                                                                          |
+| w_quant_type     | string | 否       | 指定权重量化类型, 如'uint8', 'int8', 'int16', 默认为'uint8'                                                                                           |
+| use_mse_quant_w  | bool   | 否       | 指定权重量化时是否使用最小化均方误差(mean-square error, MSE)算法优化量化参数                                                                          |
+| split_w_to_act   | bool   | 否       | 指定是否将权重数据平衡到激活数据中                                                                                                                    |
+| preprocess       | bool   | 否       | 是否开启前处理，默认为False                                                                                                                           |
+| swapRB           | bool   | 否       | 是否交换RGB输入数据的红和蓝两个通道(RGB-->BGR或者BGR-->RGB)，默认为False                                                                              |
+| mean             | list   | 否       | 前处理标准化参数均值，默认为[0, 0, 0]                                                                                                                 |
+| std              | list   | 否       | 前处理标准化参数方差，默认为[1, 1, 1]                                                                                                                 |
+| input_range      | list   | 否       | 输入数据反量化后对应浮点数的范围，默认为[0，1]                                                                                                        |
+| output_range     | list   | 否       | 输出定点数据前对应浮点数的范围，默认为空，使用模型实际浮点输出范围                                                                                    |
 | input_shape      | list   | 否       | 指定输入数据的shape，input_shape的layout需要与input layout保持一致，输入数据的input_shape与模型的input shape不一致时会进行letterbox操作(resize/pad等) |
-| letterbox_value  | float  | 否       | 指定前处理letterbox的填充值                                  |
-| input_type       | string | 否       | 指定输入数据的类型, 默认为'float32'                          |
-| output_type      | string | 否       | 指定输出数据的类型, 如'float32', 'uint8'(仅用于指定量化情况下), 默认为'float32' |
-| input_layout     | string | 否       | 指定输入数据的layout, 如'NCHW', 'NHWC'. 若输入数据layout与模型本身layout不同, nncase会插入transpose进行转换 |
-| output_layout    | string | 否       | 指定输出数据的layout, 如'NCHW', 'NHWC'. 若输出数据layout与模型本身layout不同, nncase会插入transpose进行转换 |
-| model_layout     | string | 否       | 指定模型的layout，默认为空，当tflite模型layout为‘NCHW’，Onnx和Caffe模型layout为‘NHWC’时需指定 |
-| is_fpga          | bool   | 否       | 指定kmodel是否用于fpga, 默认为False                          |
-| dump_ir          | bool   | 否       | 指定是否dump IR, 默认为False                                 |
-| dump_asm         | bool   | 否       | 指定是否dump asm汇编文件, 默认为False                        |
-| dump_quant_error | bool   | 否       | 指定是否dump量化前后的模型误差                               |
-| dump_dir         | string | 否       | 前面指定dump_ir等开关后, 这里指定dump的目录, 默认为空字符串  |
-| benchmark_only   | bool   | 否       | 指定kmodel是否只用于benchmark, 默认为False                   |
+| letterbox_value  | float  | 否       | 指定前处理letterbox的填充值                                                                                                                           |
+| input_type       | string | 否       | 指定输入数据的类型, 默认为'float32'                                                                                                                   |
+| output_type      | string | 否       | 指定输出数据的类型, 如'float32', 'uint8'(仅用于指定量化情况下), 默认为'float32'                                                                       |
+| input_layout     | string | 否       | 指定输入数据的layout, 如'NCHW', 'NHWC'. 若输入数据layout与模型本身layout不同, nncase会插入transpose进行转换                                           |
+| output_layout    | string | 否       | 指定输出数据的layout, 如'NCHW', 'NHWC'. 若输出数据layout与模型本身layout不同, nncase会插入transpose进行转换                                           |
+| model_layout     | string | 否       | 指定模型的layout，默认为空，当tflite模型layout为‘NCHW’，Onnx和Caffe模型layout为‘NHWC’时需指定                                                     |
+| is_fpga          | bool   | 否       | 指定kmodel是否用于fpga, 默认为False                                                                                                                   |
+| dump_ir          | bool   | 否       | 指定是否dump IR, 默认为False                                                                                                                          |
+| dump_asm         | bool   | 否       | 指定是否dump asm汇编文件, 默认为False                                                                                                                 |
+| dump_quant_error | bool   | 否       | 指定是否dump量化前后的模型误差                                                                                                                        |
+| dump_dir         | string | 否       | 前面指定dump_ir等开关后, 这里指定dump的目录, 默认为空字符串                                                                                           |
+| benchmark_only   | bool   | 否       | 指定kmodel是否只用于benchmark, 默认为False                                                                                                            |
 
 > 1. mean和std为浮点数进行normalize的参数，用户可以自由指定.
 > 2. input range为浮点数的范围，即如果输入数据类型为uint8，则input range为反量化到浮点之后的范围（可以不为0~1），可以自由指定.
@@ -729,10 +764,51 @@ if __name__ == '__main__':
 
 ## 部署 nncase runtime
 
-### K210
+### K210上板推理流程
 
-1. 从 [Release](https://github.com/kendryte/nncase/releases) 页面下载 `k210-runtime.zip`。
-2. 解压到 [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1` 目录。
+1. 下载官方[SDK](https://github.com/kendryte/kendryte-standalone-sdk)
+
+   ```shell
+   git clone https://github.com/kendryte/kendryte-standalone-sdk.git
+   cd kendryte-standalone-sdk
+   export KENDRYTE_WORKSPACE=`pwd`
+   ```
+2. 下载交叉编译工具链，并解压
+
+   ```shell
+   wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $KENDRYTE_WORKSPACE/kendryte-toolchain.tar.xz
+   cd $KENDRYTE_WORKSPACE
+   mkdir toolchain
+   tar -xf kendryte-toolchain.tar.xz -C ./toolchain
+   ```
+3. 更新runtime
+
+   从 [Release](https://github.com/kendryte/nncase/releases) 页面下载 `k210-runtime.zip`。解压到 [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 的 `lib/nncase/v1` 目录。
+4. 编译App
+
+   ````shell
+   # 1.将自己的App工程放在`$KENDRYTE_WORKSPACE/src`目录下
+   # 例如，将[example的示例程序]($NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example)目录，拷贝到SDK的src目录下。
+   cp -r $NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example $KENDRYTE_WORKSPACE/src/
+
+   # 2.cmake 编译App
+   cd $KENDRYTE_WORKSPACE
+   mkdir build && cd build
+   cmake .. -DPROJ=facedetect_landmark_example -DTOOLCHAIN=$KENDRYTE_WORKSPACE/toolchain/kendryte-toolchain/bin && make
+   ````
+
+   之后会在当前目录下生成 `facedetect_landmark_example`和 `facedetect_landmark_example.bin`
+5. 烧写App
+
+   ```shell
+   # 1. 检查可用的USB端口
+   ls /dev/ttyUSB*
+   # > /dev/ttyUSB0 /dev/ttyUSB1
+   # 2. 使用kflash进行烧录
+   kflash -p /dev/ttyUSB0 -t facedetect_landmark_example.bin
+   ```
+
+   烧写过程缓慢，需要耐心等待。
 
 ## nncase 推理模型APIs
 
@@ -1166,144 +1242,4 @@ N/A
 
 ```python
 sim.run()
-```
-
-# ncc
-
-## 命令行
-
-```shell
-DESCRIPTION
-NNCASE model compiler and inference tool.
-
-SYNOPSIS
-    ncc compile -i <input format> -t <target>
-        <input file> [--input-prototxt <input prototxt>] <output file> [--output-arrays <output arrays>]
-        [--quant-type <quant type>] [--w-quant-type <w quant type>] [--use-mse-quant-w]
-        [--dataset <dataset path>] [--dataset-format <dataset format>] [--calibrate-method <calibrate method>]
-        [--preprocess] [--swapRB] [--mean <normalize mean>] [--std <normalize std>]
-        [--input-range <input range>] [--input-shape <input shape>] [--letterbox-value <letter box value>]
-        [--input-type <input type>] [--output-type <output type>]
-        [--input-layout <input layout>] [--output-layout <output layout>] [--tcu-num <tcu number>]
-        [--is-fpga] [--dump-ir] [--dump-asm] [--dump-quant-error] [--dump-import-op-range] [--dump-dir <dump directory>]
-        [--dump-range-dataset <dataset path>] [--dump-range-dataset-format <dataset format>] [--benchmark-only]
-
-    ncc infer <input file> <output path>
-        --dataset <dataset path> [--dataset-format <dataset format>]
-        [--input-layout <input layout>]
-
-    ncc [-v]
-
-OPTIONS
-  compile
-
-  -i, --input-format <input format>
-                          input format, e.g. tflite|onnx|caffe
-  -t, --target <target>   target architecture, e.g. cpu|k210|k510
-  <input file>            input file
-  --input-prototxt <input prototxt>
-                          input prototxt
-  <output file>           output file
-  --output-arrays <output arrays>
-                          output arrays
-  --quant-type <quant type>
-                          post trainning quantize type, e.g uint8|int8|int16, default is uint8
-  --w-quant-type <w quant type>
-                          post trainning weights quantize type, e.g uint8|int8|int16, default is uint8
-  --use-mse-quant-w       use min mse algorithm to refine weights quantilization or not, default is 0
-  --dataset <dataset path>
-                          calibration dataset, used in post quantization
-  --dataset-format <dataset format>
-                          datset format: e.g. image|raw, default is image
-  --dump-range-dataset <dataset path>
-                          dump import op range dataset
-  --dump-range-dataset-format <dataset format>
-                          datset format: e.g. image|raw, default is image
-  --calibrate-method <calibrate method>
-                          calibrate method: e.g. no_clip|l2|kld_m0|kld_m1|kld_m2|cdf, default is no_clip
-  --preprocess            enable preprocess, default is 0
-  --swapRB                swap red and blue channel, default is 0
-  --mean <normalize mean> normalize mean, default is 0. 0. 0.
-  --std <normalize std>   normalize std, default is 1. 1. 1.
-  --input-range <input range>
-                          float range after preprocess
-  --input-shape <input shape>
-                          shape for input data
-  --letterbox-value <letter box value>
-                          letter box pad value, default is 0.000000
-  --input-type <input type>
-                          input type, e.g float32|uint8|default, default is default
-  --output-type <output type>
-                          output type, e.g float32|uint8, default is float32
-  --input-layout <input layout>
-                          input layout, e.g NCHW|NHWC, default is NCHW
-  --output-layout <output layout>
-                          output layout, e.g NCHW|NHWC, default is NCHW
-  --tcu-num <tcu number>  tcu number, e.g 1|2|3|4, default is 0
-  --is-fpga               use fpga parameters, default is 0
-  --dump-ir               dump ir to .dot, default is 0
-  --dump-asm              dump assembly, default is 0
-  --dump-quant-error      dump quant error, default is 0
-  --dump-import-op-range  dump import op range, default is 0
-  --dump-dir <dump directory>
-                          dump to directory
-  --benchmark-only        compile kmodel only for benchmark use, default is 0
-
-  infer
-
-  <model filename>        kmodel filename
-  <output path>           output path
-  --dataset <dataset path>
-                          dataset path
-  --dataset-format <dataset format>
-                          dataset format, e.g. image|raw, default is image
-  --input-layout <input layout>
-                          input layout, e.g NCHW|NHWC, default is NCHW
-```
-
-## 描述
-
-`ncc` 是 nncase 的命令行工具。它有两个命令： `compile` 和 `infer`。
-
-`compile` 命令将你训练好的模型 (`.tflite`, `.caffemodel`, `.onnx`) 编译到 `.kmodel`。
-
-- `-i, --input-format` 用来指定输入模型的格式。nncase 现在支持 `tflite`、`caffe` 和 `onnx` 输入格式。
-- `-t, --target` 用来指定你想要你的模型在哪种目标设备上运行。`cpu` 几乎所有平台都支持的通用目标。`k210` 是 Kendryte K210 SoC 平台。如果你指定了 `k210`，这个模型就只能在 K210 运行或在你的 PC 上模拟运行。
-- `<input file>` 用于指定输入模型文件
-- `--input-prototxt`用于指定caffe模型的prototxt文件
-- `<output file>` 用于指定输出模型文件
-- `--output-arrays `用于指定输出结点的名称
-- `--quant-type` 用于指定数据的量化类型, 如 `uint8`/`int8`/`int16, 默认是`uint8
-- `--w-quant-type` 用于指定权重的量化类型, 如 `uint8`/`int8`/`int16, 默认是`uint8
-- `--use-mse-quant-w`指定是否使用最小化mse(mean-square error, 均方误差)算法来量化权重.
-- `--dataset` 用于提供量化校准集来量化你的模型。你需要从训练集中选择几百到上千个数据放到这个目录里。
-- `--dataset-format` 用于指定量化校准集的格式。默认是 `image`，nncase 将使用 `opencv` 读取你的图片，并自动缩放到你的模型输入需要的尺寸。如果你的输入有 3 个通道，ncc 会将你的图片转换为值域是 [0,1] 布局是 `NCHW` 的张量。如果你的输入只有 1 个通道，ncc 会灰度化你的图片。如果你的数据集不是图片（例如音频或者矩阵），把它设置为 `raw`。这种场景下你需要把你的数据集转换为 float 张量的二进制文件。
-- `--dump-range-dataset` 用于提供统计范围数据集来统计原始模型每个节点输出数据范围。你需要从训练集中选择几百到上千个数据放到这个目录里。
-- `--dump-range-dataset-format` 用于指定统计范围数据集的格式。默认是 `image`，nncase 将使用 `opencv` 读取你的图片，并自动缩放到你的模型输入需要的尺寸。如果你的输入有 3 个通道，ncc 会将你的图片转换为值域是 [0,1] 布局是 `NCHW` 的张量。如果你的输入只有 1 个通道，ncc 会灰度化你的图片。如果你的数据集不是图片（例如音频或者矩阵），把它设置为 `raw`。这种场景下你需要把你的数据集转换为 float 张量的二进制文件。
-- `--calibrate-method` 用于设置量化校准方法，它被用来选择最优的激活函数值域。默认值是 `no_clip`，ncc 会使用整个激活函数值域。如果你需要更好的量化结果，你可以使用 `l2`，但它需要花更长的时间寻找最优值域。
-- `--preprocess`指定是否预处理, 添加后表示开启预处理
-- `--swapRB`指定**预处理时**是否交换红和蓝两个通道数据, 用于实现RGB2BGR或BGR2RGB功能
-- `--mean`指定**预处理时**标准化参数均值,例如添加 `--mean "0.1 2.3 33.1f"`用于设置三个通道的均值.
-- `--std`指定**预处理时**标准化参数方差,例如添加 `--std "1. 2. 3."`用于设置三个通道的方差.
-- `--input-range`指定输入数据反量化后的数据范围,例如添加 `--input-range "0.1 2."`设置反量化的范围为 `[0.1~2]`.
-- `--input-shape`指定输入数据的形状. 若与模型的输入形状不同, 则预处理时会做resize/pad等处理, 例如添加 `--input-shape "1 1 28 28"`指明当前输入图像尺寸.
-- `--letterbox-value`用于指定预处理时pad填充的值.
-- `--input-type` 用于指定推理时输入的数据类型。如果 `--input-type` 是 `uint8`，推理时你需要提供 RGB888 uint8 张量。如果 `--input-type` 是 `float`，你则需要提供 RGB float 张量.
-- `--output-type` 用于指定推理时输出的数据类型。如 `float`/`uint8`,  `uint8`仅在量化模型时才有效. 默认是 `float`
-- `--input-layout`用于指定输入数据的layout. 若输入数据的layout与模型的layout不同, 预处理会添加transpose进行转换.
-- `--output-layout`用于指定输出数据的layout
-- `--tcu-num`用于指定tcu个数, 默认值为0, 表示不配置tcu个数.
-- `--is-fpga`指定编译后的kmodel是否运行在fpga上
-- `--dump-ir` 是一个调试选项。当它打开时 ncc 会在工作目录产生一些 `.dot` 文件。你可以使用 `Graphviz` 或 [Graphviz Online](https://dreampuf.github.io/GraphvizOnline) 来查看这些文件。
-- `--dump-asm` 是一个调试选项。当它打开时 ncc 会生成硬件指令文件compile.text.asm
-- `--dump-quant-error`是一个调试选项, 用于dump量化错误信息
-- `--dump-import-op-range`是一个调试选项, 用于dump import之后节点的数据范围，需要同时指定dump-range-dataset
-- `--dump-dir`是一个调试选项, 用于指定dump目录.
-- `--benchmark-only`是一个调试选项, 用于指定编译后的kmodel用于benchmark.
-
-`infer` 命令可以运行你的 kmodel，通常它被用来调试。ncc 会将你模型的输出张量按 `NCHW` 布局保存到 `.bin` 文件。
-
-- `<input file>` kmodel 的路径。
-- `<output path>` ncc 输出目录。
-- `--dataset` 测试集路径。
-- `--dataset-format`和 `--input-layout`同 `compile` 命令中的含义。
+```
\ No newline at end of file
diff --git a/docs/onnx_ops.md b/docs/onnx_ops.md
index 2ea9159538..7c1cfb900f 100644
--- a/docs/onnx_ops.md
+++ b/docs/onnx_ops.md
@@ -19,6 +19,7 @@
 | Ceil | ✅ |
 | Celu | ✅ |
 | Clip | ✅ |
+| Compress | ✅ |
 | Concat | ✅ |
 | Constant | ✅ |
 | ConstantOfShape | ✅ |
@@ -35,20 +36,24 @@
 | Exp | ✅ |
 | Expand | ✅ |
 | Equal | ✅ |
+| Erf | ✅ |
 | Flatten | ✅ |
 | Floor | ✅ |
 | Gather | ✅ |
+| GatherElements | ✅ |
 | GatherND | ✅ |
 | Gemm | ✅ |
 | GlobalAveragePool | ✅ |
 | GlobalMaxPool | ✅ |
 | Greater | ✅ |
 | GreaterOrEqual | ✅ |
+| GRU | ✅ |
 | Hardmax | ✅ |
 | HardSigmoid | ✅ |
 | HardSwish | ✅ |
 | Identity | ✅ |
 | InstanceNormalization | ✅ |
+| LayerNormalization | ✅ |
 | LpNormalization | ✅ |
 | LeakyRelu | ✅ |
 | Less | ✅ |
@@ -89,6 +94,7 @@
 | ReverseSequence | ✅ |
 | RoiAlign | ✅ |
 | Round | ✅ |
+| Rsqrt | ✅ |
 | Selu | ✅ |
 | Shape | ✅ |
 | Sign | ✅ |
@@ -111,6 +117,7 @@
 | TopK | ✅ |
 | Transpose | ✅ |
 | Trilu | ✅ |
+| ThresholdedRelu | ✅ |
 | Upsample | ✅ |
 | Unsqueeze | ✅ |
 | Where | ✅ |
diff --git a/examples/user_guide/README.md b/examples/user_guide/README.md
new file mode 100644
index 0000000000..34aaf7744a
--- /dev/null
+++ b/examples/user_guide/README.md
@@ -0,0 +1,15 @@
+模型编译推理参考Jupyter脚本：[User_guide](./simulate.ipynb)，脚本中包含了单输入和多输入的示例。也可以使用单独的编译脚本 [Single build](../../docs/USAGE_ZH.md#编译模型示例)完成kmodel的编译。
+
+如果在Docker中运行Jupyter脚本，可以参考[配置Jupyter lab](https://github.com/kunjing96/docker-jupyterlab#32-%E9%85%8D%E7%BD%AEjupyter-lab)进行配置。
+
+在执行脚本之前需要根据自身需求修改以下内容：
+
+1. `compile_kmodel`函数中 `compile_options`,`ptq_options`相关信息
+   `compile_options`详细信息见[CompileOptions](../../docs/USAGE_ZH.md#CompileOptions)
+   `ptq_options`详细信息见[PTQTensorOptions](../../docs/USAGE_ZH.md#PTQTensorOptions)
+2. `compile kmodel single input(multiple inputs)`部分
+   修改 `model_path`和 `dump_path`，用于指定模型路径和编译期间文件生成路径。
+   修改 `calib_data`的实现，数据格式见注释。
+3. `run kmodel(simulate)`部分，修改 `input_data`的实现，数据格式见注释。
+
+推理结束后，会在 `dump_path`路径下生成 `kmodel`、输出结果和编译期间的文件。
\ No newline at end of file
diff --git a/examples/user_guide/nncase_base_func.py b/examples/user_guide/nncase_base_func.py
new file mode 100644
index 0000000000..a624e9c0a7
--- /dev/null
+++ b/examples/user_guide/nncase_base_func.py
@@ -0,0 +1,95 @@
+import os
+
+import numpy as np
+import onnx
+import onnxsim
+from sklearn.metrics.pairwise import cosine_similarity
+
+import nncase
+
+
+def get_cosine(vec1, vec2):
+    """
+    Compare two results: return sklearn's cosine_similarity of vec1 and vec2, each reshaped to one row.
+    """
+    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))
+
+
+
+def read_model_file(model_file):
+    """
+    Open the file at model_file in binary mode and return its whole content as bytes.
+    """
+    with open(model_file, 'rb') as f:
+        model_content = f.read()
+    return model_content
+
+
+def parse_model_input_output(model_file):
+    """
+    Load an ONNX model; return (model, inputs), where inputs holds one name/dtype/shape dict per non-initializer graph input.
+    """
+    onnx_model = onnx.load(model_file)
+    input_all = [node.name for node in onnx_model.graph.input]
+    input_initializer = [node.name for node in onnx_model.graph.initializer]
+    input_names = list(set(input_all) - set(input_initializer))
+    input_tensors = [
+        node for node in onnx_model.graph.input if node.name in input_names]
+
+    # describe each remaining input: its name, numpy dtype (via onnx.mapping) and the dim_value of every dimension
+    inputs = []
+    for _, e in enumerate(input_tensors):
+        onnx_type = e.type.tensor_type
+        input_dict = {}
+        input_dict['name'] = e.name
+        input_dict['dtype'] = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[onnx_type.elem_type]
+        input_dict['shape'] = [i.dim_value for i in onnx_type.shape.dim]
+        inputs.append(input_dict)
+
+    return onnx_model, inputs
+
+def model_simplify(model_file):
+    """
+    Return the model path to use: a 'simplified.onnx' written beside an .onnx input, or a .tflite path unchanged; other extensions raise Exception.
+    """
+    if model_file.split('.')[-1] == "onnx":
+        onnx_model, inputs = parse_model_input_output(model_file)
+        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
+        input_shapes = {}
+        for input in inputs:
+            input_shapes[input['name']] = input['shape']
+    
+        onnx_model, check = onnxsim.simplify(onnx_model, overwrite_input_shapes=input_shapes)
+        assert check, "Simplified ONNX model could not be validated"
+    
+        model_file = os.path.join(os.path.dirname(model_file), 'simplified.onnx')
+        onnx.save_model(onnx_model, model_file)
+        print("[ onnx done ]")
+    elif model_file.split('.')[-1] == "tflite":
+        print("[ tflite pass ]")
+    else:
+        raise Exception(f"Unsupport type {model_file.split('.')[-1]}")
+        
+    return model_file
+
+def run_kmodel(kmodel_path, input_data):  # run a kmodel in nncase.Simulator; returns a list of numpy outputs
+    print("\n---------start run kmodel---------")
+    print("Load kmodel...")
+    model_sim = nncase.Simulator()
+    with open(kmodel_path, 'rb') as f:
+        model_sim.load_model(f.read())  # the whole kmodel file is handed over as bytes
+    
+    print("Set input data...")
+    for i, p_d in enumerate(input_data):  # input_data[i] is bound to input index i
+        model_sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(p_d))
+    
+    print("Run...")
+    model_sim.run()
+    
+    print("Get output result...")
+    all_result = []
+    for i in range(model_sim.outputs_size):  # collect every output in index order
+        result = model_sim.get_output_tensor(i).to_numpy()
+        all_result.append(result)
+    print("----------------end-----------------")
+    return all_result
\ No newline at end of file
diff --git a/examples/user_guide/simulate.ipynb b/examples/user_guide/simulate.ipynb
new file mode 100644
index 0000000000..6a9a041eaa
--- /dev/null
+++ b/examples/user_guide/simulate.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82a8f9c1-c2bf-4270-9f1f-ac25c9fdd898",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install --upgrade pip\n",
+    "#!pip uninstall -y nncase\n",
+    "!pip install nncase==1.9.0.20230322 --timeout=1000\n",
+    "#from versions: 1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, \n",
+    "# 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322, 2.0.0.20230602, 2.1.0.20230703)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7eff82e-295c-4cce-afbc-ce64c84dc40a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nncase\n",
+    "from nncase_base_func import *\n",
+    "# from parse_model import *\n",
+    "\n",
+    "\n",
+    "def compile_kmodel(model_path, dump_path, calib_data):\n",
+    "    \"\"\"\n",
+    "    Set compile options and ptq options.\n",
+    "    Compile kmodel.\n",
+    "    Dump the compile-time result to 'compile_options.dump_dir'\n",
+    "    \"\"\"\n",
+    "    print(\"----------model simplify----------\")\n",
+    "    model_file = model_simplify(model_path)\n",
+    "\n",
+    "    print(\"---------- set  options ----------\")\n",
+    "    # import_options\n",
+    "    import_options = nncase.ImportOptions()\n",
+    "    \n",
+    "    # compile_options\n",
+    "    compile_options = nncase.CompileOptions()\n",
+    "    compile_options.target = \"k210\" # \"cpu\" \"k510\"\n",
+    "    compile_options.dump_ir = True  # if False, will not dump the compile-time result.\n",
+    "    compile_options.dump_asm = True\n",
+    "    compile_options.dump_dir = dump_path\n",
+    "\n",
+    "    # preprocess args\n",
+    "    compile_options.preprocess = True\n",
+    "    if compile_options.preprocess:\n",
+    "        compile_options.input_type = \"uint8\" # \"uint8\"\n",
+    "        compile_options.swapRB = False\n",
+    "        compile_options.input_shape = [1,224,320,3]\n",
+    "        compile_options.input_range = [0,1]\n",
+    "        compile_options.mean = [0,0,0]\n",
+    "        compile_options.std = [1,1,1]\n",
+    "        compile_options.input_layout = \"NHWC\" # \"NHWC\"\n",
+    "        compile_options.output_layout = \"NHWC\" # \"NHWC\"\n",
+    "        compile_options.letterbox_value = 0\n",
+    "    \n",
+    "    # quant args\n",
+    "    compile_options.quant_type = \"uint8\" \n",
+    "    compile_options.w_quant_type = \"uint8\"\n",
+    "    compile_options.use_mse_quant_w = True\n",
+    "    compile_options.split_w_to_act = False\n",
+    "\n",
+    "    # quant options\n",
+    "    ptq_options = nncase.PTQTensorOptions()\n",
+    "    ptq_options.calibrate_method = \"no_clip\" # \"kld_m2\" \"l2\" \"cdf\"\n",
+    "    ptq_options.samples_count = len(calib_data[0])\n",
+    "    ptq_options.set_tensor_data(np.array(calib_data).tobytes())\n",
+    "\n",
+    "    \n",
+    "    # set options\n",
+    "    compiler = nncase.Compiler(compile_options)\n",
+    "    compiler.use_ptq(ptq_options)\n",
+    "    \n",
+    "    print(\"----------   compile    ----------\")\n",
+    "    # import\n",
+    "    model_content = read_model_file(model_file)\n",
+    "    if model_path.split(\".\")[-1] == \"onnx\":\n",
+    "        compiler.import_onnx(model_content, import_options)\n",
+    "    elif model_path.split(\".\")[-1] == \"tflite\":\n",
+    "        compiler.import_tflite(model_content, import_options)\n",
+    "\n",
+    "    # compile\n",
+    "    compiler.compile()\n",
+    "    kmodel = compiler.gencode_tobytes()\n",
+    "    \n",
+    "    kmodel_path = os.path.join(dump_path, \"test.kmodel\")\n",
+    "    with open(kmodel_path, 'wb') as f:\n",
+    "        f.write(kmodel)\n",
+    "    print(\"----------  compile end ----------\")\n",
+    "    return kmodel_path\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c957fe20-99c9-4a54-bae8-38361a8f8830",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compile kmodel single input\n",
+    "model_path = \"./model_f32.tflite\"\n",
+    "dump_path = \"./tmp\"\n",
+    "\n",
+    "# If the model has multiple inputs, calib_data format is \"[[x1, x2,...], [y1, y2,...], ...]\"\n",
+    "# e.g. if the model has three inputs (x, y, z), calib_data is '[[x1, x2, x3],[y1, y2, y3],[z1, z2, z3]]'\n",
+    "\n",
+    "calib_data = [[np.random.rand(1,224,320,3).astype(np.float32), np.random.rand(1,224,320,3).astype(np.float32)]]\n",
+    "kmodel_path = compile_kmodel(model_path, dump_path, calib_data)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f617edc-781c-4b8b-b45d-fef2f0b36a46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run kmodel(simulate)\n",
+    "kmodel_path = \"./tmp/test.kmodel\"\n",
+    "input_data = [np.random.rand(1,224,320,3).astype(np.float32)]\n",
+    "\n",
+    "result = run_kmodel(kmodel_path, input_data)\n",
+    "for idx, i in enumerate(result):\n",
+    "    print(i.shape)\n",
+    "    i.tofile(os.path.join(dump_path, \"nncase_result_{}.bin\".format(idx)))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89280d3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compile kmodel multiple inputs\n",
+    "model_path = \"./decoder_100.onnx\"\n",
+    "dump_path = \"./tmp_dec\"\n",
+    "\n",
+    "# If model has multiple inputs, calib_data format is \"[[x1, x2,...], [y1, y2,...], ...]\"\n",
+    "# e.g. Model has three inputs (x, y, z), the calib_data is '[[x1, x2, x3],[y1, y2, y3],[z1, z2, z3]]'\n",
+    "\n",
+    "calib_data = [[np.random.randint(1, 5, size=[3, 100], dtype='int64'), np.random.randint(1, 5, size=[3, 100], dtype='int64')],\n",
+    "              [np.random.rand(100, 3, 192).astype(np.float32), np.random.rand(100, 3, 192).astype(np.float32)],\n",
+    "              [np.random.rand(3, 100).astype(np.float32) > 0.5, np.random.rand(3, 100).astype(np.float32) > 0.5], ]  # bool\n",
+    "\n",
+    "kmodel_path = compile_kmodel(model_path, dump_path, calib_data)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22a25a7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run kmodel(simulate)\n",
+    "import os\n",
+    "\n",
+    "kmodel_path = \"./tmp_dec/test.kmodel\"\n",
+    "input_data = [np.random.randint(1, 5, size=[3, 100], dtype='int64'),\n",
+    "              np.random.rand(100, 3, 192).astype(np.float32),\n",
+    "              np.random.rand(3, 100).astype(np.float32) > 0.5, ]\n",
+    "\n",
+    "result = run_kmodel(kmodel_path, input_data)\n",
+    "\n",
+    "for idx, i in enumerate(result):\n",
+    "    print(i.shape)\n",
+    "    i.tofile(os.path.join(dump_path, \"nncase_result_{}.bin\".format(idx)))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/yolox/README.md b/examples/yolox/README.md
index 3dda17b474..893c7e7c4f 100644
--- a/examples/yolox/README.md
+++ b/examples/yolox/README.md
@@ -78,7 +78,11 @@ mv xxx.bin k210/yolox_detect_example/input.bin
 
 ## 定点模型推理测试
 
-使用最新的[裸机sdk](https://github.com/kendryte/kendryte-standalone-sdk/tree/develop),将`yolox_detect_example`拷贝到`src`目录下,然后进行编译(请参考裸机sdk使用指南,首先配置好工具链等相关环境)
+使用git clone的develop分支的[裸机sdk](https://github.com/kendryte/kendryte-standalone-sdk/tree/develop),将`yolox_detect_example`拷贝到`src`目录下.
+
+如果您目前使用的nncase版本大于1.0.0, 请参考[这里](https://github.com/kendryte/nncase/blob/master/docs/USAGE_ZH.md#部署-nncase-runtime)更新sdk中对应的nncase runtime版本(runtime版本需要与自身所使用的nncase版本相匹配).
+
+按照如下命令编译与烧录(请参考裸机sdk使用指南,首先配置好工具链等相关环境)
 ```bash
 mkdir build && cd build
 cmake .. -DPROJ=yolox_detect_example -DTOOLCHAIN=/usr/local/opt/kendryte-toolchain/bin
@@ -88,8 +92,10 @@ kflash yolox_detect_example.bin -B kd233 -p /dev/cu.usbserial-1130 -b 2000000 -t
 
 ⚠️不同的电脑上usb端口号并不一致.
 
+⚠️使用example中提供的kmodel时无需更新runtime.
+
 可能的结果:
 ![demo](demo.jpg)
 
 # 致谢
-[YOLOX](https://github.com/Megvii-BaseDetection/YOLOX)
\ No newline at end of file
+[YOLOX](https://github.com/Megvii-BaseDetection/YOLOX)
diff --git a/include/nncase/codegen/module_builder.h b/include/nncase/codegen/module_builder.h
index 55416204f7..e91594c398 100644
--- a/include/nncase/codegen/module_builder.h
+++ b/include/nncase/codegen/module_builder.h
@@ -93,6 +93,7 @@ class NNCASE_API module_builder
     section *find_section(std::string_view section_name);
     void merge_to_rdata_section(std::string_view from);
     function_call_id function_id(ir::graph *graph);
+    std::streampos get_current_entry_point(); // Read-side counterpart of set_current_entry_point() below; presumably returns the position last stored there — confirm in the implementation.
     void set_current_entry_point(std::streampos pos);
     void set_current_function_text_end(std::streampos pos);
 
diff --git a/include/nncase/codegen/nnil_builder.h b/include/nncase/codegen/nnil_builder.h
index a453449a69..3faa2f7789 100644
--- a/include/nncase/codegen/nnil_builder.h
+++ b/include/nncase/codegen/nnil_builder.h
@@ -65,6 +65,8 @@ class NNCASE_API nnil_builder
     void emit_pow() { emit_opcode(runtime::nnil_pow); }
     void emit_clamp() { emit_opcode(runtime::nnil_clamp); }
 
+    // Emits the runtime::nnil_erf opcode via emit_opcode, like the other emit_* one-liners here.
+    void emit_erf() { emit_opcode(runtime::nnil_erf); }
     void emit_ret() { emit_opcode(runtime::nnil_ret); }
 
 private:
diff --git a/include/nncase/codegen/stackvm/op_writer.h b/include/nncase/codegen/stackvm/op_writer.h
index 668aa9f314..dcff068765 100644
--- a/include/nncase/codegen/stackvm/op_writer.h
+++ b/include/nncase/codegen/stackvm/op_writer.h
@@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00.
  *
  * Copyright 2019-2021 Canaan Inc.
  *
@@ -1366,6 +1366,22 @@ struct op_writer<nncase::runtime::stackvm::tensor_softmax_op_t>
     }
 };
 
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_space_to_batch_op_t> // Writes opcode (u8), funct (u16), datatype (u8), then the five operand fields in the order listed below.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_space_to_batch_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(static_cast<uint8_t>(op.datatype));
+        writer.write(op.rshape_src);
+        writer.write(op.rstride_src);
+        writer.write(op.rstride_dest);
+        writer.write(op.rshape_block);
+        writer.write(op.rpad_crops);
+    }
+};
+
 template <>
 struct op_writer<nncase::runtime::stackvm::tensor_ternary_op_t>
 {
@@ -1449,6 +1465,97 @@ struct op_writer<nncase::runtime::stackvm::tensor_transpose_op_t>
     }
 };
 
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_gru_op_t> // Writes opcode (u8), funct (u16), then the four GRU fields in the order listed below; no datatype byte is written.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_gru_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(op.input_shape_src);
+        writer.write(op.w_shape_src);
+        writer.write(op.direction);
+        writer.write(op.linear_before_reset);
+    }
+};
+
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_tflite_detection_postprocess_op_t> // Writes opcode (u8), funct (u16), then the fourteen attribute fields in the order listed below; no datatype byte is written.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_tflite_detection_postprocess_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(op.box_shape_src);
+        writer.write(op.score_shape_src);
+        writer.write(op.anchor_shape_src);
+        writer.write(op.max_detections);
+        writer.write(op.max_classes_per_detection);
+        writer.write(op.detections_per_class);
+        writer.write(op.use_regular_non_max_suppression);
+        writer.write(op.nms_score_threshold);
+        writer.write(op.nms_iou_threshold);
+        writer.write(op.num_classes);
+        writer.write(op.y_scale);
+        writer.write(op.x_scale);
+        writer.write(op.h_scale);
+        writer.write(op.w_scale);
+    }
+};
+
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_layer_normalization_op_t> // Writes opcode (u8), funct (u16), datatype (u8), then input_shape, axis and epsilon.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_layer_normalization_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(static_cast<uint8_t>(op.datatype));
+        writer.write(op.input_shape);
+        writer.write(op.axis);
+        writer.write(op.epsilon);
+    }
+};
+
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_compress_op_t> // Writes opcode (u8), funct (u16), then input_shape_src, condition_shape_src and axis; no datatype byte is written.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_compress_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(op.input_shape_src);
+        writer.write(op.condition_shape_src);
+        writer.write(op.axis);
+    }
+};
+
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_gather_elements_op_t> // Writes opcode (u8), funct (u16), then input_shape_src, indices_shape_src and axis; no datatype byte is written.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_gather_elements_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(op.input_shape_src);
+        writer.write(op.indices_shape_src);
+        writer.write(op.axis);
+    }
+};
+
+template <>
+struct op_writer<nncase::runtime::stackvm::tensor_instance_normalization_op_t> // Writes opcode (u8), funct (u16), datatype (u8), then input_shape and epsilon.
+{
+    void operator()(const nncase::runtime::stackvm::tensor_instance_normalization_op_t &op, binary_writer &writer) const
+    {
+        writer.write(static_cast<uint8_t>(op.opcode));
+        writer.write(static_cast<uint16_t>(op.funct));
+        writer.write(static_cast<uint8_t>(op.datatype));
+        writer.write(op.input_shape);
+        writer.write(op.epsilon);
+    }
+};
+
 class NNCASE_API op_builder
 {
 public:
@@ -1579,11 +1686,18 @@ class NNCASE_API op_builder
     void tensor_sigmoid_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest);
     void tensor_slice_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rbegins, uint8_t rends, uint8_t rstrides);
     void tensor_softmax_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, int32_t axis, float beta);
+    void tensor_space_to_batch_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops); // Parameter names match the fields written by op_writer<tensor_space_to_batch_op_t>.
     void tensor_ternary_(datatype_t datatype, uint8_t rshape_src1, uint8_t rstride_src1, uint8_t rshape_src2, uint8_t rstride_src2, uint8_t rshape_src3, uint8_t rstride_src3, uint8_t rstride_dest);
     void tensor_topk_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rshape_dest1, uint8_t rstride_dest1, uint8_t rshape_dest2, uint8_t rstride_dest2, int64_t k, int32_t axis, bool largest, bool sorted);
     void tensor_trilu_(datatype_t datatype, uint8_t rshape_src, bool upper, int64_t k);
     void tensor_unary_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, unary_op_t unary_op);
     void tensor_transpose_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_perm);
+    void tensor_gru_(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset); // Parameter names match the fields written by op_writer<tensor_gru_op_t>.
+    void tensor_tflite_detection_postprocess_(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale);
+    void tensor_layer_normalization_(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon);
+    void tensor_compress_(uint8_t input_shape_src, uint8_t condition_shape_src, float axis); // NOTE(review): axis is float here but int32_t in tensor_gather_elements_ below and in ir::compress::axis() — confirm the generator spec intends this.
+    void tensor_gather_elements_(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis);
+    void tensor_instance_normalization_(datatype_t datatype, uint8_t input_shape, float epsilon);
 
 private:
     section_writer &writer_;
diff --git a/include/nncase/ir/graph.h b/include/nncase/ir/graph.h
index 800833ca9c..5486b33b8c 100644
--- a/include/nncase/ir/graph.h
+++ b/include/nncase/ir/graph.h
@@ -72,7 +72,7 @@ class NNCASE_API graph
     void dce();
     void cse();
     void merge_module_regions();
-    split_graph_result split_subgraph(std::span<node *const> nodes);
+    split_graph_result split_subgraph(std::span<node *const> nodes, bool reorder_input = false); // reorder_input defaults to false, so existing one-argument callers compile unchanged; what it reorders is defined in the implementation (not visible here).
     graph &add_subgraph(std::unique_ptr<graph> subgraph);
 
 private:
diff --git a/include/nncase/ir/ir_types.h b/include/nncase/ir/ir_types.h
index 149e6d4b14..77582cb97c 100644
--- a/include/nncase/ir/ir_types.h
+++ b/include/nncase/ir/ir_types.h
@@ -30,7 +30,8 @@ enum node_attributes
     node_attr_need_quantize = 2,
     node_attr_fuse_input_slice = 4,
     node_attr_fuse_output_concat = 8,
-    node_attr_skip_constant_folding = 16
+    node_attr_skip_constant_folding = 16,
+    node_attr_skip_quantize = 32,
 };
 
 enum connector_attributes
diff --git a/include/nncase/ir/op_utils.h b/include/nncase/ir/op_utils.h
index 07aa5abc0b..0f379661d5 100644
--- a/include/nncase/ir/op_utils.h
+++ b/include/nncase/ir/op_utils.h
@@ -75,10 +75,22 @@ inline size_t get_bytes(datatype_t type, const shape_t &shape)
     return xt::compute_size(shape) * get_bytes(type);
 }
 
+// Fills `strides` with the row-major (last axis fastest) element strides of
+// `shape`: the last axis gets 1 and every earlier axis gets the product of all
+// later extents. Size-1 axes keep their natural stride; nothing is zeroed.
+// `strides` must already hold at least shape.size() elements.
+template <class shape_type, class strides_type>
+inline void compute_strides(const shape_type &shape, strides_type &strides)
+{
+    using stride_t = typename std::decay_t<strides_type>::value_type;
+    stride_t running = 1;
+    // Walk the axes from innermost to outermost.
+    for (std::size_t axis = shape.size(); axis-- > 0;)
+    {
+        strides[axis] = running;
+        running = strides[axis] * static_cast<stride_t>(shape[axis]);
+    }
+}
+
 inline nncase::ir::shape_t to_strides(const nncase::ir::shape_t &shape)
 {
     nncase::ir::shape_t strides(shape.size());
-    xt::compute_strides(shape, xt::layout_type::row_major, strides);
+    compute_strides(shape, strides);
     return strides;
 }
 
@@ -373,6 +385,13 @@ inline bool is_simple_slice(const axis_t &begin, const axis_t &end, const axis_t
     return is_simple_slice;
 }
 
+// Builds the shape used for instance-norm constants: its rank is one less than
+// in_shape's, dimension 0 is in_shape[1], and every other dimension is 1.
+// Inputs of rank < 2 have no dimension 1, so an empty shape is returned for
+// them instead of indexing out of bounds (size() - 1 would also wrap around
+// for a rank-0 input).
+inline shape_t get_instancenorm_const_shape(const shape_t &in_shape)
+{
+    if (in_shape.size() < 2)
+        return shape_t {};
+
+    shape_t const_shape(in_shape.size() - 1, 1);
+    const_shape[0] = in_shape[1];
+    return const_shape;
+}
+
 inline bool is_axis0_squeeze_or_expand_dim_bitcast(const shape_t &in_shape, const shape_t &out_shape)
 {
     auto in_begin = std::find_if_not(in_shape.begin(), in_shape.end(), [](size_t dim) { return dim == 1; });
diff --git a/include/nncase/ir/opcode.def b/include/nncase/ir/opcode.def
index 9d656e5008..0fd5ccfb26 100644
--- a/include/nncase/ir/opcode.def
+++ b/include/nncase/ir/opcode.def
@@ -46,4 +46,10 @@ DEFINE_NEUTRAL_OPCODE(trilu,                Trilu,              0x124)
 DEFINE_NEUTRAL_OPCODE(sigmoid,              Sigmoid,            0x125)
 DEFINE_NEUTRAL_OPCODE(roi_align,            RoiAlign,           0x126)
 DEFINE_NEUTRAL_OPCODE(compare,              Compare,            0x127)
-DEFINE_NEUTRAL_OPCODE(softmax,              Softmax,            0x128)
\ No newline at end of file
+DEFINE_NEUTRAL_OPCODE(softmax,              Softmax,            0x128)
+// Ops added after softmax (0x128); ids continue sequentially.
+DEFINE_NEUTRAL_OPCODE(gru,                  GRU,                0x129)
+DEFINE_NEUTRAL_OPCODE(tflite_detection_postprocess, TfliteDetectionPostprocess, 0x12A)
+DEFINE_NEUTRAL_OPCODE(layernorm,            LayerNormalization, 0x12B)
+DEFINE_NEUTRAL_OPCODE(compress,             Compress,           0x12C)
+DEFINE_NEUTRAL_OPCODE(gather_elements,      GatherElements,     0x12D)
+DEFINE_NEUTRAL_OPCODE(instancenorm,         InstanceNormalization, 0x12E)
diff --git a/include/nncase/ir/ops/compress.h b/include/nncase/ir/ops/compress.h
new file mode 100644
index 0000000000..1ee9282a91
--- /dev/null
+++ b/include/nncase/ir/ops/compress.h
@@ -0,0 +1,40 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include <xtensor/xtensor.hpp>
+
+namespace nncase::ir
+{
+class NNCASE_API compress : public node // IR node tagged op_compress; exposes inputs 0-1, output 0 and an integer axis attribute.
+{
+public:
+    DEFINE_NODE_OPCODE(op_compress);
+
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &condition() { return input_at(1); } // input slot 1
+    output_connector &output() { return output_at(0); } // output slot 0
+
+    int32_t axis() const noexcept { return axis_; }
+
+    compress(datatype_t type, shape_t input_shape, shape_t condition_shape, shape_t output_shape, int32_t axis); // Defined out of line; only one datatype is passed — presumably shared by input and output, confirm in the implementation.
+
+protected:
+    bool properties_equal(node &other) const override; // Overrides the node hook; presumably compares axis_ — confirm in the implementation.
+
+private:
+    int32_t axis_;
+};
+}
diff --git a/include/nncase/ir/ops/gather_elements.h b/include/nncase/ir/ops/gather_elements.h
new file mode 100644
index 0000000000..fd9bf44c95
--- /dev/null
+++ b/include/nncase/ir/ops/gather_elements.h
@@ -0,0 +1,40 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include <xtensor/xtensor.hpp>
+
+namespace nncase::ir
+{
+class NNCASE_API gather_elements : public node // IR node tagged op_gather_elements; exposes inputs 0-1, output 0 and an integer axis attribute.
+{
+public:
+    DEFINE_NODE_OPCODE(op_gather_elements);
+
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &indices() { return input_at(1); } // input slot 1
+    output_connector &output() { return output_at(0); } // output slot 0
+
+    int32_t axis() const noexcept { return axis_; }
+
+    gather_elements(datatype_t in_type, datatype_t indices_type, shape_t input_shape, shape_t indices_shape, shape_t output_shape, int32_t axis); // Defined out of line; takes separate datatypes for the data input and the indices input.
+
+protected:
+    bool properties_equal(node &other) const override; // Overrides the node hook; presumably compares axis_ — confirm in the implementation.
+
+private:
+    int32_t axis_;
+};
+}
diff --git a/include/nncase/ir/ops/gru.h b/include/nncase/ir/ops/gru.h
new file mode 100644
index 0000000000..6c27e9e87a
--- /dev/null
+++ b/include/nncase/ir/ops/gru.h
@@ -0,0 +1,50 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include <xtensor/xtensor.hpp>
+
+namespace nncase::ir
+{
+class NNCASE_API gru : public node // IR node tagged op_gru; declares six input accessors (slots 0-5) and two output accessors.
+{
+public:
+    DEFINE_NODE_OPCODE(op_gru);
+
+    input_connector &input() { return input_at(0); }
+    input_connector &w() { return input_at(1); }
+    input_connector &r() { return input_at(2); }
+    input_connector &b() { return input_at(3); }
+    input_connector &initial_h() { return input_at(4); }
+    input_connector &initial_c() { return input_at(5); } // NOTE(review): slot 5; the constructor below takes no shape for this input — confirm the node really creates a sixth input before calling this.
+    output_connector &output() { return output_at(0); }
+    output_connector &output_h() { return output_at(1); }
+
+    lstm_direction direction() const noexcept { return direction_; } // Reuses the lstm_direction type, which is declared outside this header.
+    std::string framework() const noexcept { return framework_; } // NOTE(review): returns std::string by value under noexcept — if the copy throws, std::terminate is called.
+    bool linear_before_reset() const noexcept { return linear_before_reset_; }
+
+    gru(shape_t input_shape, shape_t w_shape, shape_t r_shape, shape_t b_shape, shape_t output_shape,
+        shape_t output_h_shape, lstm_direction direction, std::string framework, bool linear_before_reset); // Defined out of line; takes shapes for input/w/r/b and both outputs only.
+
+protected:
+    bool properties_equal(node &other) const override; // Overrides the node hook; which attributes it compares is defined in the implementation.
+
+private:
+    lstm_direction direction_;
+    std::string framework_;
+    bool linear_before_reset_;
+};
+}
diff --git a/include/nncase/ir/ops/instancenorm.h b/include/nncase/ir/ops/instancenorm.h
new file mode 100644
index 0000000000..9902f99bc0
--- /dev/null
+++ b/include/nncase/ir/ops/instancenorm.h
@@ -0,0 +1,39 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include "nncase/ir/connectors.h"
+
+namespace nncase::ir
+{
+class NNCASE_API instancenorm : public node // IR node tagged op_instancenorm; exposes inputs 0-2 (input, scale, bias), output 0 and a float epsilon attribute.
+{
+public:
+    DEFINE_NODE_OPCODE(op_instancenorm);
+
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &scale() { return input_at(1); } // input slot 1
+    input_connector &bias() { return input_at(2); } // input slot 2
+    output_connector &output() { return output_at(0); } // output slot 0
+    float epsilon() const noexcept { return epsilon_; }
+    instancenorm(datatype_t input_type, shape_t input_shape, float epsilon); // Only the input's type and shape are passed; scale/bias/output shapes are presumably derived in the implementation — confirm.
+
+protected:
+    bool properties_equal(node &other) const override; // Overrides the node hook; presumably compares epsilon_ — confirm in the implementation.
+
+private:
+    float epsilon_;
+};
+}
diff --git a/include/nncase/ir/ops/layernorm.h b/include/nncase/ir/ops/layernorm.h
new file mode 100644
index 0000000000..79581c6f92
--- /dev/null
+++ b/include/nncase/ir/ops/layernorm.h
@@ -0,0 +1,42 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include "nncase/ir/connectors.h"
+
+namespace nncase::ir
+{
+class NNCASE_API layernorm : public node // IR node tagged op_layernorm; exposes inputs 0-2 (input, scale, bias), output 0, an integer axis and a float epsilon.
+{
+public:
+    DEFINE_NODE_OPCODE(op_layernorm);
+
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &scale() { return input_at(1); } // input slot 1
+    input_connector &bias() { return input_at(2); } // input slot 2
+    output_connector &output() { return output_at(0); } // output slot 0
+    int32_t axis() const noexcept { return axis_; }
+    float epsilon() const noexcept { return epsilon_; }
+    layernorm(datatype_t input_type, shape_t input_shape, int32_t axis, float epsilon); // Only the input's type and shape are passed; scale/bias/output shapes are presumably derived in the implementation — confirm.
+
+protected:
+    bool properties_equal(node &other) const override; // Overrides the node hook; presumably compares axis_ and epsilon_ — confirm in the implementation.
+
+private:
+    int32_t axis_;
+    float epsilon_;
+    // TODO: support stash_type
+};
+}
diff --git a/include/nncase/ir/ops/tflite_detection_postprocess.h b/include/nncase/ir/ops/tflite_detection_postprocess.h
new file mode 100644
index 0000000000..1d88c3abfa
--- /dev/null
+++ b/include/nncase/ir/ops/tflite_detection_postprocess.h
@@ -0,0 +1,74 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include <xtensor/xtensor.hpp>
+
+namespace nncase::ir
+{
+// IR node tagged op_tflite_detection_postprocess. Exposes three inputs
+// (boxes, scores, anchors) and four outputs (locations, classes, scores,
+// number of detections), plus read-only getters for the scalar attributes
+// supplied to the constructor.
+class NNCASE_API tflite_detection_postprocess : public node
+{
+public:
+    DEFINE_NODE_OPCODE(op_tflite_detection_postprocess);
+
+    // Input connectors, slots 0-2.
+    input_connector &boxes() { return input_at(0); }
+    input_connector &scores() { return input_at(1); }
+    input_connector &anchors() { return input_at(2); }
+
+    // Output connectors, slots 0-3.
+    output_connector &output_locations() { return output_at(0); }
+    output_connector &output_classes() { return output_at(1); }
+    output_connector &output_scores() { return output_at(2); }
+    output_connector &output_num_detections() { return output_at(3); }
+
+    // Attribute getters; each returns the matching private member unchanged.
+    int32_t max_detections() const noexcept { return max_detections_; }
+    int32_t max_classes_per_detection() const noexcept { return max_classes_per_detection_; }
+    int32_t detections_per_class() const noexcept { return detections_per_class_; }
+    bool use_regular_non_max_suppression() const noexcept { return use_regular_non_max_suppression_; }
+    float nms_score_threshold() const noexcept { return nms_score_threshold_; }
+    float nms_iou_threshold() const noexcept { return nms_iou_threshold_; }
+    int32_t num_classes() const noexcept { return num_classes_; }
+    float y_scale() const noexcept { return y_scale_; }
+    float x_scale() const noexcept { return x_scale_; }
+    float h_scale() const noexcept { return h_scale_; }
+    float w_scale() const noexcept { return w_scale_; }
+
+    // Defined out of line. Takes the three input shapes, the four output
+    // shapes, then the attributes in the same order as the getters above.
+    tflite_detection_postprocess(
+        shape_t boxes_shape, shape_t scores_shape, shape_t anchors_shape,
+        shape_t output_shape_0, shape_t output_shape_1, shape_t output_shape_2, shape_t output_shape_3,
+        int32_t max_detections,
+        int32_t max_classes_per_detection,
+        int32_t detections_per_class,
+        bool use_regular_non_max_suppression,
+        float nms_score_threshold,
+        float nms_iou_threshold,
+        int32_t num_classes,
+        float y_scale, float x_scale, float h_scale, float w_scale);
+
+protected:
+    // Overrides the node hook; which attributes it compares is defined in the
+    // implementation.
+    bool properties_equal(node &other) const override;
+
+private:
+    int32_t max_detections_;
+    int32_t max_classes_per_detection_;
+    int32_t detections_per_class_;
+    bool use_regular_non_max_suppression_;
+    float nms_score_threshold_;
+    float nms_iou_threshold_;
+    int32_t num_classes_;
+    float y_scale_;
+    float x_scale_;
+    float h_scale_;
+    float w_scale_;
+};
+}
diff --git a/include/nncase/ir/quantizer.h b/include/nncase/ir/quantizer.h
index dcf8147619..39a83243b1 100644
--- a/include/nncase/ir/quantizer.h
+++ b/include/nncase/ir/quantizer.h
@@ -100,8 +100,8 @@ class NNCASE_API quantizer
             auto r = range.max - range.min;
             if (r == 0)
                 r = 0.1f;
-            else if (r < 0.01f)
-                r = 0.01f;
+            // else if (r < 0.01f)
+            //     r = 0.01f;
             range.max = range.min + r;
         }
 
diff --git a/include/nncase/kernels/cpu/optimized/tensor_compute.h b/include/nncase/kernels/cpu/optimized/tensor_compute.h
index 2b19afea12..89b81f34f1 100644
--- a/include/nncase/kernels/cpu/optimized/tensor_compute.h
+++ b/include/nncase/kernels/cpu/optimized/tensor_compute.h
@@ -85,4 +85,19 @@ template <typename T>
 NNCASE_API result<void> sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
     const runtime_shape_t &out_strides) noexcept;
 
+template <typename T>
+NNCASE_API result<void> instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept;
+
+template <typename T>
+NNCASE_API result<void> layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept;
+
+template <typename T>
+NNCASE_API result<void> ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+template <typename T>
+result<void> reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
 END_NS_NNCASE_KERNELS_CPU_OPT
diff --git a/include/nncase/kernels/cpu/reference/tensor_compute.h b/include/nncase/kernels/cpu/reference/tensor_compute.h
index aba8774ec9..bc5c3c14db 100644
--- a/include/nncase/kernels/cpu/reference/tensor_compute.h
+++ b/include/nncase/kernels/cpu/reference/tensor_compute.h
@@ -13,140 +13,255 @@
  * limitations under the License.
  */
 #pragma once
+
 #include "runtime_types.h"
 #include <nncase/kernels/kernel_context.h>
 
 BEGIN_NS_NNCASE_KERNELS_CPU_REF
 
-NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+NNCASE_API result<void> batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape,
+    const runtime_shape_t &block_shape,
+    const runtime_paddings_t &crops,
+    const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context) noexcept;
+NNCASE_API result<void>
+broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_shape,
+    const runtime_shape_t &out_strides, kernel_context &context) noexcept;
 
-NNCASE_API result<void> concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output, const runtime_shape_t &out_shape,
-    gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims,
+NNCASE_API result<void>
+concat(datatype_t type, gsl::span<const gsl::byte *const> inputs, gsl::byte *output,
+    const runtime_shape_t &out_shape,
+    gsl::span<const runtime_shape_t> in_strides, const runtime_shape_t &out_strides, size_t axis,
+    const runtime_shape_t &concat_dims,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
-    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept;
+NNCASE_API result<void>
+convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides, kernel_context &context) noexcept;
 
 NNCASE_API result<void> copy(datatype_t type, const gsl::byte *src, gsl::byte *dest,
-    const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context) noexcept;
+    const runtime_shape_t &shape, const runtime_shape_t &src_strides,
+    const runtime_shape_t &dest_strides, kernel_context &context) noexcept;
 
-NNCASE_API result<void> transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept;
+NNCASE_API result<void>
+transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &perm, const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides, kernel_context &context) noexcept;
 
 template <typename T>
 NNCASE_API result<void> binary(binary_op_t op, const T *input_a, const T *input_b, T *output,
-    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
-    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, value_range<float> fused_activation, kernel_context &context) noexcept;
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides,
+    const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape,
+    const runtime_shape_t &out_strides, value_range<float> fused_activation,
+    kernel_context &context) noexcept;
 
-NNCASE_API result<void> dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
-    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
+NNCASE_API result<void>
+dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides, float scale, float bias,
     kernel_context &context) noexcept;
 
 template <typename T>
 NNCASE_API result<void> compare(compare_op_t op, const T *input_a, const T *input_b, bool *output,
     const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides,
     const runtime_shape_t &in_b_shape, const runtime_shape_t &in_b_strides,
-    const runtime_shape_t &out_shape, const runtime_shape_t &out_strides) noexcept;
+    const runtime_shape_t &out_shape,
+    const runtime_shape_t &out_strides) noexcept;
 
-NNCASE_API result<void> lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept;
+NNCASE_API result<void>
+lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output,
+    const runtime_shape_t &shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min,
+    const scalar &max) noexcept;
 
 template <typename T>
 NNCASE_API result<void> matmul(const T *input_a, const T *input_b, const T *bias, T *output,
-    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
-    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides,
+    const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape,
+    const runtime_shape_t &out_strides,
     value_range<float> fused_activation) noexcept;
 
-NNCASE_API result<void> onehot(datatype_t type, const int32_t *indices, gsl::byte *output, const runtime_shape_t &indices_shape, const runtime_shape_t &out_shape,
-    const runtime_shape_t &out_strides, gsl::byte *depth, gsl::byte *off_value, gsl::byte *on_value, size_t axis, onehot_mode_t mode, kernel_context &context) noexcept;
+NNCASE_API result<void>
+onehot(datatype_t type, const int32_t *indices, gsl::byte *output, const runtime_shape_t &indices_shape,
+    const runtime_shape_t &out_shape,
+    const runtime_shape_t &out_strides, gsl::byte *depth, gsl::byte *off_value, gsl::byte *on_value,
+    size_t axis, onehot_mode_t mode, kernel_context &context) noexcept;
 
-NNCASE_API result<void> pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
+NNCASE_API result<void>
+pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+    const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
-    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias,
+NNCASE_API result<void>
+quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides, float scale, float bias,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept;
+NNCASE_API result<void>
+unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+    kernel_context &context) noexcept;
 
 template <typename T>
-NNCASE_API result<void> reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+NNCASE_API result<void>
+reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims,
+    kernel_context &context) noexcept;
 
 template <typename T>
-NNCASE_API result<void> reduce_arg(reduce_arg_op_t op, const float *input, T *output, const runtime_shape_t &in_shape,
+NNCASE_API result<void>
+reduce_arg(reduce_arg_op_t op, const float *input, T *output, const runtime_shape_t &in_shape,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
-    const runtime_shape_t &axis, bool keep_dims, bool select_last_idx, kernel_context &context) noexcept;
+    const runtime_shape_t &axis, bool keep_dims, bool select_last_idx,
+    kernel_context &context) noexcept;
 
 template <typename T>
 result<void> reduce_prod(const T *input, T *output, const runtime_shape_t &in_shape,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
     const runtime_shape_t &axes, bool keep_dims) noexcept;
 
-NNCASE_API result<void> resize_bilinear(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers,
+NNCASE_API result<void> resize_bilinear(datatype_t type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape,
+    const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides, int32_t out_h,
+    int32_t out_w, bool align_corners, bool half_pixel_centers,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> resize_nearest_neighbor(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers,
+NNCASE_API result<void>
+resize_nearest_neighbor(datatype_t type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+    int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_axis_t &ends, const runtime_axis_t &strides,
+NNCASE_API result<void>
+slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+    const runtime_shape_t &begins, const runtime_axis_t &ends, const runtime_axis_t &strides,
     kernel_context &context) noexcept;
 
-NNCASE_API result<void> gather(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, const runtime_shape_t &out_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, const runtime_shape_t &indices_shape, size_t axis, kernel_context &context) noexcept;
+NNCASE_API result<void>
+gather(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &out_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices,
+    const runtime_shape_t &indices_shape, size_t axis, kernel_context &context) noexcept;
 
-NNCASE_API result<void> gather_nd(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, const runtime_shape_t &out_shape,
-    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, const runtime_shape_t &indices_shape, size_t batch_dims, kernel_context &context) noexcept;
+NNCASE_API result<void>
+gather_nd(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &out_shape,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices,
+    const runtime_shape_t &indices_shape, size_t batch_dims, kernel_context &context) noexcept;
 
 template <typename T>
 NNCASE_API result<void> cumsum(const T *input, T *output, const runtime_shape_t &in_shape,
     int32_t axis, bool exclusive, bool reverse) noexcept;
 
 template <typename T>
-NNCASE_API result<void> hardmax(const T *input, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+NNCASE_API result<void>
+hardmax(const T *input, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
     T *output, int32_t axis) noexcept;
 
 template <typename T>
-NNCASE_API result<void> random_normal(T *output, const runtime_shape_t &out_shape, float mean, float std, float seed) noexcept;
+NNCASE_API result<void>
+random_normal(T *output, const runtime_shape_t &out_shape, float mean, float std, float seed) noexcept;
 
 template <typename T>
-NNCASE_API result<void> random_uniform(T *output, const runtime_shape_t &out_shape, float low, float high, float seed) noexcept;
+NNCASE_API result<void>
+random_uniform(T *output, const runtime_shape_t &out_shape, float low, float high, float seed) noexcept;
 
 template <typename T>
-NNCASE_API result<void> roi_align(const T *input, const T *rois, int64_t *batch_indices, T *output, const runtime_shape_t &in_shape,
-    const runtime_shape_t &out_shape, roi_align_mode_t mode, float spatial_scale, int64_t sampling_ratio) noexcept;
+NNCASE_API result<void> roi_align(const T *input, const T *rois, int64_t *batch_indices, T *output,
+    const runtime_shape_t &in_shape,
+    const runtime_shape_t &out_shape, roi_align_mode_t mode,
+    float spatial_scale, int64_t sampling_ratio) noexcept;
 
 template <typename T>
-NNCASE_API result<void> sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides) noexcept;
+NNCASE_API result<void>
+sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides) noexcept;
 
 template <typename T>
-NNCASE_API result<void> softmax(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
+NNCASE_API result<void>
+softmax(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
     const runtime_shape_t &out_strides, int32_t axis, float beta) noexcept;
 
 template <typename T>
 NNCASE_API result<void> ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
-    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
-    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides,
+    const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape,
+    const runtime_shape_t &in_c_strides,
     const runtime_shape_t &out_strides) noexcept;
 
 template <typename T>
 NNCASE_API result<void> topk(const T *input, T *output_values, int64_t *output_indices,
     const runtime_shape_t &in_shape, const runtime_shape_t &in_strides,
-    const runtime_shape_t &output_values_shape, const runtime_shape_t &output_values_strides,
-    const runtime_shape_t &output_indices_shape, const runtime_shape_t &output_indices_strides,
-    const int64_t k, const int32_t axis, const bool largest, const bool sorted) noexcept;
+    const runtime_shape_t &output_values_shape,
+    const runtime_shape_t &output_values_strides,
+    const runtime_shape_t &output_indices_shape,
+    const runtime_shape_t &output_indices_strides,
+    const int64_t k, const int32_t axis, const bool largest,
+    const bool sorted) noexcept;
+
+template <typename T>
+NNCASE_API result<void>
+trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper,
+    const int64_t k) noexcept;
+
+template <typename T>
+NNCASE_API result<void>
+gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h,
+    const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept;
+
+template <typename T>
+NNCASE_API result<void>
+tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations,
+    T *output_classes, T *output_scores, T *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape,
+    const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection,
+    const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression,
+    const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale,
+    const float h_scale, const float w_scale) noexcept;
+
+NNCASE_API result<void> space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output,
+    const runtime_shape_t &in_shape,
+    const runtime_shape_t &block_shape,
+    const runtime_paddings_t &paddings,
+    const runtime_shape_t &in_strides,
+    const runtime_shape_t &out_strides,
+    kernel_context &context) noexcept;
+
+template <typename TI, typename TK>
+NNCASE_API result<void>
+gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept;
+
+template <typename T>
+NNCASE_API result<void>
+instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape,
+    float epsilon) noexcept;
+
+template <typename T>
+NNCASE_API result<void>
+layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis,
+    float epsilon) noexcept;
 
 template <typename T>
-NNCASE_API result<void> trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper, const int64_t k) noexcept;
+NNCASE_API result<void>
+compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape,
+    const runtime_shape_t &condition_shape, const int axis) noexcept;
 
 END_NS_NNCASE_KERNELS_CPU_REF
diff --git a/include/nncase/kernels/neutral/neutral_kernels.h b/include/nncase/kernels/neutral/neutral_kernels.h
index 2a1ee447d5..28add99dbb 100644
--- a/include/nncase/kernels/neutral/neutral_kernels.h
+++ b/include/nncase/kernels/neutral/neutral_kernels.h
@@ -816,4 +816,575 @@ inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTR
     for (size_t i = 0; i < size; i++)
         output[i] = table[input[i]];
 }
+
+// Reference GRU (gated recurrent unit) forward pass over a whole sequence, for one or two directions.
+//
+// Buffer layouts, as implied by the indexing in the body:
+//   input     [seq_length, batch_size, input_size]
+//   w         [num_direction, 3 * hidden_size, input_size]   gate blocks ordered z, r, h
+//   r         [num_direction, 3 * hidden_size, hidden_size]  gate blocks ordered z, r, h
+//   b         [num_direction, 6 * hidden_size]               input biases z, r, h, then recurrent biases z, r, h
+//   initial_h [num_direction, batch_size, hidden_size]       running state; overwritten in place
+//   output    [seq_length, num_direction, batch_size, hidden_size]  hidden state after every step
+//   output_h  [num_direction, batch_size, hidden_size]       hidden state after the last processed step
+// Sizes are read from input_shape[0..2], w_shape[0] (num_direction) and w_shape[1] / 3 (hidden_size).
+// Gate pre-activations are accumulated in float regardless of T.
+//
+// mode == lstm_direction::kReverse makes the first direction walk the sequence backwards; a second
+// direction (d == 1) always walks it in the opposite order to the first.
+//
+// linear_before_reset selects how the reset gate r enters the candidate state h~:
+//   false (default, the only behaviour before this flag existed): h~ = tanh(x*Wh + Wbh + (r . h_prev)*Rh + Rbh)
+//   true:                                                          h~ = tanh(x*Wh + Wbh + r . (h_prev*Rh + Rbh))
+//
+// TShape is not used by the body and cannot be deduced, so callers have to name it explicitly.
+template <class T, class TShape>
+void gru(const T *CXX_RESTRICT input, const T *CXX_RESTRICT w, const T *CXX_RESTRICT r, const T *CXX_RESTRICT b, T *CXX_RESTRICT initial_h, T *CXX_RESTRICT output, T *CXX_RESTRICT output_h,
+    const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset = false)
+{
+    const int seq_length = input_shape[0];
+    const int batch_size = input_shape[1];
+    const int input_size = input_shape[2];
+    const int num_direction = w_shape[0];
+    const int hidden_size = w_shape[1] / 3;
+
+    // Gate activations.
+    auto sigmoid_fn = [](float x) {
+        return 1 / (1 + std::exp(-x));
+    };
+    auto tanh_fn = [](float x) {
+        return std::tanh(x);
+    };
+
+    // Element counts of one time step (x, h) and of one direction (w, r) in their buffers.
+    const auto x_gate_size = batch_size * input_size;
+    const auto w_gate_size = 3 * hidden_size * input_size;
+    const auto h_t_size = batch_size * hidden_size;
+    const auto r_gate_size = 3 * hidden_size * hidden_size;
+
+    // Per-step gate buffers. Every element is rewritten before it is read, so they need no per-step reset.
+    auto gate_z = std::vector<float>(h_t_size, 0.f);
+    auto gate_r = std::vector<float>(h_t_size, 0.f);
+    auto gate_h = std::vector<float>(h_t_size, 0.f);
+
+    // Order in which the time steps are visited by the current direction.
+    std::vector<int> seq_len_loop;
+    for (int l = 0; l < seq_length; l++)
+        seq_len_loop.push_back(l);
+    if (mode == lstm_direction::kReverse)
+        std::reverse(seq_len_loop.begin(), seq_len_loop.end());
+
+    for (int d = 0; d < num_direction; d++)
+    {
+        // Slices of the state, weights and biases that belong to direction d.
+        T *h_t = initial_h + d * h_t_size;
+        const T *w_i = w + d * w_gate_size;
+        const T *r_i = r + d * r_gate_size;
+        const T *b_i = b + d * 6 * hidden_size;
+        if (d == 1)
+            std::reverse(seq_len_loop.begin(), seq_len_loop.end());
+
+        // One GRU step per visited time index.
+        for (auto i : seq_len_loop)
+        {
+            const T *x_i = input + i * x_gate_size;
+
+            // Pre-activation of gate g (0 = z, 1 = r): x_i * W_g + Wb_g + h_t * R_g + Rb_g.
+            auto affine_gate = [&](int g, std::vector<float> &dst) {
+                const T *w_g = w_i + g * hidden_size * input_size;
+                const T *r_g = r_i + g * hidden_size * hidden_size;
+                for (int bs = 0; bs < batch_size; bs++)
+                {
+                    for (int hs = 0; hs < hidden_size; hs++)
+                    {
+                        float from_x = 0.f;
+                        for (int is = 0; is < input_size; is++)
+                        {
+                            from_x += x_i[bs * input_size + is] * w_g[hs * input_size + is];
+                        }
+                        from_x += b_i[g * hidden_size + hs];
+                        float from_h = 0.f;
+                        for (int rs = 0; rs < hidden_size; rs++)
+                        {
+                            from_h += h_t[bs * hidden_size + rs] * r_g[hs * hidden_size + rs];
+                        }
+                        from_h += b_i[(3 + g) * hidden_size + hs];
+                        dst[bs * hidden_size + hs] = from_x + from_h;
+                    }
+                }
+            };
+
+            // Update gate z and reset gate r.
+            affine_gate(0, gate_z);
+            std::transform(gate_z.begin(), gate_z.end(), gate_z.begin(), sigmoid_fn);
+            affine_gate(1, gate_r);
+            std::transform(gate_r.begin(), gate_r.end(), gate_r.begin(), sigmoid_fn);
+
+            // Candidate state h~ (gate block 2); linear_before_reset decides whether r scales h_prev
+            // before the recurrent product or scales the finished recurrent term.
+            const T *w_h = w_i + 2 * hidden_size * input_size;
+            const T *r_h = r_i + 2 * hidden_size * hidden_size;
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    float from_x = 0.f;
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        from_x += x_i[bs * input_size + is] * w_h[hs * input_size + is];
+                    }
+                    from_x += b_i[2 * hidden_size + hs];
+                    float from_h = 0.f;
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        if (linear_before_reset)
+                            from_h += h_t[bs * hidden_size + rs] * r_h[hs * hidden_size + rs];
+                        else // same expression and evaluation order as before the flag was added
+                            from_h += gate_r[bs * hidden_size + rs] * h_t[bs * hidden_size + rs] * r_h[hs * hidden_size + rs];
+                    }
+                    from_h += b_i[5 * hidden_size + hs];
+                    // In the linear variant r multiplies the whole recurrent term, recurrent bias included.
+                    if (linear_before_reset)
+                        from_h *= gate_r[bs * hidden_size + hs];
+                    gate_h[bs * hidden_size + hs] = from_x + from_h;
+                }
+            }
+            std::transform(gate_h.begin(), gate_h.end(), gate_h.begin(), tanh_fn);
+
+            // h_t = (1 - z) * h~ + z * h_t. h_t points into initial_h, so the caller's buffer holds the running state.
+            for (int k = 0; k < h_t_size; k++)
+            {
+                h_t[k] = (1 - gate_z[k]) * gate_h[k] + gate_z[k] * h_t[k];
+                output[i * (num_direction * batch_size * hidden_size) + d * (batch_size * hidden_size) + k] = h_t[k];
+            }
+        }
+
+        // Publish the final state of this direction.
+        for (int k = 0; k < h_t_size; k++)
+        {
+            output_h[d * (batch_size * hidden_size) + k] = h_t[k];
+        }
+    }
+}
+
+template <class T, class TShape>
+void tflite_detection_postprocess(const T *CXX_RESTRICT boxes, const T *CXX_RESTRICT scores, const T *CXX_RESTRICT anchors, T *CXX_RESTRICT output_locations, T *CXX_RESTRICT output_classes, T *CXX_RESTRICT output_scores, T *CXX_RESTRICT output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, NNCASE_UNUSED const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale)
+{
+    struct CenterSizeEncoding
+    {
+        float y;
+        float x;
+        float h;
+        float w;
+    };
+    struct BoxCornerEncoding
+    {
+        float ymin;
+        float xmin;
+        float ymax;
+        float xmax;
+    };
+    struct BoxInfo
+    {
+        int index;
+        float score;
+    };
+
+    auto compute_iou = [&](const std::vector<BoxCornerEncoding> &box, const int &i, const int &j) {
+        auto &box_i = box[i];
+        auto &box_j = box[j];
+        const float area_i = (box_i.ymax - box_i.ymin) * (box_i.xmax - box_i.xmin);
+        const float area_j = (box_j.ymax - box_j.ymin) * (box_j.xmax - box_j.xmin);
+        if (area_i <= 0 || area_j <= 0)
+            return 0.f;
+        const float intersection_y_min = std::max<float>(box_i.ymin, box_j.ymin);
+        const float intersection_x_min = std::max<float>(box_i.xmin, box_j.xmin);
+        const float intersection_y_max = std::min<float>(box_i.ymax, box_j.ymax);
+        const float intersection_x_max = std::min<float>(box_i.xmax, box_j.xmax);
+        const float intersection_area = std::max<float>(intersection_y_max - intersection_y_min, 0.0) * std::max<float>(intersection_x_max - intersection_x_min, 0.0);
+        return intersection_area / (area_i + area_j - intersection_area);
+    };
+
+    const auto num_boxes = (int)anchors_shape[0];
+    const auto num_classes_with_background = (int)scores_shape[2]; // num_classes + background
+    const auto num_detections_per_class = std::min(detections_per_class, max_detections);
+    int label_offset = num_classes_with_background - num_classes;
+    // DecodeCenterSizeBoxes： get decoded_boxes
+    std::vector<BoxCornerEncoding> decoded_boxes(boxes_shape[1]);
+    {
+
+        CenterSizeEncoding box_center_size;
+        CenterSizeEncoding scale_values { y_scale, x_scale, h_scale, w_scale };
+        CenterSizeEncoding anchor;
+
+        for (int index = 0; index < num_boxes; index++)
+        {
+            const auto box_encoding_index = index * boxes_shape[2];
+            box_center_size = *reinterpret_cast<const CenterSizeEncoding *>(boxes + box_encoding_index);
+            anchor = *reinterpret_cast<const CenterSizeEncoding *>(anchors + box_encoding_index);
+
+            auto y_center = static_cast<float>(static_cast<double>(box_center_size.y) / static_cast<double>(scale_values.y) * static_cast<double>(anchor.h) + static_cast<double>(anchor.y));
+            auto x_center = static_cast<float>(static_cast<double>(box_center_size.x) / static_cast<double>(scale_values.x) * static_cast<double>(anchor.w) + static_cast<double>(anchor.x));
+            auto half_h = static_cast<float>(0.5 * (std::exp(static_cast<double>(box_center_size.h) / static_cast<double>(scale_values.h))) * static_cast<double>(anchor.h));
+            auto half_w = static_cast<float>(0.5 * (std::exp(static_cast<double>(box_center_size.w) / static_cast<double>(scale_values.w))) * static_cast<double>(anchor.w));
+            decoded_boxes[index].ymin = y_center - half_h;
+            decoded_boxes[index].xmin = x_center - half_w;
+            decoded_boxes[index].ymax = y_center + half_h;
+            decoded_boxes[index].xmax = x_center + half_w;
+        }
+    }
+    // NMS MultiClass
+    {
+        if (use_regular_non_max_suppression)
+        {
+            // NMS Regular
+            int sorted_indices_size = 0;
+            std::vector<BoxInfo> box_info_after_regular_nms(max_detections + num_detections_per_class);
+            std::vector<int> num_selected(num_classes);
+
+            // compute nms
+            std::vector<float> class_scores(num_boxes);
+            std::vector<int> selected;
+            selected.reserve(num_detections_per_class);
+
+            for (auto col = 0; col < num_classes - 1; col++)
+            {
+                const float *scores_base = scores + col + label_offset;
+                for (int row = 0; row < num_boxes; row++)
+                {
+                    // Get scores of boxes corresponding to all anchors for single class
+                    class_scores[row] = *scores_base;
+                    scores_base += num_classes_with_background;
+                }
+                // Perform non-maximal suppression on single class
+                selected.clear();
+
+                // NMS SingleClass
+                {
+                    std::vector<int> keep_indices;
+                    std::vector<float> keep_scores;
+                    // select detection box score above score threshold
+                    {
+                        for (size_t i = 0; i < class_scores.size(); i++)
+                        {
+                            if (class_scores[i] >= nms_score_threshold)
+                            {
+                                keep_scores.emplace_back(class_scores[i]);
+                                keep_indices.emplace_back(i);
+                            }
+                        }
+                    }
+
+                    int num_scores_kept = (int)keep_scores.size();
+                    std::vector<int> sorted_indices;
+                    sorted_indices.resize(num_scores_kept);
+                    // DecreasingArgSort
+                    {
+                        std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0);
+                        std::stable_sort(
+                            sorted_indices.begin(), sorted_indices.begin() + num_scores_kept,
+                            [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; });
+                    }
+
+                    const int output_size = std::min(num_scores_kept, max_detections);
+                    selected.clear();
+                    int num_active_candidate = num_scores_kept;
+                    std::vector<uint8_t> active_box_candidate(num_scores_kept, 1);
+                    for (int i = 0; i < num_scores_kept; ++i)
+                    {
+                        if (num_active_candidate == 0 || (int)selected.size() >= output_size)
+                            break;
+                        if (active_box_candidate[i] == 1)
+                        {
+                            selected.push_back(keep_indices[sorted_indices[i]]);
+                            active_box_candidate[i] = 0;
+                            num_active_candidate--;
+                        }
+                        else
+                        {
+                            continue;
+                        }
+                        for (int j = i + 1; j < num_scores_kept; ++j)
+                        {
+                            if (active_box_candidate[j] == 1)
+                            {
+
+                                float iou = compute_iou(
+                                    decoded_boxes, keep_indices[sorted_indices[i]],
+                                    keep_indices[sorted_indices[j]]);
+
+                                if (iou > nms_iou_threshold)
+                                {
+                                    active_box_candidate[j] = 0;
+                                    num_active_candidate--;
+                                }
+                            }
+                        }
+                    }
+                }
+                // end NMS SingleClass
+
+                if (selected.empty())
+                {
+                    continue;
+                }
+                for (size_t i = 0; i < selected.size(); ++i)
+                {
+                    box_info_after_regular_nms[sorted_indices_size + i].score = class_scores[selected[i]];
+                    box_info_after_regular_nms[sorted_indices_size + i].index = (selected[i] * num_classes_with_background + col + label_offset);
+                }
+
+                // In-place merge the original boxes and new selected boxes which are both
+                // sorted by scores.
+                std::inplace_merge(box_info_after_regular_nms.begin(), box_info_after_regular_nms.begin() + sorted_indices_size,
+                    box_info_after_regular_nms.begin() + sorted_indices_size + selected.size(),
+                    [](const BoxInfo &a, const BoxInfo &b) { return a.score >= b.score; });
+
+                sorted_indices_size = std::min(sorted_indices_size + static_cast<int>(selected.size()), max_detections);
+            }
+            // end compute nms result
+
+            // Allocate output tensors
+            for (int output_box_index = 0; output_box_index < max_detections; output_box_index++)
+            {
+                if (output_box_index < sorted_indices_size)
+                {
+                    const int anchor_index = floor(
+                        box_info_after_regular_nms[output_box_index].index / num_classes_with_background);
+                    const int class_index = box_info_after_regular_nms[output_box_index].index - anchor_index * num_classes_with_background - label_offset;
+                    const float selected_score = box_info_after_regular_nms[output_box_index].score;
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[output_box_index] = decoded_boxes[anchor_index];
+                    // detection_classes
+                    output_classes[output_box_index] = class_index;
+                    // detection_scores
+                    output_scores[output_box_index] = selected_score;
+                }
+                else
+                {
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[output_box_index] = { 0.0f, 0.0f, 0.0f, 0.0f };
+                    // detection_classes
+                    output_classes[output_box_index] = 0.0f;
+                    // detection_scores
+                    output_scores[output_box_index] = 0.0f;
+                }
+            }
+            output_num_detections[0] = sorted_indices_size;
+            box_info_after_regular_nms.clear();
+        }
+        else
+        {
+            // Fast NMS
+
+            const int max_categories_per_anchor = max_classes_per_detection;
+            const int num_categories_per_anchor = std::min(max_categories_per_anchor, num_classes);
+
+            std::vector<float> max_scores;
+            max_scores.resize(num_boxes);
+            std::vector<int> sorted_class_indices;
+            sorted_class_indices.resize(num_boxes * num_categories_per_anchor);
+
+            for (int row = 0; row < num_boxes; row++)
+            {
+                const float *box_scores = scores + row * num_classes_with_background + label_offset;
+                int *class_indices = sorted_class_indices.data() + row * num_categories_per_anchor;
+
+                // DecreasingPartialArgSort
+                if (num_categories_per_anchor == 1)
+                {
+                    auto arg_max_vector = [&](const T *input_data, int size) {
+                        T max_value = input_data[0];
+                        int max_index = 0;
+                        for (int i = 1; i < size; ++i)
+                        {
+                            // const T curr_value = input_data[i];
+                            if (input_data[i] > max_value)
+                            {
+                                max_value = input_data[i];
+                                max_index = i;
+                            }
+                        }
+                        return max_index;
+                    };
+                    class_indices[0] = arg_max_vector(box_scores, num_classes);
+                }
+                else
+                {
+                    std::iota(class_indices, class_indices + num_classes, 0);
+                    std::partial_sort(
+                        class_indices, class_indices + num_categories_per_anchor, class_indices + num_classes,
+                        [&box_scores](const int i, const int j) { return box_scores[i] > box_scores[j]; });
+                }
+                // end DecreasingPartialArgSort
+
+                max_scores[row] = box_scores[class_indices[0]];
+            }
+            std::vector<int> selected;
+            // NMS SingleClass
+            {
+                std::vector<int> keep_indices;
+                std::vector<float> keep_scores;
+                // select detection box score above score threshold
+                {
+                    for (size_t i = 0; i < max_scores.size(); i++)
+                    {
+                        if (max_scores[i] >= nms_score_threshold)
+                        {
+                            keep_scores.emplace_back(max_scores[i]);
+                            keep_indices.emplace_back(i);
+                        }
+                    }
+                }
+
+                int num_scores_kept = (int)keep_scores.size();
+                std::vector<int> sorted_indices;
+                sorted_indices.resize(num_scores_kept);
+                // DecreasingArgSort
+                {
+                    std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0);
+                    std::stable_sort(
+                        sorted_indices.begin(), sorted_indices.begin() + num_scores_kept,
+                        [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; });
+                }
+                const int output_size = std::min(num_scores_kept, max_detections);
+                selected.clear();
+                int num_active_candidate = num_scores_kept;
+                std::vector<uint8_t> active_box_candidate(num_scores_kept, 1);
+                for (int i = 0; i < num_scores_kept; ++i)
+                {
+                    if (num_active_candidate == 0 || (int)selected.size() >= output_size)
+                        break;
+                    if (active_box_candidate[i] == 1)
+                    {
+                        selected.push_back(keep_indices[sorted_indices[i]]);
+                        active_box_candidate[i] = 0;
+                        num_active_candidate--;
+                    }
+                    else
+                    {
+                        continue;
+                    }
+                    for (int j = i + 1; j < num_scores_kept; ++j)
+                    {
+                        if (active_box_candidate[j] == 1)
+                        {
+
+                            float iou = compute_iou(
+                                decoded_boxes, keep_indices[sorted_indices[i]],
+                                keep_indices[sorted_indices[j]]);
+                            if (iou > nms_iou_threshold)
+                            {
+                                active_box_candidate[j] = 0;
+                                num_active_candidate--;
+                            }
+                        }
+                    }
+                }
+            }
+            // end NMS SingleClass
+
+            // Allocate output tensors
+            int output_box_index = 0;
+            for (const auto &selected_index : selected)
+            {
+                const float *box_scores = scores + selected_index * num_classes_with_background + label_offset;
+                const int *class_indices = sorted_class_indices.data() + selected_index * num_categories_per_anchor;
+
+                for (int col = 0; col < num_categories_per_anchor; ++col)
+                {
+                    int box_offset = max_categories_per_anchor * output_box_index + col;
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[box_offset] = decoded_boxes[selected_index];
+                    // detection_classes
+                    output_classes[box_offset] = class_indices[col];
+                    // detection_scores
+                    output_scores[box_offset] = box_scores[class_indices[col]];
+                }
+                output_box_index++;
+            }
+            output_num_detections[0] = output_box_index;
+        }
+    }
+}
+
+// Layer normalization of a contiguous float tensor over its trailing dimensions [axis, rank):
+//   output = (input - mean) / sqrt(variance + epsilon) * scale + bias
+// input/output: outer_size groups of inner_size floats; output may be the same buffer as input (element i is read before it is written).
+// scale/bias:   inner_size floats each, reused for every group.
+// axis:         first normalized dimension; a negative value counts back from the last dimension.
+inline void layernorm(const float *input, float *output, float *scale, float *bias, runtime_shape_t in_shape, int32_t axis, float epsilon)
+{
+    const auto rank = static_cast<int32_t>(in_shape.size());
+    if (axis < 0)
+        axis += rank; // a negative axis previously left both sizes at 1, so only element 0 was processed
+
+    // Dimensions before axis enumerate the groups; the remaining ones are normalized together.
+    size_t outer_size = 1;
+    size_t inner_size = 1;
+    for (int32_t i = 0; i < rank; i++)
+        (i < axis ? outer_size : inner_size) *= in_shape[i];
+    const auto count = static_cast<float>(inner_size);
+
+    for (size_t batch = 0; batch < outer_size; batch++)
+    {
+        const float *src = input + batch * inner_size;
+        float *dest = output + batch * inner_size;
+
+        // Both statistics accumulate x / count term by term, keeping the same order of operations as before.
+        float mean = 0.f;
+        for (size_t i = 0; i < inner_size; i++)
+            mean += src[i] / count;
+
+        float variance = 0.f;
+        for (size_t i = 0; i < inner_size; i++)
+        {
+            const float centered = src[i] - mean;
+            variance += centered * centered / count;
+        }
+        const float stddev = std::sqrt(variance + epsilon);
+
+        // One fused pass replaces the three temporary std::vector<float> buffers that were allocated per group.
+        for (size_t i = 0; i < inner_size; i++)
+            dest[i] = (src[i] - mean) / stddev * scale[i] + bias[i];
+    }
+}
+
+// Compress: copies to output, in input order, the elements of input whose condition entry is non-zero.
+//   axis == rank of input: input is treated as flat and element i is kept when condition[i] != 0.
+//   otherwise:             an element is kept when condition[its index along axis] != 0.
+// Indices along axis at or beyond condition_shape[0] are dropped; output must have room for every kept element.
+template <class T>
+void compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis)
+{
+    const size_t condition_size = condition_shape[0];
+    if (axis == (int)input_shape.size())
+    {
+        // Flat mode reads input[0 .. condition_size); the caller must guarantee that many elements exist.
+        for (size_t i = 0; i < condition_size; i++)
+        {
+            if (condition[i] != 0)
+                *output++ = input[i];
+        }
+        return;
+    }
+
+    // Number of elements covered by one step along axis (product of the dimensions after it).
+    size_t select_slice = 1;
+    for (size_t i = static_cast<size_t>(axis) + 1; i < input_shape.size(); i++)
+        select_slice *= input_shape[i];
+    const size_t axis_span = select_slice * input_shape[axis];
+    const size_t total = kernels::detail::compute_size(input_shape); // hoisted: was re-evaluated on every iteration
+    for (size_t j = 0; j < total; j++)
+    {
+        // The former select_slice == 1 special case reduced to this same predicate, so a single check suffices.
+        const size_t cond_index = (j % axis_span) / select_slice;
+        if (cond_index < condition_size && condition[cond_index] != 0)
+            *output++ = input[j];
+    }
+}
 }
diff --git a/include/nncase/kernels/tensor_compute.h b/include/nncase/kernels/tensor_compute.h
index 0b139f0ebb..6f9c6c8a3d 100644
--- a/include/nncase/kernels/tensor_compute.h
+++ b/include/nncase/kernels/tensor_compute.h
@@ -152,4 +152,31 @@ NNCASE_API result<void> topk(const T *input, T *output_values, int64_t *output_i
 template <typename T>
 NNCASE_API result<void> trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper, const int64_t k) noexcept;
 
+template <typename T>
+NNCASE_API result<void> gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept;
+
+template <typename T>
+NNCASE_API result<void> tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept;
+
+NNCASE_API result<void> space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides,
+    kernel_context &context = default_kernel_context()) noexcept;
+
+template <typename TI, typename TK>
+NNCASE_API result<void> gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept;
+
+template <typename T>
+NNCASE_API result<void> instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept;
+
+template <typename T>
+NNCASE_API result<void> layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept;
+
+template <typename T>
+NNCASE_API result<void> compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept;
+
 END_NS_NNCASE_KERNELS
diff --git a/include/nncase/runtime/datatypes.h b/include/nncase/runtime/datatypes.h
index c42013e068..d894cb4100 100644
--- a/include/nncase/runtime/datatypes.h
+++ b/include/nncase/runtime/datatypes.h
@@ -258,7 +258,8 @@ typedef enum _unary_op
     unary_square,
     unary_tanh,
     unary_bitwise_not,
-    unary_logical_not
+    unary_logical_not,
+    unary_erf
 } unary_op_t;
 
 inline std::string unary_op_to_string(unary_op_t op)
@@ -301,6 +302,8 @@ inline std::string unary_op_to_string(unary_op_t op)
         return "unary_bitwise_not";
     case unary_logical_not:
         return "unary_logical_not";
+    case unary_erf:
+        return "unary_erf";
     }
     return "unknown";
 }
diff --git a/include/nncase/runtime/nnil.h b/include/nncase/runtime/nnil.h
index b4b0475f1e..5f690ca378 100644
--- a/include/nncase/runtime/nnil.h
+++ b/include/nncase/runtime/nnil.h
@@ -54,6 +54,7 @@ typedef enum _nnil_opcode
     nnil_min = 0x44,
     nnil_max = 0x45,
     nnil_pow = 0x46,
+    nnil_erf = 0x47,
     nnil_clamp = 0x80,
     nnil_ret = 0xA0
 } nnil_opcode_t;
diff --git a/include/nncase/runtime/runtime_op_utility.h b/include/nncase/runtime/runtime_op_utility.h
index 0399096d11..a284e7df9c 100644
--- a/include/nncase/runtime/runtime_op_utility.h
+++ b/include/nncase/runtime/runtime_op_utility.h
@@ -71,7 +71,7 @@ inline void adapt_strides(const shape_type &shape, strides_type &strides,
 
 template <class shape_type, class strides_type, class bs_ptr>
 inline std::size_t compute_strides(const shape_type &shape,
-    strides_type &strides, bs_ptr bs)
+    strides_type &strides, NNCASE_UNUSED bs_ptr bs)
 {
     using strides_value_type = typename std::decay_t<strides_type>::value_type;
     strides_value_type data_size = 1;
@@ -79,7 +79,7 @@ inline std::size_t compute_strides(const shape_type &shape,
     {
         strides[i - 1] = data_size;
         data_size = strides[i - 1] * static_cast<strides_value_type>(shape[i - 1]);
-        adapt_strides(shape, strides, bs, i - 1);
+        // adapt_strides(shape, strides, bs, i - 1);
     }
     return static_cast<std::size_t>(data_size);
 }
@@ -283,7 +283,7 @@ inline bool is_optimized_binary_op(binary_op_t op)
 
 inline bool is_optimized_unary_op(unary_op_t op)
 {
-    return op == unary_abs || op == unary_ceil || op == unary_cos || op == unary_exp || op == unary_floor || op == unary_log || op == unary_neg || op == unary_round || op == unary_rsqrt || op == unary_sign || op == unary_sin || op == unary_sqrt || op == unary_square || op == unary_tanh;
+    return op == unary_abs || op == unary_ceil || op == unary_cos || op == unary_exp || op == unary_floor || op == unary_log || op == unary_neg || op == unary_round || op == unary_sign || op == unary_sin || op == unary_sqrt || op == unary_square || op == unary_tanh;
 }
 
 template <class TShape>
@@ -300,6 +300,16 @@ bool is_optimized_input_shape(TShape in_shape, TShape out_shape)
     return false;
 }
 
+inline void get_gather_index(const std::vector<int> &per_axis_size, std::vector<int> &index, size_t i, int axis, int idx)
+{
+    (void)axis; // never read by the computation; the recursive form only forwarded it unchanged
+    for (; idx != (int)per_axis_size.size(); idx++) // appends one coordinate per entry of per_axis_size from idx onward
+    {
+        index.push_back(static_cast<int>(i / per_axis_size[idx])); // whole multiples of this entry that fit in the remaining offset
+        i -= index.back() * per_axis_size[idx]; // the remainder is carried on to the next entry
+    }
+}
+
 struct DefaultCallable
 {
 };
diff --git a/include/nncase/runtime/runtime_tensor.h b/include/nncase/runtime/runtime_tensor.h
index 7ef31f200e..1c7b8955ef 100644
--- a/include/nncase/runtime/runtime_tensor.h
+++ b/include/nncase/runtime/runtime_tensor.h
@@ -134,12 +134,12 @@ class NNCASE_API mapped_buffer
 typedef std::function<void(gsl::byte *)> data_deleter_t;
 
 NNCASE_API runtime_tensor_type &tensor_type() noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
-NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, bool copy, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
+NNCASE_API result<runtime_tensor> create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span<gsl::byte> data, data_deleter_t data_deleter, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept;
 NNCASE_API result<memory_pool_t> memory_pool(const runtime_tensor &tensor) noexcept;
 NNCASE_API result<mapped_buffer> map(runtime_tensor &tensor, map_access_t access) noexcept;
 NNCASE_API result<void> sync(runtime_tensor &tensor, sync_op_t op, bool force = false) noexcept;
diff --git a/include/nncase/runtime/small_vector.hpp b/include/nncase/runtime/small_vector.hpp
index 895a489dab..ece4eb7510 100644
--- a/include/nncase/runtime/small_vector.hpp
+++ b/include/nncase/runtime/small_vector.hpp
@@ -668,7 +668,6 @@ struct small_vector : Alloc
     iterator insert(const_iterator position, InputIterator first, InputIterator last)
     {
         auto pos = grow_at(position, last - first);
-        size_type i = 0;
         auto np = pos;
         for (auto p = first; p != last; ++p, ++np)
         {
diff --git a/include/nncase/runtime/stackvm/op_profile.h b/include/nncase/runtime/stackvm/op_profile.h
index 4b0df8a892..b70c82ed83 100644
--- a/include/nncase/runtime/stackvm/op_profile.h
+++ b/include/nncase/runtime/stackvm/op_profile.h
@@ -16,18 +16,44 @@
 #include <iostream>
 #include <unordered_map>
 
+#if defined(__riscv)
+
+#define RISCVFREQUENCY 1600000000
+
+static uint64_t k230_get_cycles() // returns the value produced by the RISC-V rdcycle instruction (cycle counter)
+{
+    uint64_t x;
+    __asm volatile("rdcycle %0;" // volatile: the read must not be removed or merged by the optimizer
+                   : "=r"(x)::); // "=r": x is a write-only output placed in a general-purpose register
+    return x;
+}
+#endif
+
 class op_profile
 {
 public:
     op_profile(const std::string &op_type = "op_profile")
         : op_type_(op_type)
     {
+#if defined(__riscv)
+        begin_ = k230_get_cycles();
+#else
         begin_ = clock();
+#endif
     }
+
     ~op_profile()
     {
+#if defined(__riscv)
+
+        end_ = k230_get_cycles();
+        auto cast_time = end_ - begin_;
+        // std::cout << "cpu op:" << op_type_ << " cast time:" << cast_time << " begin time:" << begin_ << " end time:" << end_ << " " << std::endl;
+#else
         end_ = clock();
         auto cast_time = (end_ - begin_) / (double)1000;
+        // std::cout << "cpu op:" << op_type_ << " cast time:" << cast_time << " begin time:" << begin_ << " end time:" << end_ << " " <<std::endl;
+#endif
         if (op_timing_.find(op_type_) == op_timing_.end())
         {
             op_timing_.emplace(op_type_, cast_time);
@@ -37,7 +63,8 @@ class op_profile
             op_timing_[op_type_] += cast_time;
         }
     }
-    void print();
+
+    static void print();
 
 public:
     static std::unordered_map<std::string, double> op_timing_;
diff --git a/include/nncase/runtime/stackvm/op_reader.h b/include/nncase/runtime/stackvm/op_reader.h
index 549d16c99e..5246ab4ea7 100644
--- a/include/nncase/runtime/stackvm/op_reader.h
+++ b/include/nncase/runtime/stackvm/op_reader.h
@@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00.
  *
  * Copyright 2019-2021 Canaan Inc.
  *
@@ -1614,6 +1614,24 @@ struct op_reader<tensor_softmax_op_t>
     }
 };
 
+template <> // op_reader specialization that decodes a tensor_space_to_batch_op_t (generated code, see the file header)
+struct op_reader<tensor_space_to_batch_op_t>
+{
+    tensor_space_to_batch_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_space_to_batch_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.datatype = static_cast<datatype_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to datatype_t
+        op.rshape_src = reader.read_unaligned<uint8_t>(); // this and the four operands below are each read as one uint8_t
+        op.rstride_src = reader.read_unaligned<uint8_t>();
+        op.rstride_dest = reader.read_unaligned<uint8_t>();
+        op.rshape_block = reader.read_unaligned<uint8_t>();
+        op.rpad_crops = reader.read_unaligned<uint8_t>();
+        return op;
+    }
+};
+
 template <>
 struct op_reader<tensor_ternary_op_t>
 {
@@ -1707,6 +1725,109 @@ struct op_reader<tensor_transpose_op_t>
     }
 };
 
+template <> // op_reader specialization that decodes a tensor_gru_op_t (generated code, see the file header)
+struct op_reader<tensor_gru_op_t>
+{
+    tensor_gru_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_gru_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.input_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.w_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.direction = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.linear_before_reset = reader.read_unaligned<bool>(); // read directly as bool
+        return op;
+    }
+};
+
+template <> // op_reader specialization that decodes a tensor_tflite_detection_postprocess_op_t (generated code, see the file header)
+struct op_reader<tensor_tflite_detection_postprocess_op_t>
+{
+    tensor_tflite_detection_postprocess_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_tflite_detection_postprocess_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.box_shape_src = reader.read_unaligned<uint8_t>(); // the three *_shape_src operands are each read as one uint8_t
+        op.score_shape_src = reader.read_unaligned<uint8_t>();
+        op.anchor_shape_src = reader.read_unaligned<uint8_t>();
+        op.max_detections = reader.read_unaligned<int32_t>(); // the three detection limits are each read as int32_t
+        op.max_classes_per_detection = reader.read_unaligned<int32_t>();
+        op.detections_per_class = reader.read_unaligned<int32_t>();
+        op.use_regular_non_max_suppression = reader.read_unaligned<bool>(); // read directly as bool
+        op.nms_score_threshold = reader.read_unaligned<float>(); // both NMS thresholds are read as float
+        op.nms_iou_threshold = reader.read_unaligned<float>();
+        op.num_classes = reader.read_unaligned<int32_t>(); // int32_t
+        op.y_scale = reader.read_unaligned<float>(); // the four scale factors are each read as float
+        op.x_scale = reader.read_unaligned<float>();
+        op.h_scale = reader.read_unaligned<float>();
+        op.w_scale = reader.read_unaligned<float>();
+        return op;
+    }
+};
+
+template <> // op_reader specialization that decodes a tensor_layer_normalization_op_t (generated code, see the file header)
+struct op_reader<tensor_layer_normalization_op_t>
+{
+    tensor_layer_normalization_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_layer_normalization_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.datatype = static_cast<datatype_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to datatype_t
+        op.input_shape = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.axis = reader.read_unaligned<int32_t>(); // int32_t
+        op.epsilon = reader.read_unaligned<float>(); // float
+        return op;
+    }
+};
+
+template <> // op_reader specialization that decodes a tensor_compress_op_t (generated code, see the file header)
+struct op_reader<tensor_compress_op_t>
+{
+    tensor_compress_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_compress_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.input_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.condition_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.axis = reader.read_unaligned<float>(); // NOTE(review): read as float, while the compress kernel takes `const int axis` and gather_elements reads its axis as int32_t - confirm against the ISA definition
+        return op;
+    }
+};
+
+template <> // op_reader specialization that decodes a tensor_gather_elements_op_t (generated code, see the file header)
+struct op_reader<tensor_gather_elements_op_t>
+{
+    tensor_gather_elements_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_gather_elements_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.input_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.indices_shape_src = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.axis = reader.read_unaligned<int32_t>(); // int32_t
+        return op;
+    }
+};
+
+template <> // op_reader specialization that decodes a tensor_instance_normalization_op_t (generated code, see the file header)
+struct op_reader<tensor_instance_normalization_op_t>
+{
+    tensor_instance_normalization_op_t operator()(span_reader &reader) const // fills op from successive reader.read_unaligned<T>() calls, in the order below
+    {
+        tensor_instance_normalization_op_t op(default_init);
+        op.opcode = static_cast<opcode_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to opcode_t
+        op.funct = static_cast<tensor_function_t>(reader.read_unaligned<uint16_t>()); // read as uint16_t, then cast to tensor_function_t
+        op.datatype = static_cast<datatype_t>(reader.read_unaligned<uint8_t>()); // read as uint8_t, then cast to datatype_t
+        op.input_shape = reader.read_unaligned<uint8_t>(); // uint8_t operand
+        op.epsilon = reader.read_unaligned<float>(); // float
+        return op;
+    }
+};
+
 class NNCASE_API op_visitor
 {
 public:
@@ -1842,11 +1963,18 @@ class NNCASE_API op_visitor
     virtual result<void> visit(NNCASE_UNUSED const tensor_sigmoid_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_slice_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_softmax_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_space_to_batch_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_ternary_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_topk_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_trilu_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_unary_op_t &op) noexcept { return ok(); }
     virtual result<void> visit(NNCASE_UNUSED const tensor_transpose_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_gru_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_tflite_detection_postprocess_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_layer_normalization_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_compress_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_gather_elements_op_t &op) noexcept { return ok(); }
+    virtual result<void> visit(NNCASE_UNUSED const tensor_instance_normalization_op_t &op) noexcept { return ok(); }
 
 protected:
     bool interrupted_;
diff --git a/include/nncase/runtime/stackvm/opcode.h b/include/nncase/runtime/stackvm/opcode.h
index 799cd62582..3263a7f0af 100644
--- a/include/nncase/runtime/stackvm/opcode.h
+++ b/include/nncase/runtime/stackvm/opcode.h
@@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 5/9/2023 5:18:43 PM +08:00.
  *
  * Copyright 2019-2021 Canaan Inc.
  *
@@ -161,6 +161,12 @@ enum class tensor_function_t
     TRANSPOSE = 0x0024,
     TRILU = 0x0025,
     UNARY = 0x0026,
+    GRU = 0x0027,
+    TFLITE_DETECTION_POSTPROCESS = 0x0028,
+    LAYER_NORMALIZATION = 0x0029,
+    COMPRESS = 0x002A,
+    GATHER_ELEMENTS = 0x002B,
+    INSTANCE_NORMALIZATION = 0x002C,
 };
 
 // Instructions
@@ -1754,6 +1760,24 @@ struct tensor_softmax_op_t
     }
 };
 
+struct tensor_space_to_batch_op_t // Operand record for the TENSOR / SPACE_TO_BATCH instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    datatype_t datatype;
+    uint8_t rshape_src; // NOTE(review): the r* fields look like shape/stride register indices — confirm against the interpreter
+    uint8_t rstride_src;
+    uint8_t rstride_dest;
+    uint8_t rshape_block;
+    uint8_t rpad_crops;
+
+    tensor_space_to_batch_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_space_to_batch_op_t(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::SPACE_TO_BATCH), datatype(datatype), rshape_src(rshape_src), rstride_src(rstride_src), rstride_dest(rstride_dest), rshape_block(rshape_block), rpad_crops(rpad_crops)
+    {
+    }
+};
+
 struct tensor_ternary_op_t
 {
     opcode_t opcode;
@@ -1847,4 +1871,107 @@ struct tensor_transpose_op_t
     }
 };
 
+struct tensor_gru_op_t // Operand record for the TENSOR / GRU instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    uint8_t input_shape_src; // NOTE(review): presumably the shape slot holding the input shape — confirm against the interpreter
+    uint8_t w_shape_src; // NOTE(review): presumably the shape slot holding the W shape — confirm against the interpreter
+    uint8_t direction;
+    bool linear_before_reset;
+
+    tensor_gru_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_gru_op_t(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::GRU), input_shape_src(input_shape_src), w_shape_src(w_shape_src), direction(direction), linear_before_reset(linear_before_reset)
+    {
+    }
+};
+
+struct tensor_tflite_detection_postprocess_op_t // Operand record for the TENSOR / TFLITE_DETECTION_POSTPROCESS instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    uint8_t box_shape_src; // NOTE(review): the *_shape_src fields are presumably shape slot indices — confirm against the interpreter
+    uint8_t score_shape_src;
+    uint8_t anchor_shape_src;
+    int32_t max_detections;
+    int32_t max_classes_per_detection;
+    int32_t detections_per_class;
+    bool use_regular_non_max_suppression;
+    float nms_score_threshold;
+    float nms_iou_threshold;
+    int32_t num_classes;
+    float y_scale;
+    float x_scale;
+    float h_scale;
+    float w_scale;
+
+    tensor_tflite_detection_postprocess_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_tflite_detection_postprocess_op_t(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::TFLITE_DETECTION_POSTPROCESS), box_shape_src(box_shape_src), score_shape_src(score_shape_src), anchor_shape_src(anchor_shape_src), max_detections(max_detections), max_classes_per_detection(max_classes_per_detection), detections_per_class(detections_per_class), use_regular_non_max_suppression(use_regular_non_max_suppression), nms_score_threshold(nms_score_threshold), nms_iou_threshold(nms_iou_threshold), num_classes(num_classes), y_scale(y_scale), x_scale(x_scale), h_scale(h_scale), w_scale(w_scale)
+    {
+    }
+};
+
+struct tensor_layer_normalization_op_t // Operand record for the TENSOR / LAYER_NORMALIZATION instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    datatype_t datatype;
+    uint8_t input_shape; // NOTE(review): presumably a shape slot index rather than a shape — confirm against the interpreter
+    int32_t axis;
+    float epsilon;
+
+    tensor_layer_normalization_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_layer_normalization_op_t(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::LAYER_NORMALIZATION), datatype(datatype), input_shape(input_shape), axis(axis), epsilon(epsilon)
+    {
+    }
+};
+
+struct tensor_compress_op_t // Operand record for the TENSOR / COMPRESS instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    uint8_t input_shape_src; // NOTE(review): presumably the shape slot holding the input shape — confirm against the interpreter
+    uint8_t condition_shape_src; // NOTE(review): presumably the shape slot holding the condition shape — confirm against the interpreter
+    float axis; // NOTE(review): axis is float here, whereas tensor_gather_elements_op_t and tensor_layer_normalization_op_t use int32_t — confirm this is intended in the IsaGen definition
+
+    tensor_compress_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_compress_op_t(uint8_t input_shape_src, uint8_t condition_shape_src, float axis) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::COMPRESS), input_shape_src(input_shape_src), condition_shape_src(condition_shape_src), axis(axis)
+    {
+    }
+};
+
+struct tensor_gather_elements_op_t // Operand record for the TENSOR / GATHER_ELEMENTS instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    uint8_t input_shape_src; // NOTE(review): presumably the shape slot holding the input shape — confirm against the interpreter
+    uint8_t indices_shape_src; // NOTE(review): presumably the shape slot holding the indices shape — confirm against the interpreter
+    int32_t axis;
+
+    tensor_gather_elements_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_gather_elements_op_t(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::GATHER_ELEMENTS), input_shape_src(input_shape_src), indices_shape_src(indices_shape_src), axis(axis)
+    {
+    }
+};
+
+struct tensor_instance_normalization_op_t // Operand record for the TENSOR / INSTANCE_NORMALIZATION instruction.
+{
+    opcode_t opcode;
+    tensor_function_t funct;
+    datatype_t datatype;
+    uint8_t input_shape; // NOTE(review): presumably a shape slot index rather than a shape — confirm against the interpreter
+    float epsilon;
+
+    tensor_instance_normalization_op_t(default_init_t) noexcept { } // tag constructor: performs no member initialization
+    explicit tensor_instance_normalization_op_t(datatype_t datatype, uint8_t input_shape, float epsilon) noexcept // fixes opcode/funct, copies the rest from the arguments
+        : opcode(opcode_t::TENSOR), funct(tensor_function_t::INSTANCE_NORMALIZATION), datatype(datatype), input_shape(input_shape), epsilon(epsilon)
+    {
+    }
+};
+
 END_NS_NNCASE_RT_MODULE
diff --git a/include/nncase/transforms/neutral/fix_output_shape.h b/include/nncase/transforms/neutral/fix_output_shape.h
new file mode 100644
index 0000000000..f44e304804
--- /dev/null
+++ b/include/nncase/transforms/neutral/fix_output_shape.h
@@ -0,0 +1,28 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../transform.h"
+
+namespace nncase::ir::transforms
+{
+class NNCASE_API tflite_detection_postprocess_transform : public transform // NOTE(review): declared in fix_output_shape.h, yet named after tflite_detection_postprocess — confirm the file/class naming is intended
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+}
diff --git a/include/nncase/transforms/neutral/fold_instancenorm.h b/include/nncase/transforms/neutral/fold_instancenorm.h
new file mode 100644
index 0000000000..f01ee2541c
--- /dev/null
+++ b/include/nncase/transforms/neutral/fold_instancenorm.h
@@ -0,0 +1,29 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../transform.h"
+
+namespace nncase::ir::transforms
+{
+class NNCASE_API fold_instancenorm_transform : public transform // NOTE(review): by its name, folds a subgraph into an instancenorm node — confirm against the definition
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool skip_self_contained_check() const noexcept override { return true; } // always opts out of the base class's self-contained check
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+}
diff --git a/include/nncase/transforms/neutral/fold_layernorm.h b/include/nncase/transforms/neutral/fold_layernorm.h
new file mode 100644
index 0000000000..37022a1886
--- /dev/null
+++ b/include/nncase/transforms/neutral/fold_layernorm.h
@@ -0,0 +1,52 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../transform.h"
+
+namespace nncase::ir::transforms
+{
+// Pattern 1 — tagged "youdao nmt"; NOTE(review): presumably the model this layernorm subgraph shape was taken from.
+class NNCASE_API fold_layernorm_pattern1_transform : public transform
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool skip_self_contained_check() const noexcept override { return true; } // always opts out of the base class's self-contained check
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+
+// Pattern 2 — tagged "daniu"; NOTE(review): presumably the model this layernorm subgraph shape was taken from.
+class NNCASE_API fold_layernorm_pattern2_transform : public transform
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool skip_self_contained_check() const noexcept override { return true; } // always opts out of the base class's self-contained check
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+
+// Pattern 3 — tagged "fastspeech"; NOTE(review): presumably the model this layernorm subgraph shape was taken from.
+class NNCASE_API fold_layernorm_pattern3_transform : public transform
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool skip_self_contained_check() const noexcept override { return true; } // always opts out of the base class's self-contained check
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+}
diff --git a/include/nncase/transforms/neutral/optimize_allocation.h b/include/nncase/transforms/neutral/optimize_allocation.h
index 43d064e4c6..c01b4a1f8b 100644
--- a/include/nncase/transforms/neutral/optimize_allocation.h
+++ b/include/nncase/transforms/neutral/optimize_allocation.h
@@ -71,6 +71,15 @@ class NNCASE_API add_copy_to_output_pass : public graph_pass
     void run_core(graph &graph, nncase::target &target, const run_pass_options &options) override;
 };
 
+class NNCASE_API add_copy_to_bitcast_pass : public graph_pass // Graph pass declared alongside add_copy_to_output_pass; NOTE(review): by its name it inserts copies in front of bitcast nodes — confirm against the definition
+{
+public:
+    using graph_pass::graph_pass; // inherits the graph_pass constructors
+
+protected:
+    void run_core(graph &graph, nncase::target &target, const run_pass_options &options) override; // declaration only; the definition is not in this header
+};
+
 class NNCASE_API remove_exclusive_copy_to_output_transform : public transform
 {
 public:
@@ -89,6 +98,15 @@ class NNCASE_API remove_exclusive_copy_to_concat_transform : public transform
     bool on_try_match(ir::node &node, transform_context &context) override;
 };
 
+class NNCASE_API remove_exclusive_copy_to_bitcast_transform : public transform // Sibling of the remove_exclusive_copy_to_output/concat transforms; NOTE(review): by its name it removes copies feeding bitcast nodes — confirm against the definition
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+
 class NNCASE_API remove_simple_copy_from_slice_transform : public transform
 {
 public:
diff --git a/include/nncase/transforms/neutral/squeeze_dims.h b/include/nncase/transforms/neutral/squeeze_dims.h
new file mode 100644
index 0000000000..26bffd6eb2
--- /dev/null
+++ b/include/nncase/transforms/neutral/squeeze_dims.h
@@ -0,0 +1,29 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "../transform.h"
+
+namespace nncase::ir::transforms
+{
+class NNCASE_API squeeze_dims_transform : public transform // NOTE(review): by its name, reduces tensor rank by squeezing dimensions — confirm against the definition
+{
+public:
+    void process(transform_context &context) override; // declaration only; the definition is not in this header
+
+protected:
+    bool skip_self_contained_check() const noexcept override { return true; } // always opts out of the base class's self-contained check
+    bool on_try_match(ir::node &node, transform_context &context) override; // declaration only; the definition is not in this header
+};
+}
diff --git a/modules/vulkan/src/codegen/templates/template.cpp b/modules/vulkan/src/codegen/templates/template.cpp
index 49e3394051..535e057be5 100644
--- a/modules/vulkan/src/codegen/templates/template.cpp
+++ b/modules/vulkan/src/codegen/templates/template.cpp
@@ -82,7 +82,7 @@ class xz_reader
 {
 public:
     xz_reader()
-        : archive_(ZipArchive::fromBuffer(xz_res_.data.data(),
+        : archive_(ZipArchive::fromBuffer((void *)xz_res_.data.data(),
             (libzippp_uint32)xz_res_.data.size(), ZipArchive::ReadOnly, true))
     {
         if (!archive_)
diff --git a/pyproject.toml b/pyproject.toml
index b83548a864..ea3670e25f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,35 @@
 [project]
-requires-python = ">=3.6"
+name = "nncase"
+dynamic = ["version"]
+requires-python = ">=3.7"
+authors = [{ name = "sunnycase", email = "sunnycase@live.cn" }]  # one table per person: name and email belong together
+maintainers = [{ name = "sunnycase", email = "sunnycase@live.cn" }]  # one table per person: name and email belong together
+readme = "README.md"
+description = "A neural network compiler for AI accelerators"
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: C++",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+keywords = ["kendryte", "nn", "compiler", "k210", "k510"]
+dependencies = ["numpy"]
+
+[project.urls]
+homepage = "https://github.com/kendryte/nncase"
 
 [build-system]
-requires = ["setuptools>=42", "wheel", "conan", "ninja"]
+requires = ["setuptools>=42", "wheel", "conan<=1.59", "ninja"]
 
 [tool.cibuildwheel]
-build = "cp3*"
+build = ["cp37*", "cp38*", "cp39*", "cp310*"]
 skip = "*musllinux*"
-manylinux-x86_64-image = "sunnycase/manylinux_2_24_x86_64:version1.0"
+manylinux-x86_64-image = "sunnycase/manylinux_2_24_x86_64:version1.1"
 test-requires = "pytest"
 test-command = [
   "pytest {project}/tests/other"
@@ -22,7 +44,7 @@ archs = ["AMD64"]
 [tool.cibuildwheel.linux]
 archs = ["x86_64"]
 before-all = [
-  "pip install conan",
+  "pip install conan==1.59",
   "conan profile new default --detect",
   "conan profile update settings.compiler.libcxx=libstdc++11 default",
   "wget https://sdk.lunarg.com/sdk/download/1.2.182.0/linux/vulkansdk-linux-x86_64-1.2.182.0.tar.gz -O vulkansdk.tar.gz",
diff --git a/requirements.test.txt b/requirements.test.txt
new file mode 100644
index 0000000000..e8c789b747
--- /dev/null
+++ b/requirements.test.txt
@@ -0,0 +1,17 @@
+tensorflow==2.10.0
+matplotlib
+pillow
+opencv-python==4.5.1.48
+onnx==1.12.0
+onnx-simplifier==0.3.6
+onnxoptimizer==0.2.6
+onnxruntime==1.12.0
+numpy==1.21.0
+torch==1.9.0
+torchvision==0.10.0
+imageio==2.15.0
+protobuf==3.12.2
+kendryte-caffe
+pytest
+pytest-xdist
+pyyaml
diff --git a/setup.py b/setup.py
index 6980af205d..a69d9cbf3f 100644
--- a/setup.py
+++ b/setup.py
@@ -252,33 +252,11 @@ def find_version():
     raise RuntimeError("Unable to find version string.")
 
 
-requirements = ["numpy"]
-
 setup(name='nncase',
       version=find_version(),
-      author="sunnycase",
-      author_email="sunnycase@live.cn",
-      maintainer="sunnycase",
       packages=['nncase'],
       package_dir={'': 'python'},
-      python_requires=">=3.6",
-      install_requires=requirements,
       ext_modules=[CMakeExtension(name="_nncase", sourcedir='.')],
-      description="A neural network compiler for AI accelerators",
-      url='https://github.com/kendryte/nncase',
-      long_description=open("README.md", 'r', encoding='utf8').read(),
-      long_description_content_type="text/markdown",
-      keywords="kendryte, nn, compiler, k210, k510",
-      classifiers=[
-          "Programming Language :: C++",
-          "Programming Language :: Python :: 3",
-          "Programming Language :: Python :: 3.6",
-          "Programming Language :: Python :: 3.7",
-          "Programming Language :: Python :: 3.8",
-          "Programming Language :: Python :: 3.9",
-          "License :: OSI Approved :: Apache Software License",
-          "Operating System :: OS Independent", ],
-      license='Apache-2.0',
       cmdclass={
           'build_ext': BuildCMakeExt,
           'install_data': InstallCMakeLibsData,
diff --git a/src/codegen/module_builder.cpp b/src/codegen/module_builder.cpp
index 12fc78ad39..fd6ead621c 100644
--- a/src/codegen/module_builder.cpp
+++ b/src/codegen/module_builder.cpp
@@ -189,6 +189,11 @@ function_call_id module_builder::function_id(ir::graph *graph)
     throw std::invalid_argument("Can't find graph " + graph->name() + " in modules");
 }
 
+std::streampos module_builder::get_current_entry_point() // Returns the entry point recorded for current_function_ (counterpart of set_current_entry_point).
+{
+    return entry_points_.at(current_function_); // NOTE(review): assuming entry_points_ is a standard map, at() throws std::out_of_range when no entry point was set yet — confirm callers expect that
+}
+
 void module_builder::set_current_entry_point(std::streampos pos)
 {
     entry_points_[current_function_] = pos;
diff --git a/src/codegen/stackvm/CMakeLists.txt b/src/codegen/stackvm/CMakeLists.txt
index 8e9769e8a3..f99e2483f6 100644
--- a/src/codegen/stackvm/CMakeLists.txt
+++ b/src/codegen/stackvm/CMakeLists.txt
@@ -1,41 +1,48 @@
-﻿cmake_minimum_required (VERSION 3.8)
+﻿cmake_minimum_required(VERSION 3.8)
 
 set(SRCS module_builder.cpp
-         op_writer.cpp
-         ops/batch_to_space.cpp
-         ops/binary.cpp
-         ops/broadcast.cpp
-         ops/call.cpp
-         ops/compare.cpp
-         ops/conv2d.cpp
-         ops/convert.cpp
-         ops/copy.cpp
-         ops/cumsum.cpp
-         ops/dequantize.cpp
-         ops/gather.cpp
-         ops/gather_nd.cpp
-         ops/hardmax.cpp
-         ops/matmul.cpp
-         ops/onehot.cpp
-         ops/pad.cpp
-         ops/quantize.cpp
-         ops/random_normal.cpp
-         ops/random_uniform.cpp
-         ops/reduce.cpp
-         ops/reduce_arg.cpp
-         ops/reduce_prod.cpp
-         ops/reduce_window2d.cpp
-         ops/resize_image.cpp
-         ops/roi_align.cpp
-         ops/slice.cpp
-         ops/sigmoid.cpp
-         ops/softmax.cpp
-         ops/table_lookup1d.cpp
-         ops/ternary.cpp
-         ops/topk.cpp
-         ops/transpose.cpp
-         ops/trilu.cpp
-         ops/unary.cpp)
+        op_writer.cpp
+        ops/batch_to_space.cpp
+        ops/binary.cpp
+        ops/broadcast.cpp
+        ops/call.cpp
+        ops/compare.cpp
+        ops/compress.cpp
+        ops/conv2d.cpp
+        ops/convert.cpp
+        ops/copy.cpp
+        ops/cumsum.cpp
+        ops/dequantize.cpp
+        ops/gather.cpp
+        ops/gather_elements.cpp
+        ops/gather_nd.cpp
+        ops/gru.cpp
+        ops/hardmax.cpp
+        ops/matmul.cpp
+        ops/onehot.cpp
+        ops/pad.cpp
+        ops/quantize.cpp
+        ops/random_normal.cpp
+        ops/random_uniform.cpp
+        ops/reduce.cpp
+        ops/reduce_arg.cpp
+        ops/reduce_prod.cpp
+        ops/reduce_window2d.cpp
+        ops/resize_image.cpp
+        ops/roi_align.cpp
+        ops/slice.cpp
+        ops/sigmoid.cpp
+        ops/softmax.cpp
+        ops/space_to_batch.cpp
+        ops/table_lookup1d.cpp
+        ops/ternary.cpp
+        ops/topk.cpp
+        ops/transpose.cpp
+        ops/trilu.cpp
+        ops/tflite_detection_postprocess.cpp
+        ops/unary.cpp
+        ops/layernorm.cpp
+        ops/instancenorm.cpp)
 
 add_library(codegen_stackvm OBJECT ${SRCS})
 target_link_libraries(codegen_stackvm PUBLIC ir schedule)
diff --git a/src/codegen/stackvm/module_builder.h b/src/codegen/stackvm/module_builder.h
index 758f33efe7..5bf1d712b4 100644
--- a/src/codegen/stackvm/module_builder.h
+++ b/src/codegen/stackvm/module_builder.h
@@ -20,14 +20,19 @@
 #include <nncase/ir/ops/broadcast.h>
 #include <nncase/ir/ops/call.h>
 #include <nncase/ir/ops/compare.h>
+#include <nncase/ir/ops/compress.h>
 #include <nncase/ir/ops/conv2d.h>
 #include <nncase/ir/ops/convert.h>
 #include <nncase/ir/ops/copy.h>
 #include <nncase/ir/ops/cumsum.h>
 #include <nncase/ir/ops/dequantize.h>
 #include <nncase/ir/ops/gather.h>
+#include <nncase/ir/ops/gather_elements.h>
 #include <nncase/ir/ops/gather_nd.h>
+#include <nncase/ir/ops/gru.h>
 #include <nncase/ir/ops/hardmax.h>
+#include <nncase/ir/ops/instancenorm.h>
+#include <nncase/ir/ops/layernorm.h>
 #include <nncase/ir/ops/matmul.h>
 #include <nncase/ir/ops/onehot.h>
 #include <nncase/ir/ops/pad.h>
@@ -43,8 +48,10 @@
 #include <nncase/ir/ops/sigmoid.h>
 #include <nncase/ir/ops/slice.h>
 #include <nncase/ir/ops/softmax.h>
+#include <nncase/ir/ops/space_to_batch.h>
 #include <nncase/ir/ops/table_lookup.h>
 #include <nncase/ir/ops/ternary.h>
+#include <nncase/ir/ops/tflite_detection_postprocess.h>
 #include <nncase/ir/ops/topk.h>
 #include <nncase/ir/ops/transpose.h>
 #include <nncase/ir/ops/trilu.h>
diff --git a/src/codegen/stackvm/op_writer.cpp b/src/codegen/stackvm/op_writer.cpp
index 72cb231780..8a6a38dd9d 100644
--- a/src/codegen/stackvm/op_writer.cpp
+++ b/src/codegen/stackvm/op_writer.cpp
@@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 5/9/2023 5:18:43 PM +08:00.
  *
  * Copyright 2019-2021 Canaan Inc.
  *
@@ -643,6 +643,11 @@ void op_builder::tensor_softmax_(datatype_t datatype, uint8_t rshape_src, uint8_
     op_writer<tensor_softmax_op_t>()(tensor_softmax_op_t(datatype, rshape_src, rstride_src, rstride_dest, axis, beta), writer_);
 }
 
+void op_builder::tensor_space_to_batch_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops) // Builds a tensor_space_to_batch_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_space_to_batch_op_t>()(tensor_space_to_batch_op_t(datatype, rshape_src, rstride_src, rstride_dest, rshape_block, rpad_crops), writer_);
+}
+
 void op_builder::tensor_ternary_(datatype_t datatype, uint8_t rshape_src1, uint8_t rstride_src1, uint8_t rshape_src2, uint8_t rstride_src2, uint8_t rshape_src3, uint8_t rstride_src3, uint8_t rstride_dest)
 {
     op_writer<tensor_ternary_op_t>()(tensor_ternary_op_t(datatype, rshape_src1, rstride_src1, rshape_src2, rstride_src2, rshape_src3, rstride_src3, rstride_dest), writer_);
@@ -667,3 +672,33 @@ void op_builder::tensor_transpose_(datatype_t datatype, uint8_t rshape_src, uint
 {
     op_writer<tensor_transpose_op_t>()(tensor_transpose_op_t(datatype, rshape_src, rstride_src, rstride_dest, rshape_perm), writer_);
 }
+
+void op_builder::tensor_gru_(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset) // Builds a tensor_gru_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_gru_op_t>()(tensor_gru_op_t(input_shape_src, w_shape_src, direction, linear_before_reset), writer_);
+}
+
+void op_builder::tensor_tflite_detection_postprocess_(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale) // Builds a tensor_tflite_detection_postprocess_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_tflite_detection_postprocess_op_t>()(tensor_tflite_detection_postprocess_op_t(box_shape_src, score_shape_src, anchor_shape_src, max_detections, max_classes_per_detection, detections_per_class, use_regular_non_max_suppression, nms_score_threshold, nms_iou_threshold, num_classes, y_scale, x_scale, h_scale, w_scale), writer_);
+}
+
+void op_builder::tensor_layer_normalization_(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon) // Builds a tensor_layer_normalization_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_layer_normalization_op_t>()(tensor_layer_normalization_op_t(datatype, input_shape, axis, epsilon), writer_);
+}
+
+void op_builder::tensor_compress_(uint8_t input_shape_src, uint8_t condition_shape_src, float axis) // Builds a tensor_compress_op_t and serializes it to writer_. NOTE(review): axis is float here but int32_t in tensor_gather_elements_ — confirm intended.
+{
+    op_writer<tensor_compress_op_t>()(tensor_compress_op_t(input_shape_src, condition_shape_src, axis), writer_);
+}
+
+void op_builder::tensor_gather_elements_(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis) // Builds a tensor_gather_elements_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_gather_elements_op_t>()(tensor_gather_elements_op_t(input_shape_src, indices_shape_src, axis), writer_);
+}
+
+void op_builder::tensor_instance_normalization_(datatype_t datatype, uint8_t input_shape, float epsilon) // Builds a tensor_instance_normalization_op_t from the arguments and serializes it to writer_.
+{
+    op_writer<tensor_instance_normalization_op_t>()(tensor_instance_normalization_op_t(datatype, input_shape, epsilon), writer_);
+}
diff --git a/src/codegen/stackvm/ops.def b/src/codegen/stackvm/ops.def
index 42f14e424c..a30ed404bf 100644
--- a/src/codegen/stackvm/ops.def
+++ b/src/codegen/stackvm/ops.def
@@ -3,13 +3,16 @@ DEFINE_OP(binary)
 DEFINE_OP(broadcast)
 DEFINE_OP(call)
 DEFINE_OP(compare)
+DEFINE_OP(compress)
 DEFINE_OP(conv2d)
 DEFINE_OP(convert)
 DEFINE_OP(copy)
 DEFINE_OP(cumsum)
 DEFINE_OP(dequantize)
 DEFINE_OP(gather)
+DEFINE_OP(gather_elements)
 DEFINE_OP(gather_nd)
+DEFINE_OP(gru)
 DEFINE_OP(hardmax)
 DEFINE_OP(matmul)
 DEFINE_OP(onehot)
@@ -26,9 +29,13 @@ DEFINE_OP(roi_align)
 DEFINE_OP(sigmoid)
 DEFINE_OP(slice)
 DEFINE_OP(softmax)
+DEFINE_OP(space_to_batch)
 DEFINE_OP(table_lookup1d)
 DEFINE_OP(ternary)
 DEFINE_OP(topk)
 DEFINE_OP(transpose)
 DEFINE_OP(trilu)
-DEFINE_OP(unary)
\ No newline at end of file
+DEFINE_OP(tflite_detection_postprocess)
+DEFINE_OP(unary)
+DEFINE_OP(layernorm)
+DEFINE_OP(instancenorm)
\ No newline at end of file
diff --git a/src/codegen/stackvm/ops/compress.cpp b/src/codegen/stackvm/ops/compress.cpp
new file mode 100644
index 0000000000..2e27b6bf4e
--- /dev/null
+++ b/src/codegen/stackvm/ops/compress.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(compress &node, stackvm_op_builder &builder) // Emits the stackvm instruction sequence for a compress node.
+{
+    auto &src_buf = allocation(node.input());
+    auto &cond_buf = allocation(node.condition());
+    auto &dst_buf = allocation(node.output());
+    builder.lea_buffer(src_buf); // buffer addresses are emitted in the order: input, condition, output
+    builder.lea_buffer(cond_buf);
+    builder.lea_buffer(dst_buf);
+
+    builder.stshape(0, src_buf.shape); // shape index 0 <- input shape
+    builder.stshape(1, cond_buf.shape); // shape index 1 <- condition shape
+    builder.tensor_compress_(0, 1, node.axis()); // 0 and 1 are the shape indices stored just above
+}
diff --git a/src/codegen/stackvm/ops/gather_elements.cpp b/src/codegen/stackvm/ops/gather_elements.cpp
new file mode 100644
index 0000000000..23a9cdca54
--- /dev/null
+++ b/src/codegen/stackvm/ops/gather_elements.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(gather_elements &node, stackvm_op_builder &builder) // Emits the stackvm instruction sequence for a gather_elements node.
+{
+    auto &src_buf = allocation(node.input());
+    auto &idx_buf = allocation(node.indices());
+    auto &dst_buf = allocation(node.output());
+    builder.lea_buffer(src_buf); // buffer addresses are emitted in the order: input, indices, output
+    builder.lea_buffer(idx_buf);
+    builder.lea_buffer(dst_buf);
+
+    builder.stshape(0, src_buf.shape); // shape index 0 <- input shape
+    builder.stshape(1, idx_buf.shape); // shape index 1 <- indices shape
+
+    builder.tensor_gather_elements_(0, 1, node.axis()); // 0 and 1 are the shape indices stored just above
+}
\ No newline at end of file
diff --git a/src/codegen/stackvm/ops/gru.cpp b/src/codegen/stackvm/ops/gru.cpp
new file mode 100644
index 0000000000..76147cc2e2
--- /dev/null
+++ b/src/codegen/stackvm/ops/gru.cpp
@@ -0,0 +1,43 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(gru &node, stackvm_op_builder &builder) // Emits the stackvm instruction sequence for a gru node.
+{
+    auto &x_buf = allocation(node.input());
+    auto &w_buf = allocation(node.w());
+    auto &r_buf = allocation(node.r());
+    auto &b_buf = allocation(node.b());
+    auto &h0_buf = allocation(node.initial_h());
+    auto &y_buf = allocation(node.output());
+    auto &yh_buf = allocation(node.output_h());
+    builder.lea_buffer(x_buf); // buffer addresses are emitted in the order: input, w, r, b, initial_h, output, output_h
+    builder.lea_buffer(w_buf);
+    builder.lea_buffer(r_buf);
+    builder.lea_buffer(b_buf);
+    builder.lea_buffer(h0_buf);
+    builder.lea_buffer(y_buf);
+    builder.lea_buffer(yh_buf);
+
+    builder.stshape(0, x_buf.shape); // shape index 0 <- input shape
+    builder.stshape(1, w_buf.shape); // shape index 1 <- w shape
+
+    builder.tensor_gru_(0, 1, node.direction(), node.linear_before_reset()); // 0 and 1 are the shape indices stored just above
+}
diff --git a/src/codegen/stackvm/ops/instancenorm.cpp b/src/codegen/stackvm/ops/instancenorm.cpp
new file mode 100644
index 0000000000..4f2c323410
--- /dev/null
+++ b/src/codegen/stackvm/ops/instancenorm.cpp
@@ -0,0 +1,37 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(instancenorm &node, stackvm_op_builder &builder)
+{
+    // Resolve the four allocations, then emit them in input/scale/bias/output order.
+    auto &in_alloc = allocation(node.input());
+    auto &scale_alloc = allocation(node.scale());
+    auto &bias_alloc = allocation(node.bias());
+    auto &out_alloc = allocation(node.output());
+
+    for (auto *operand : { &in_alloc, &scale_alloc, &bias_alloc, &out_alloc })
+        builder.lea_buffer(*operand);
+
+    // Only the input shape is stored (stshape 0). The op receives the output
+    // datatype, the literal 0 (presumably that same shape index) and epsilon.
+    builder.stshape(0, in_alloc.shape);
+    builder.tensor_instance_normalization_(node.output().type(), 0, node.epsilon());
+}
diff --git a/src/codegen/stackvm/ops/layernorm.cpp b/src/codegen/stackvm/ops/layernorm.cpp
new file mode 100644
index 0000000000..77042a7e65
--- /dev/null
+++ b/src/codegen/stackvm/ops/layernorm.cpp
@@ -0,0 +1,37 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(layernorm &node, stackvm_op_builder &builder)
+{
+    // Resolve the four allocations, then emit them in input/scale/bias/output order.
+    auto &in_alloc = allocation(node.input());
+    auto &scale_alloc = allocation(node.scale());
+    auto &bias_alloc = allocation(node.bias());
+    auto &out_alloc = allocation(node.output());
+
+    for (auto *operand : { &in_alloc, &scale_alloc, &bias_alloc, &out_alloc })
+        builder.lea_buffer(*operand);
+
+    // Only the input shape is stored (stshape 0). The op receives the output datatype,
+    // the literal 0 (presumably that same shape index), the axis and epsilon.
+    builder.stshape(0, in_alloc.shape);
+    builder.tensor_layer_normalization_(node.output().type(), 0, node.axis(), node.epsilon());
+}
diff --git a/src/codegen/stackvm/ops/space_to_batch.cpp b/src/codegen/stackvm/ops/space_to_batch.cpp
new file mode 100644
index 0000000000..fd518ee31e
--- /dev/null
+++ b/src/codegen/stackvm/ops/space_to_batch.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(space_to_batch &node, stackvm_op_builder &builder)
+{
+    auto &src = allocation(node.input());
+    auto &dst = allocation(node.output());
+    builder.lea_buffer(src); // input buffer is emitted first ...
+    builder.lea_buffer(dst); // ... followed by the output buffer
+    // stshape usage: 0 = input shape, 1 = input strides, 2 = output strides, 3 = { block_size_h, block_size_w }.
+    builder.stshape(0, src.shape);
+    builder.stshape(1, src.strides);
+    builder.stshape(2, dst.strides);
+    builder.stshape(3, shape_t { static_cast<size_t>(node.block_size_h()), static_cast<size_t>(node.block_size_w()) });
+    builder.stpaddings(0, std::vector<padding> { node.padding_h(), node.padding_w() });
+    builder.tensor_space_to_batch_(node.input().type(), 0, 1, 2, 3, 0);
+}
diff --git a/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp b/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp
new file mode 100644
index 0000000000..5da2d2afb9
--- /dev/null
+++ b/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp
@@ -0,0 +1,47 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../module_builder.h"
+
+using namespace nncase;
+using namespace nncase::codegen;
+using namespace nncase::codegen::stackvm;
+using namespace nncase::ir;
+
+void stackvm_module_builder::emit(tflite_detection_postprocess &node, stackvm_op_builder &builder)
+{
+    // Three inputs (boxes, scores, anchors) followed by four outputs.
+    auto &boxes = allocation(node.boxes());
+    auto &scores = allocation(node.scores());
+    auto &anchors = allocation(node.anchors());
+    auto &out_locations = allocation(node.output_locations());
+    auto &out_classes = allocation(node.output_classes());
+    auto &out_scores = allocation(node.output_scores());
+    auto &out_count = allocation(node.output_num_detections());
+
+    // lea_buffer is emitted once per allocation, inputs first and outputs
+    // after, in the same order as the declarations above.
+    for (auto *operand : { &boxes, &scores, &anchors, &out_locations, &out_classes, &out_scores, &out_count })
+        builder.lea_buffer(*operand);
+
+    // The three input shapes are stored with stshape under 0, 1 and 2.
+    builder.stshape(0, boxes.shape);
+    builder.stshape(1, scores.shape);
+    builder.stshape(2, anchors.shape);
+
+    // After the literals 0, 1, 2 the op takes the node's attributes, forwarded unchanged.
+    builder.tensor_tflite_detection_postprocess_(0, 1, 2,
+        node.max_detections(), node.max_classes_per_detection(), node.detections_per_class(), node.use_regular_non_max_suppression(),
+        node.nms_score_threshold(), node.nms_iou_threshold(), node.num_classes(), node.y_scale(), node.x_scale(), node.h_scale(), node.w_scale());
+}
diff --git a/src/evaluator/ops/neutral/neutral_ops.cpp b/src/evaluator/ops/neutral/neutral_ops.cpp
index 16cb8a7090..6a42b9d585 100644
--- a/src/evaluator/ops/neutral/neutral_ops.cpp
+++ b/src/evaluator/ops/neutral/neutral_ops.cpp
@@ -21,6 +21,7 @@
 #include <nncase/ir/ops/broadcast.h>
 #include <nncase/ir/ops/clamp.h>
 #include <nncase/ir/ops/compare.h>
+#include <nncase/ir/ops/compress.h>
 #include <nncase/ir/ops/concat.h>
 #include <nncase/ir/ops/conv2d.h>
 #include <nncase/ir/ops/conv2d_transpose.h>
@@ -29,8 +30,12 @@
 #include <nncase/ir/ops/dequantize.h>
 #include <nncase/ir/ops/fused_unary.h>
 #include <nncase/ir/ops/gather.h>
+#include <nncase/ir/ops/gather_elements.h>
 #include <nncase/ir/ops/gather_nd.h>
+#include <nncase/ir/ops/gru.h>
 #include <nncase/ir/ops/hardmax.h>
+#include <nncase/ir/ops/instancenorm.h>
+#include <nncase/ir/ops/layernorm.h>
 #include <nncase/ir/ops/matmul.h>
 #include <nncase/ir/ops/onehot.h>
 #include <nncase/ir/ops/pad.h>
@@ -46,8 +51,10 @@
 #include <nncase/ir/ops/sigmoid.h>
 #include <nncase/ir/ops/slice.h>
 #include <nncase/ir/ops/softmax.h>
+#include <nncase/ir/ops/space_to_batch.h>
 #include <nncase/ir/ops/table_lookup.h>
 #include <nncase/ir/ops/ternary.h>
+#include <nncase/ir/ops/tflite_detection_postprocess.h>
 #include <nncase/ir/ops/topk.h>
 #include <nncase/ir/ops/transpose.h>
 #include <nncase/ir/ops/trilu.h>
@@ -108,8 +115,7 @@ void register_neutral_evaluators()
             runtime_shape_t { (size_t)rnode.block_size_h(), (size_t)rnode.block_size_w() },
             runtime_paddings_t { padding { rnode.crop_h()[0], rnode.crop_h()[1] }, padding { rnode.crop_w()[0], rnode.crop_w()[1] } },
             input.strides(), output.strides())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_binary, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<binary &>(node);
@@ -141,8 +147,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for binary: " + std::string(datatype_names(input_type));
-        }
-    });
+        } });
 
     register_evaluator(op_broadcast, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<broadcast &>(node);
@@ -151,8 +156,7 @@ void register_neutral_evaluators()
         auto output = context.memory_at(rnode.output());
         kernels::broadcast(input.datatype(), input.buffer().data(), output.buffer().data(),
             input.shape(), input.strides(), output.shape(), output.strides())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_concat, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<concat &>(node);
@@ -170,8 +174,7 @@ void register_neutral_evaluators()
         runtime_shape_t concat_dims { rnode.concat_dims().begin(), rnode.concat_dims().end() };
         kernels::concat(rnode.output().type(), inputs_mem, output.buffer().data(), output.shape(), inputs_strides,
             output.strides(), rnode.axis(), concat_dims)
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_conv2d, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<conv2d &>(node);
@@ -190,8 +193,7 @@ void register_neutral_evaluators()
         kernels::conv2d(input_mem.data(), weights_mem.data(), bias_mem.data(), output_mem.data(), input.shape(), input.strides(),
             weights.shape(), weights.strides(), bias.strides(), output.strides(), rnode.padding_h(), rnode.padding_w(),
             rnode.groups(), rnode.stride_h(), rnode.stride_w(), rnode.dilation_h(), rnode.dilation_w(), rnode.fused_activation())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_conv2d_transpose, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<conv2d_transpose &>(node);
@@ -204,8 +206,7 @@ void register_neutral_evaluators()
 
         neutral::conv2d_transpose(input.data(), output.data(), weights.data(), bias.data(), to(rnode.input().shape()),
             rnode.groups(), to(rnode.output().shape()), rnode.filter_h(), rnode.filter_w(), rnode.stride_h(), rnode.stride_w(),
-            rnode.dilation_h(), rnode.dilation_w(), rnode.padding_h(), rnode.padding_w(), rnode.fused_activation());
-    });
+            rnode.dilation_h(), rnode.dilation_w(), rnode.padding_h(), rnode.padding_w(), rnode.fused_activation()); });
 
     register_evaluator(op_dequantize, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<dequantize &>(node);
@@ -229,8 +230,7 @@ void register_neutral_evaluators()
             assert(false && "not supported type!");
 
 #undef DEQUANTIZE
-        }
-    });
+        } });
 
     register_evaluator(op_compare, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<compare &>(node);
@@ -268,8 +268,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for compare: " + std::string(datatype_names(input_type));
-        }
-    });
+        } });
 
     register_evaluator(op_fused_unary, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<fused_unary &>(node);
@@ -286,8 +285,7 @@ void register_neutral_evaluators()
         auto buf = ss.str();
         std::vector<gsl::byte> body(reinterpret_cast<gsl::byte *>(buf.data()), reinterpret_cast<gsl::byte *>(buf.data() + buf.size()));
         kernels::nnil_unary_method(input.data(), output.data(), input.size(), body)
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_matmul, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<matmul &>(node);
@@ -305,8 +303,7 @@ void register_neutral_evaluators()
 
         kernels::matmul(input_a_mem.data(), input_b_mem.data(), bias_mem.data(), output_mem.data(), input_a.shape(), input_a.strides(),
             input_b.shape(), input_b.strides(), output.shape(), output.strides(), rnode.fused_activation())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_pad, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<pad &>(node);
@@ -318,8 +315,7 @@ void register_neutral_evaluators()
 
         kernels::pad(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), input.strides(),
             output.strides(), to(rnode.paddings()), rnode.pad_mode(), rnode.pad_value())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_quantize, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<quantize &>(node);
@@ -339,8 +335,7 @@ void register_neutral_evaluators()
         default:
             assert(false && "not supported type!");
 #undef QUANTIZE
-        }
-    });
+        } });
 
     register_evaluator(op_reduce, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<reduce &>(node);
@@ -360,10 +355,14 @@ void register_neutral_evaluators()
                 output.buffer().as_span<int32_t>().data(), input.shape(), to(rnode.axis()), input.strides(), output.strides(), rnode.keep_dims())
                 .unwrap_or_throw();
             break;
+        case dt_int64:
+            kernels::reduce(rnode.reduce_op(), static_cast<int64_t>(rnode.init_value()), input.buffer().as_span<int64_t>().data(),
+                            output.buffer().as_span<int64_t>().data(), input.shape(), to(rnode.axis()), input.strides(), output.strides(), rnode.keep_dims())
+                    .unwrap_or_throw();
+            break;
         default:
             std::cerr << "unsupported dtype for reduce: " + std::string(datatype_names(input_type));
-        }
-    });
+        } });
 
     register_evaluator(op_reduce_arg, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<reduce_arg &>(node);
@@ -389,8 +388,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for reduce_arg: " + std::string(datatype_names(output_type));
-        }
-    });
+        } });
 
     register_evaluator(op_reduce_prod, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<reduce_prod &>(node);
@@ -412,8 +410,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for reduce_prod: " + std::string(datatype_names(input_type));
-        }
-    });
+        } });
 
     register_evaluator(op_reduce_window2d, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<reduce_window2d &>(node);
@@ -427,8 +424,7 @@ void register_neutral_evaluators()
         kernels::reduce_window2d(rnode.reduce_op(), input_mem.data(), rnode.init_value(), output_mem.data(),
             input.shape(), input.strides(), output.strides(), rnode.padding_h(), rnode.padding_w(), rnode.filter_h(), rnode.filter_w(),
             rnode.stride_h(), rnode.stride_w(), rnode.dilation_h(), rnode.dilation_w(), rnode.fused_activation())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_bitcast, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<bitcast &>(node);
@@ -436,8 +432,7 @@ void register_neutral_evaluators()
         auto input = context.memory_at(rnode.input()).buffer();
         auto output = context.memory_at(rnode.output()).buffer();
 
-        std::copy(input.begin(), input.end(), output.begin());
-    });
+        std::copy(input.begin(), input.end(), output.begin()); });
 
     register_evaluator(op_resize_image, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<resize_image &>(node);
@@ -460,8 +455,7 @@ void register_neutral_evaluators()
                 input.shape(), input.strides(), output.strides(), new_size[0], new_size[1],
                 rnode.align_corners(), rnode.half_pixel_centers())
                 .unwrap_or_throw();
-        }
-    });
+        } });
 
     register_evaluator(op_roi_align, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<roi_align &>(node);
@@ -482,8 +476,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for roi_align: " + std::string(datatype_names(input_type));
-        }
-    });
+        } });
 
     register_evaluator(op_sigmoid, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<sigmoid &>(node);
@@ -501,8 +494,7 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for sigmoid: " + std::string(datatype_names(output_type));
-        }
-    });
+        } });
 
     register_evaluator(op_slice, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<slice &>(node);
@@ -514,8 +506,7 @@ void register_neutral_evaluators()
 
         kernels::slice(input.datatype(), input_mem.data(), output_mem.data(), input.shape(),
             input.strides(), output.strides(), to(rnode.begin()), to<int32_t>(rnode.end()), to<int32_t>(rnode.strides()))
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_softmax, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<softmax &>(node);
@@ -533,8 +524,19 @@ void register_neutral_evaluators()
             break;
         default:
             std::cerr << "unsupported dtype for softmax: " + std::string(datatype_names(output_type));
-        }
-    });
+        } });
+
+    // Evaluate space_to_batch by forwarding the node's block size and paddings to the kernel.
+    register_evaluator(op_space_to_batch, [](ir::node &node, function_evaluate_context &context) {
+        auto &s2b = static_cast<space_to_batch &>(node);
+        auto src = context.memory_at(s2b.input());
+        auto dst = context.memory_at(s2b.output());
+        const runtime_shape_t block_shape { static_cast<size_t>(s2b.block_size_h()), static_cast<size_t>(s2b.block_size_w()) };
+        const runtime_paddings_t paddings { s2b.padding_h(), s2b.padding_w() };
+
+        kernels::space_to_batch(src.datatype(), src.buffer().data(), dst.buffer().data(), src.shape(),
+            block_shape, paddings, src.strides(), dst.strides())
+            .unwrap_or_throw(); });
 
     register_evaluator(op_ternary, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<ternary &>(node);
@@ -553,10 +555,15 @@ void register_neutral_evaluators()
                 input_b.shape(), input_b.strides(), input_c.shape(), input_c.strides(), output.strides())
                 .unwrap_or_throw();
             break;
+        case dt_int64:
+            kernels::ternary(input_a.buffer().as_span<float>().data(), input_b.buffer().as_span<int64_t>().data(),
+                input_c.buffer().as_span<int64_t>().data(), output.buffer().as_span<int64_t>().data(), input_a.shape(), input_a.strides(),
+                input_b.shape(), input_b.strides(), input_c.shape(), input_c.strides(), output.strides())
+                .unwrap_or_throw();
+            break;
         default:
             std::cerr << "unsupported dtype for ternary: " + std::string(datatype_names(output_type));
-        }
-    });
+        } });
 
     register_evaluator(op_transpose, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<transpose &>(node);
@@ -568,8 +575,7 @@ void register_neutral_evaluators()
 
         kernels::transpose(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), to(rnode.perm()),
             input.strides(), output.strides())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_unary, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<unary &>(node);
@@ -635,10 +641,12 @@ void register_neutral_evaluators()
         case unary_tanh:
             unary([](auto a) { return tanh(a); });
             break;
+        case unary_erf:
+            unary([](auto a) { return erf(a); });
+            break;
         default:
             throw std::runtime_error("Not supported unary");
-        }
-    });
+        } });
 
     register_evaluator(op_table_lookup1d, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<table_lookup1d &>(node);
@@ -648,8 +656,7 @@ void register_neutral_evaluators()
         auto table = context.memory_at(rnode.table()).buffer().as_span<uint8_t>();
         auto output = context.memory_at(rnode.output()).buffer().as_span<uint8_t>();
 
-        kernels::neutral::table_lookup1d(input.data(), output.data(), input.size(), table.data());
-    });
+        kernels::neutral::table_lookup1d(input.data(), output.data(), input.size(), table.data()); });
 
     register_evaluator(op_clamp, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<clamp &>(node);
@@ -667,8 +674,7 @@ void register_neutral_evaluators()
         for (size_t i = 0; i < input.size(); i++)
         {
             output_ptr[i] = std::clamp(input_ptr[i], low, high);
-        }
-    });
+        } });
 
     register_evaluator(op_convert, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<convert &>(node);
@@ -680,8 +686,7 @@ void register_neutral_evaluators()
 
         kernels::convert(input.datatype(), output.datatype(), input_mem.data(), output_mem.data(), input.shape(),
             input.strides(), output.strides())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_gather, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<gather &>(node);
@@ -694,8 +699,7 @@ void register_neutral_evaluators()
 
         kernels::gather(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), output.shape(),
             input.strides(), output.strides(), reinterpret_cast<const int32_t *>(indices.buffer().data()), indices.shape(), rnode.axis())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_gather_nd, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<gather_nd &>(node);
@@ -708,8 +712,7 @@ void register_neutral_evaluators()
 
         kernels::gather_nd(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), output.shape(),
             input.strides(), output.strides(), reinterpret_cast<const int32_t *>(indices.buffer().data()), indices.shape(), rnode.batch_dims())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_onehot, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<onehot &>(node);
@@ -726,8 +729,7 @@ void register_neutral_evaluators()
         auto off_value_mem = off_value.buffer().data();
         kernels::onehot(output.datatype(), indices_mem, output_mem, indices.shape(), output.shape(),
             output.strides(), depth_mem, off_value_mem, on_value_mem, rnode.axis(), rnode.mode())
-            .unwrap_or_throw();
-    });
+            .unwrap_or_throw(); });
 
     register_evaluator(op_cumsum, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<cumsum &>(node);
@@ -749,8 +751,7 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for cumsum: " + std::string(datatype_names(datatype)));
-        }
-    });
+        } });
 
     register_evaluator(op_hardmax, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<hardmax &>(node);
@@ -767,8 +768,7 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for hardmax: " + std::string(datatype_names(datatype)));
-        }
-    });
+        } });
 
     register_evaluator(op_random_normal, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<random_normal &>(node);
@@ -783,8 +783,7 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for random_normal: " + std::string(datatype_names(datatype)));
-        }
-    });
+        } });
 
     register_evaluator(op_random_uniform, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<random_uniform &>(node);
@@ -799,8 +798,7 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for random_uniform: " + std::string(datatype_names(datatype)));
-        }
-    });
+        } });
 
     register_evaluator(op_topk, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<topk &>(node);
@@ -821,8 +819,7 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for topk: " + std::string(datatype_names(datatype)));
-        }
-    });
+        } });
 
     register_evaluator(op_trilu, [](ir::node &node, function_evaluate_context &context) {
         auto &rnode = static_cast<trilu &>(node);
@@ -839,8 +836,107 @@ void register_neutral_evaluators()
             break;
         default:
             throw std::runtime_error("unsupported dtype for topk: " + std::string(datatype_names(datatype)));
+        } });
+
+    // Evaluate gru: all seven tensors are handed to kernels::gru as float spans.
+    register_evaluator(op_gru, [](ir::node &node, function_evaluate_context &context) {
+        auto &gru_node = static_cast<gru &>(node);
+        auto f32 = [](auto &tensor) { return tensor.buffer().template as_span<float>().data(); };
+        auto x = context.memory_at(gru_node.input());
+        auto w = context.memory_at(gru_node.w());
+        auto r = context.memory_at(gru_node.r());
+        auto b = context.memory_at(gru_node.b());
+        auto h0 = context.memory_at(gru_node.initial_h());
+        auto y = context.memory_at(gru_node.output());
+        auto yh = context.memory_at(gru_node.output_h());
+        kernels::gru(f32(x), f32(w), f32(r), f32(b), f32(h0), f32(y), f32(yh), x.shape(), w.shape(),
+            gru_node.direction(), gru_node.linear_before_reset()).unwrap_or_throw(); });
+
+    // Evaluate tflite_detection_postprocess: every tensor is passed to the kernel as a float span.
+    register_evaluator(op_tflite_detection_postprocess, [](ir::node &node, function_evaluate_context &context) {
+        auto &det = static_cast<tflite_detection_postprocess &>(node);
+        auto f32 = [](auto &tensor) { return tensor.buffer().template as_span<float>().data(); };
+        auto boxes = context.memory_at(det.boxes());
+        auto scores = context.memory_at(det.scores());
+        auto anchors = context.memory_at(det.anchors());
+        auto out_locations = context.memory_at(det.output_locations());
+        auto out_classes = context.memory_at(det.output_classes());
+        auto out_scores = context.memory_at(det.output_scores());
+        auto out_count = context.memory_at(det.output_num_detections());
+        kernels::tflite_detection_postprocess(f32(boxes), f32(scores), f32(anchors), f32(out_locations), f32(out_classes), f32(out_scores), f32(out_count),
+            boxes.shape(), scores.shape(), anchors.shape(), det.max_detections(), det.max_classes_per_detection(), det.detections_per_class(),
+            det.use_regular_non_max_suppression(), det.nms_score_threshold(), det.nms_iou_threshold(), det.num_classes(), det.y_scale(), det.x_scale(), det.h_scale(), det.w_scale())
+            .unwrap_or_throw(); });
+
+    // Evaluate gather_elements by calling kernels::gather_elements along the node's axis.
+    register_evaluator(op_gather_elements, [](ir::node &node, function_evaluate_context &context) {
+        auto &ge = static_cast<gather_elements &>(node);
+        auto src = context.memory_at(ge.input());
+        auto idx = context.memory_at(ge.indices());
+        auto dst = context.memory_at(ge.output());
+        const auto dtype = ge.input().type();
+
+        // Only float32 input is handled (indices are read as int64); anything else throws.
+        if (dtype != dt_float32)
+            throw std::runtime_error("unsupported dtype for gather_elements: " + std::string(datatype_names(dtype)));
+        else
+        {
+            kernels::gather_elements(src.buffer().as_span<float>().data(), idx.buffer().as_span<int64_t>().data(), dst.buffer().as_span<float>().data(),
+                src.shape(), idx.shape(), ge.axis())
+                .unwrap_or_throw();
         }
     });
+
+    // Evaluate instancenorm: float32 outputs are computed by kernels::instancenorm.
+    register_evaluator(op_instancenorm, [](ir::node &node, function_evaluate_context &context) {
+        auto &rnode = static_cast<instancenorm &>(node);
+        auto input = context.memory_at(rnode.input());
+        auto scale = context.memory_at(rnode.scale());
+        auto bias = context.memory_at(rnode.bias());
+        auto output = context.memory_at(rnode.output());
+        auto output_type = rnode.output().type();
+
+        // Any other output dtype is only reported on stderr; no kernel runs for it.
+        switch (output_type)
+        {
+        case dt_float32:
+            kernels::instancenorm(input.buffer().as_span<float>().data(), output.buffer().as_span<float>().data(),
+                scale.buffer().as_span<float>().data(), bias.buffer().as_span<float>().data(), input.shape(), rnode.epsilon())
+                .unwrap_or_throw();
+            break;
+        default:
+            std::cerr << "unsupported dtype for instancenorm: " + std::string(datatype_names(output_type));
+        } });
+
+    // Evaluate layernorm for float32 outputs by calling kernels::layernorm with the
+    // node's axis and epsilon; any other output dtype only produces a stderr message.
+    register_evaluator(op_layernorm, [](ir::node &node, function_evaluate_context &context) {
+        auto &ln = static_cast<layernorm &>(node);
+        auto src = context.memory_at(ln.input());
+        auto scale = context.memory_at(ln.scale());
+        auto bias = context.memory_at(ln.bias());
+        auto dst = context.memory_at(ln.output());
+        const auto out_type = ln.output().type();
+
+        if (out_type == dt_float32)
+        {
+            kernels::layernorm(src.buffer().as_span<float>().data(), dst.buffer().as_span<float>().data(),
+                scale.buffer().as_span<float>().data(), bias.buffer().as_span<float>().data(), src.shape(), ln.axis(), ln.epsilon())
+                .unwrap_or_throw();
+        }
+        else
+        {
+            std::cerr << "unsupported dtype for layernorm: " + std::string(datatype_names(out_type));
+        } });
+
+    // Evaluate compress: float data, uint8 condition, selection along the node's axis.
+    register_evaluator(op_compress, [](ir::node &node, function_evaluate_context &context) {
+        auto &cnode = static_cast<compress &>(node);
+        auto src = context.memory_at(cnode.input());
+        auto cond = context.memory_at(cnode.condition());
+        auto dst = context.memory_at(cnode.output());
+        kernels::compress(src.buffer().as_span<float>().data(), cond.buffer().as_span<uint8_t>().data(), dst.buffer().as_span<float>().data(), src.shape(), cond.shape(), cnode.axis())
+            .unwrap_or_throw(); });
 }
 
 }
diff --git a/src/importer/caffe/CMakeLists.txt b/src/importer/caffe/CMakeLists.txt
index 7fbea7a58c..70e91f9b78 100644
--- a/src/importer/caffe/CMakeLists.txt
+++ b/src/importer/caffe/CMakeLists.txt
@@ -25,6 +25,6 @@ set(SRCS caffe.pb.cc
 add_library(caffe_importer ${SRCS})
 target_include_directories(caffe_importer PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 target_include_directories(caffe_importer PUBLIC ${PARENT_SOURCE_DIR}/include)
-target_link_libraries(caffe_importer PUBLIC ir)
+target_link_libraries(caffe_importer PUBLIC ir protobuf::libprotobuf)
 target_link_libraries(caffe_importer PRIVATE)
 set_target_properties(caffe_importer PROPERTIES POSITION_INDEPENDENT_CODE ON)
\ No newline at end of file
diff --git a/src/importer/onnx/CMakeLists.txt b/src/importer/onnx/CMakeLists.txt
index 1f4a58fadc..e00d09f29c 100644
--- a/src/importer/onnx/CMakeLists.txt
+++ b/src/importer/onnx/CMakeLists.txt
@@ -28,6 +28,7 @@ set(ONNX_IMPORTER_OPS_SOURCES
     ops/hardmax.cpp
     ops/identity.cpp
     ops/instancenorm.cpp
+    ops/layernorm.cpp
     ops/lpnorm.cpp
     ops/lrn.cpp
     ops/matmul.cpp
@@ -59,6 +60,9 @@ set(ONNX_IMPORTER_OPS_SOURCES
     ops/unsqueeze.cpp
     ops/lstm.cpp
     ops/where.cpp
+    ops/gru.cpp
+    ops/gather_elements.cpp
+    ops/compress.cpp
     )
 
 add_library(onnx_importer ${ONNX_IMPORTER_SOURCES} ${ONNX_IMPORTER_OPS_SOURCES})
diff --git a/src/importer/onnx/onnx_importer.h b/src/importer/onnx/onnx_importer.h
index 6b496b7258..d9af3c4906 100644
--- a/src/importer/onnx/onnx_importer.h
+++ b/src/importer/onnx/onnx_importer.h
@@ -53,6 +53,7 @@ class onnx_importer
     void convert_op_logical(const onnx::NodeProto &node, const binary_op_t binary_op);
     void convert_op_arg(const onnx::NodeProto &node, reduce_arg_op_t op);
     void convert_op_compare(const onnx::NodeProto &node, const compare_op_t compare_op);
+    void convert_op_compress(const onnx::NodeProto &node);
 
     template <bool global = false>
     void convert_pool(const onnx::NodeProto &node, const reduce_op_t reduce_op, const float initial_value);
diff --git a/src/importer/onnx/opcode.def b/src/importer/onnx/opcode.def
index 4d43f16dfd..935edec33f 100644
--- a/src/importer/onnx/opcode.def
+++ b/src/importer/onnx/opcode.def
@@ -13,6 +13,7 @@ DEFINE_OPCODE(Cast)
 DEFINE_OPCODE(Ceil)
 DEFINE_OPCODE(Celu)
 DEFINE_OPCODE(Clip)
+DEFINE_OPCODE(Compress)
 DEFINE_OPCODE(Concat)
 DEFINE_OPCODE(Constant)
 DEFINE_OPCODE(ConstantOfShape)
@@ -29,20 +30,24 @@ DEFINE_OPCODE(Elu)
 DEFINE_OPCODE(Exp)
 DEFINE_OPCODE(Expand)
 DEFINE_OPCODE(Equal)
+DEFINE_OPCODE(Erf)
 DEFINE_OPCODE(Flatten)
 DEFINE_OPCODE(Floor)
 DEFINE_OPCODE(Gather)
+DEFINE_OPCODE(GatherElements)
 DEFINE_OPCODE(GatherND)
 DEFINE_OPCODE(Gemm)
 DEFINE_OPCODE(GlobalAveragePool)
 DEFINE_OPCODE(GlobalMaxPool)
 DEFINE_OPCODE(Greater)
 DEFINE_OPCODE(GreaterOrEqual)
+DEFINE_OPCODE(GRU)
 DEFINE_OPCODE(Hardmax)
 DEFINE_OPCODE(HardSigmoid)
 DEFINE_OPCODE(HardSwish)
 DEFINE_OPCODE(Identity)
 DEFINE_OPCODE(InstanceNormalization)
+DEFINE_OPCODE(LayerNormalization)
 DEFINE_OPCODE(LpNormalization)
 DEFINE_OPCODE(LeakyRelu)
 DEFINE_OPCODE(Less)
@@ -83,6 +88,7 @@ DEFINE_OPCODE(Resize)
 DEFINE_OPCODE(ReverseSequence)
 DEFINE_OPCODE(RoiAlign)
 DEFINE_OPCODE(Round)
+DEFINE_OPCODE(Rsqrt)
 DEFINE_OPCODE(Selu)
 DEFINE_OPCODE(Shape)
 DEFINE_OPCODE(Sign)
@@ -105,6 +111,7 @@ DEFINE_OPCODE(Tile)
 DEFINE_OPCODE(TopK)
 DEFINE_OPCODE(Transpose)
 DEFINE_OPCODE(Trilu)
+DEFINE_OPCODE(ThresholdedRelu)
 DEFINE_OPCODE(Upsample)
 DEFINE_OPCODE(Unsqueeze)
 DEFINE_OPCODE(Where)
diff --git a/src/importer/onnx/ops/activations.cpp b/src/importer/onnx/ops/activations.cpp
index d6e2b3ed07..1ae5edc323 100644
--- a/src/importer/onnx/ops/activations.cpp
+++ b/src/importer/onnx/ops/activations.cpp
@@ -18,7 +18,9 @@
 #include <nncase/ir/graph.h>
 #include <nncase/ir/ops/binary.h>
 #include <nncase/ir/ops/clamp.h>
+#include <nncase/ir/ops/compare.h>
 #include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/convert.h>
 #include <nncase/ir/ops/reduce.h>
 #include <nncase/ir/ops/sigmoid.h>
 #include <nncase/ir/ops/trilu.h>
@@ -571,4 +573,34 @@ void onnx_importer::convert_op_Softsign(const NodeProto &node)
     input_tensors_.emplace(&abs->input(), input);
     input_tensors_.emplace(&div->input_a(), input);
     output_tensors_.emplace(output, &div->output());
+}
+
+void onnx_importer::convert_op_ThresholdedRelu(const NodeProto &node)
+{
+    // Lowers ThresholdedRelu (y = x if x > alpha else 0) to x * float32(x > alpha).
+    const auto &input = node.input()[0];
+    const auto &output = node.output()[0];
+    auto in_shape = get_shape(input);
+    const auto input_type = get_datatype(input).value();
+    const auto &op_name { generate_name(node) };
+    const auto alpha_value = get_attribute<float>(node, "alpha").value_or(1.0f);
+    auto alpha = graph_.emplace<constant>(alpha_value);
+    alpha->name(op_name + ".alpha(ThresholdedRelu)");
+
+    auto cmp = graph_.emplace<compare>(compare_op_t::compare_greater, input_type, in_shape, alpha->output().shape());
+    cmp->name(op_name + ".greater(ThresholdedRelu)");
+    cmp->input_b().connect(alpha->output());
+
+    auto new_alpha = graph_.emplace<convert>(cmp->output().type(), cmp->output().shape(), dt_float32);
+    new_alpha->name(op_name + ".new_alpha(ThresholdedRelu)");
+    new_alpha->input().connect(cmp->output());
+
+    // With alpha < 0 the op emits negative values in (alpha, 0), so only clamp to >= 0 when alpha >= 0.
+    const auto out_range = alpha_value < 0.f ? value_range<float>::full() : value_range<float>::nonnegative();
+    auto b_max = graph_.emplace<binary>(binary_mul, input_type, in_shape, new_alpha->output().shape(), out_range);
+    b_max->name(op_name + ".mul(ThresholdedRelu)");
+    b_max->input_b().connect(new_alpha->output());
+    input_tensors_.emplace(&cmp->input_a(), input);
+    input_tensors_.emplace(&b_max->input_a(), input);
+    output_tensors_.emplace(output, &b_max->output());
 }
\ No newline at end of file
diff --git a/src/importer/onnx/ops/binary.cpp b/src/importer/onnx/ops/binary.cpp
index 8f1d1f1901..a92c3df60a 100644
--- a/src/importer/onnx/ops/binary.cpp
+++ b/src/importer/onnx/ops/binary.cpp
@@ -74,11 +74,26 @@ void onnx_importer::convert_binary(const onnx::NodeProto &node, const binary_op_
     auto input_a_shape = get_shape(input_a);
     const auto input_type = get_datatype(input_a).value();
     auto input_b_shape = get_shape(input_b);
+    const auto input_b_type = get_datatype(input_b).value();
+    convert *cvt = nullptr;
+    if (input_type != input_b_type)
+    {
+        cvt = graph_.emplace<convert>(input_b_type, input_b_shape, input_type);
+        cvt->name(op_name + "(Convert)");
+    }
     auto op = graph_.emplace<binary>(binary_op, input_type, input_a_shape, input_b_shape, value_range<float>::full());
     op->name(op_name + '(' + binary_op_to_string(binary_op) + ')');
 
     input_tensors_.emplace(&op->input_a(), input_a);
-    input_tensors_.emplace(&op->input_b(), input_b);
+    if (cvt)
+    {
+        input_tensors_.emplace(&cvt->input(), input_b);
+        op->input_b().connect(cvt->output());
+    }
+    else
+    {
+        input_tensors_.emplace(&op->input_b(), input_b);
+    }
     output_tensors_.emplace(output, &op->output());
 }
 
diff --git a/src/importer/onnx/ops/compress.cpp b/src/importer/onnx/ops/compress.cpp
new file mode 100644
index 0000000000..13fea4606e
--- /dev/null
+++ b/src/importer/onnx/ops/compress.cpp
@@ -0,0 +1,49 @@
+/* Copyright 2020 Alexey Chernov <4ernov@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../onnx_importer.h"
+#include <cassert>
+#include <nncase/ir/graph.h>
+#include <nncase/ir/ops/compress.h>
+#include <vector>
+
+using namespace nncase;
+using namespace nncase::importer;
+using namespace nncase::ir;
+using namespace onnx;
+
+void onnx_importer::convert_op_Compress(const NodeProto &node)
+{
+    // Imports ONNX Compress. A missing "axis" is passed on as rank(input); negative axes are normalized.
+    const auto &op_name { generate_name(node) };
+    const auto &input = node.input()[0];
+    const auto &condition = node.input()[1];
+    const auto &output = node.output()[0];
+
+    const auto in_type = get_datatype(input).value();
+    const auto in_shape = get_shape(input);
+    const auto condition_shape = get_shape(condition);
+    const auto out_shape = get_shape(output);
+
+    auto onnx_axis = get_attribute<int>(node, "axis").value_or((int)in_shape.size());
+    if (onnx_axis < 0) // ONNX counts negative axes from the last dimension
+        onnx_axis += (int)in_shape.size();
+
+    auto onnx_compress = graph_.emplace<compress>(in_type, in_shape, condition_shape, out_shape, onnx_axis);
+    onnx_compress->name(op_name);
+
+    input_tensors_.emplace(&onnx_compress->input_at(0), input);
+    input_tensors_.emplace(&onnx_compress->input_at(1), condition);
+    output_tensors_.emplace(output, &onnx_compress->output());
+}
diff --git a/src/importer/onnx/ops/conv.cpp b/src/importer/onnx/ops/conv.cpp
index 34bd36bdc0..f548d514dc 100644
--- a/src/importer/onnx/ops/conv.cpp
+++ b/src/importer/onnx/ops/conv.cpp
@@ -16,6 +16,7 @@
 #include "../onnx_importer.h"
 #include <cassert>
 #include <nncase/ir/graph.h>
+#include <nncase/ir/ir_types.h>
 #include <nncase/ir/op_utils.h>
 #include <nncase/ir/ops/bitcast.h>
 #include <nncase/ir/ops/constant.h>
@@ -176,19 +177,38 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node)
 
     auto input_shape = get_shape(input);
     auto weight_shape = get_shape(weight);
-    auto weight_type = get_datatype(weight).value();
     auto output_shape = get_shape(output);
+    auto input_type = get_datatype(input).value();
+    auto weight_type = get_datatype(weight).value();
+    auto output_type = get_datatype(output).value();
+
+    bool model_3d = input_shape.size() == 3;
 
     // group
     const auto &group_attr = get_attribute<int>(node, "group");
     size_t group = group_attr ? group_attr.value() : 1;
 
-    auto tp = graph_.emplace<transpose>(weight_type, weight_shape, axis_t { 1, 0, 2, 3 });
-    tp->name(op_name + "(Transpose)");
-    auto tp_shape = tp->output().shape();
-    auto bc = graph_.emplace<bitcast>(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], tp_shape[3] });
-    bc->name(op_name + "(Bitcast)");
-    auto bc_shape = bc->output().shape();
+    transpose *tp;
+    bitcast *bc;
+    shape_t bc_shape, tp_shape;
+    if (model_3d)
+    {
+        tp = graph_.emplace<transpose>(weight_type, weight_shape, axis_t { 1, 0, 2 });
+        tp->name(op_name + "(Transpose)");
+        tp_shape = tp->output().shape();
+        bc = graph_.emplace<bitcast>(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], 1 });
+        bc->name(op_name + "(Bitcast)");
+        bc_shape = bc->output().shape();
+    }
+    else
+    {
+        tp = graph_.emplace<transpose>(weight_type, weight_shape, axis_t { 1, 0, 2, 3 });
+        tp->name(op_name + "(Transpose)");
+        tp_shape = tp->output().shape();
+        bc = graph_.emplace<bitcast>(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], tp_shape[3] });
+        bc->name(op_name + "(Bitcast)");
+        bc_shape = bc->output().shape();
+    }
 
     // stride
     std::array<size_t, 2> strides = { 1, 1 };
@@ -244,7 +264,8 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node)
     {
         std::array<int, 2> total_paddings { { 0, 0 } };
         total_paddings[0] = strides[0] * (input_shape[2] - 1) + output_paddings[0] + ((tp_shape[2] - 1) * dilations[0] + 1) - output_shape[2];
-        total_paddings[1] = strides[1] * (input_shape[3] - 1) + output_paddings[1] + ((tp_shape[3] - 1) * dilations[1] + 1) - output_shape[3];
+        if (!model_3d)
+            total_paddings[1] = strides[1] * (input_shape[3] - 1) + output_paddings[1] + ((tp_shape[3] - 1) * dilations[1] + 1) - output_shape[3];
 
         if (pad_mode == "SAME_UPPER")
         {
@@ -269,23 +290,35 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node)
             if (paddings_attr)
             {
                 const auto &paddings_values = paddings_attr.value();
-                if (paddings_values.size() > 1)
+                if (model_3d)
                 {
-                    paddings[0].before = paddings_values[0];
-                    paddings[1].before = paddings_values[1];
+                    if (paddings_values.size() > 1)
+                    {
+                        paddings[0].before = paddings_values[0];
+                        paddings[0].after = paddings_values[1];
+                    }
                 }
-
-                if (paddings_values.size() > 3)
+                else
                 {
-                    paddings[0].after = paddings_values[2];
-                    paddings[1].after = paddings_values[3];
+                    if (paddings_values.size() > 1)
+                    {
+                        paddings[0].before = paddings_values[0];
+                        paddings[1].before = paddings_values[1];
+                    }
+
+                    if (paddings_values.size() > 3)
+                    {
+                        paddings[0].after = paddings_values[2];
+                        paddings[1].after = paddings_values[3];
+                    }
                 }
             }
         }
         else if (pad_mode == "SAME_UPPER")
         {
             paddings[0] = get_windowed_padding(input_shape[2], tp_shape[2], strides[0], dilations[0], true);
-            paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true);
+            if (!model_3d)
+                paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true);
         }
         else if (pad_mode == "SAME_LOWER")
         {
@@ -293,19 +326,46 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node)
             if (paddings[0].before < paddings[0].after)
                 std::swap(paddings[0].before, paddings[0].after);
 
-            paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true);
-            if (paddings[1].before < paddings[1].after)
-                std::swap(paddings[1].before, paddings[1].after);
+            if (!model_3d)
+            {
+                paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true);
+                if (paddings[1].before < paddings[1].after)
+                    std::swap(paddings[1].before, paddings[1].after);
+            }
         }
     }
 
+    // fit 3D input
+    auto data_shape = input_shape;
+    if (model_3d)
+    {
+        paddings[1] = padding::zero();
+        strides[1] = 1;
+        dilations[1] = 1;
+        input_shape.push_back(1);
+
+        output_shape.push_back(1);
+    }
+
     // ConvTranspose
     auto conv_transpose = graph_.emplace<conv2d_transpose>(input_shape, bc_shape, output_shape, group, paddings[0], paddings[1],
         output_paddings[0], output_paddings[1], strides[0], strides[1], dilations[0], dilations[1], value_range<float>::full());
     conv_transpose->name(op_name + "(ConvTranspose)");
 
-    input_tensors_.emplace(&conv_transpose->input(), input);
-    input_tensors_.emplace(&tp->input(), weight);
+    if (model_3d)
+    {
+        auto bitc_data = graph_.emplace<bitcast>(input_type, data_shape, input_shape);
+        conv_transpose->input().connect(bitc_data->output());
+        input_tensors_.emplace(&bitc_data->input(), input);
+
+        input_tensors_.emplace(&tp->input(), weight);
+    }
+    else
+    {
+        input_tensors_.emplace(&conv_transpose->input(), input);
+        input_tensors_.emplace(&tp->input(), weight);
+    }
+
     bc->input().connect(tp->output());
     conv_transpose->weights().connect(bc->output());
     if (node.input().size() > 2)
@@ -319,5 +379,15 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node)
         auto bias = graph_.emplace<constant>(dt_float32, shape, zeros);
         conv_transpose->bias().connect(bias->output());
     }
-    output_tensors_.emplace(output, &conv_transpose->output());
-}
+
+    if (model_3d)
+    {
+        auto bitc_out = graph_.emplace<bitcast>(output_type, conv_transpose->output().shape(), shape_t { conv_transpose->output().shape()[0], conv_transpose->output().shape()[1], conv_transpose->output().shape()[2] });
+        bitc_out->input().connect(conv_transpose->output());
+        output_tensors_.emplace(output, &bitc_out->output());
+    }
+    else
+    {
+        output_tensors_.emplace(output, &conv_transpose->output());
+    }
+}
\ No newline at end of file
diff --git a/src/importer/onnx/ops/expand.cpp b/src/importer/onnx/ops/expand.cpp
index dd219dd114..0871d24a29 100644
--- a/src/importer/onnx/ops/expand.cpp
+++ b/src/importer/onnx/ops/expand.cpp
@@ -36,9 +36,25 @@ void onnx_importer::convert_op_Expand(const NodeProto &node)
 
     auto shape_vec = get_constant_value<int64_t>(node.input()[1]);
     shape_t shape { shape_vec.begin(), shape_vec.end() };
-    auto ones = xt::ones<float>(shape);
-    std::vector<float> ones_vec { ones.begin(), ones.end() };
-    auto con = graph_.emplace<constant>(input_type, shape, ones_vec);
+    constant *con = nullptr;
+    if (input_type == dt_int64)
+    {
+        auto ones = xt::ones<int64_t>(shape);
+        std::vector<int64_t> ones_vec { ones.begin(), ones.end() };
+        con = graph_.emplace<constant>(input_type, shape, ones_vec);
+    }
+    else if (input_type == dt_float32)
+    {
+        auto ones = xt::ones<float>(shape);
+        std::vector<float> ones_vec { ones.begin(), ones.end() };
+        con = graph_.emplace<constant>(input_type, shape, ones_vec);
+    }
+    else if (input_type == dt_uint8)
+    {
+        auto ones = xt::ones<uint8_t>(shape);
+        std::vector<uint8_t> ones_vec { ones.begin(), ones.end() };
+        con = graph_.emplace<constant>(input_type, shape, ones_vec);
+    }
     auto op = graph_.emplace<binary>(binary_mul, input_type, input_shape, shape, value_range<float>::full());
     op->name(generate_name(node) + "(Expand)");
 
diff --git a/src/importer/onnx/ops/gather_elements.cpp b/src/importer/onnx/ops/gather_elements.cpp
new file mode 100644
index 0000000000..b68010af90
--- /dev/null
+++ b/src/importer/onnx/ops/gather_elements.cpp
@@ -0,0 +1,58 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../onnx_importer.h"
+#include <nncase/importer/util.h>
+#include <nncase/ir/ops/convert.h>
+#include <nncase/ir/ops/gather_elements.h>
+
+using namespace nncase;
+using namespace nncase::importer;
+using namespace nncase::ir;
+using namespace onnx;
+
+void onnx_importer::convert_op_GatherElements(const NodeProto &node)
+{
+    // Imports ONNX GatherElements; the IR node is built with int64 indices, so int32 indices get a convert in front.
+    const auto &input = node.input()[0];
+    const auto &indices = node.input()[1];
+    const auto &output = node.output()[0];
+
+    const datatype_t input_type = get_datatype(input).value();
+    const datatype_t indices_type = get_datatype(indices).value();
+    const auto input_shape = get_shape(input);
+    const auto indices_shape = get_shape(indices);
+    const auto out_shape = get_shape(output);
+
+    auto axis = get_attribute<int32_t>(node, "axis").value_or(0);
+    if (axis < 0) // negative axis counts from the last dimension
+        axis += static_cast<int32_t>(input_shape.size());
+
+    auto ga = graph_.emplace<gather_elements>(input_type, dt_int64, input_shape, indices_shape, out_shape, axis);
+    ga->name(generate_name(node)); // name the node so the convert below does not get a bare, repeated name
+
+    auto mid_ptr = &ga->indices(); // connector the ONNX indices tensor is finally attached to
+    if (indices_type == dt_int32)
+    {
+        auto cvt = graph_.emplace<convert>(indices_type, indices_shape, dt_int64);
+        cvt->name(ga->name() + "(cvt_int_to_int64)");
+        ga->indices().connect(cvt->output());
+        mid_ptr = &cvt->input();
+    }
+
+    link_input_tensor(&ga->input(), input);
+    link_input_tensor(mid_ptr, indices);
+    link_output_tensor(output, &ga->output());
+}
\ No newline at end of file
diff --git a/src/importer/onnx/ops/gru.cpp b/src/importer/onnx/ops/gru.cpp
new file mode 100644
index 0000000000..227101fded
--- /dev/null
+++ b/src/importer/onnx/ops/gru.cpp
@@ -0,0 +1,121 @@
+/* Copyright 2020 Alexey Chernov <4ernov@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../onnx_importer.h"
+#include <cassert>
+#include <nncase/ir/graph.h>
+#include <nncase/ir/ops/bitcast.h>
+#include <nncase/ir/ops/gru.h>
+
+using namespace nncase;
+using namespace nncase::importer;
+using namespace nncase::ir;
+using namespace onnx;
+
+void onnx_importer::convert_op_GRU(const NodeProto &node)
+{
+    // Maps an ONNX GRU node onto one ir::gru node:
+    //   - inputs 0..2 (input, W, R) are always wired from the ONNX graph;
+    //   - input 3 (B) and input 5 (initial_h) are optional and replaced by
+    //     zero-filled constants when absent; input 4 is not read here;
+    //   - outputs 0 and 1 are registered only when the node names them.
+    // Shapes of the optional inputs are derived from num_directions, batch_size and hidden_size.
+    const auto &op_name { generate_name(node) };
+
+    // Attributes. Any direction other than "forward"/"reverse" is treated as bidirectional.
+    auto direction_str = get_attribute<std::string>(node, "direction").value_or("forward");
+    lstm_direction direction = direction_str == "forward"
+        ? kForward
+        : (direction_str == "reverse" ? kReverse : kBidirectional);
+    size_t num_directions = direction == kBidirectional ? 2 : 1;
+
+    // linear_before_reset is handed to the IR node as a bool (non-zero means true).
+    auto linear_before_reset = get_attribute<int64_t>(node, "linear_before_reset").value_or(0);
+
+    // Mandatory inputs.
+    auto input_size = node.input_size();
+    assert(input_size >= 3 && input_size <= 8);
+    const auto &input = node.input()[0];
+    const auto &W = node.input()[1];
+    const auto &R = node.input()[2];
+
+    // Element type and shapes of the mandatory inputs.
+    const datatype_t input_type = get_datatype(input).value();
+    const auto &input_shape = get_shape(input);
+    const auto &W_shape = get_shape(W);
+    const auto &R_shape = get_shape(R);
+
+    // Dimensions: seq_length and batch_size are input_shape[0] and input_shape[1];
+    // hidden_size falls back to W_shape[1] / 3 when the attribute is absent.
+    size_t seq_length = input_shape[0];
+    size_t batch_size = input_shape[1];
+    size_t hidden_size = get_attribute<std::int64_t>(node, "hidden_size").value_or(W_shape[1] / 3);
+
+    // Optional bias (input 3); an empty name means it was not provided.
+    shape_t B_shape { num_directions, 6 * hidden_size };
+    std::string B = input_size >= 4 ? node.input()[3] : std::string();
+
+    // Optional initial hidden state (input 5).
+    shape_t initial_shape { num_directions, batch_size, hidden_size };
+    std::string initial_h = input_size >= 6 ? node.input()[5] : std::string();
+
+    // Optional outputs.
+    auto output_size = node.output_size();
+    assert(output_size >= 0 && output_size <= 3);
+    std::string output = output_size >= 1 ? node.output()[0] : std::string();
+    std::string output_h = output_size >= 2 ? node.output()[1] : std::string();
+
+    // Output shape is (seq_length, num_directions, batch_size, hidden_size).
+    shape_t output_shape { seq_length, num_directions, batch_size, hidden_size };
+    auto gru_node = graph_.emplace<gru>(
+        input_shape, W_shape, R_shape, B_shape, output_shape, initial_shape,
+        direction, "onnx", linear_before_reset != 0);
+    gru_node->name(op_name);
+
+    input_tensors_.emplace(&gru_node->input_at(0), input);
+    input_tensors_.emplace(&gru_node->input_at(1), W);
+    input_tensors_.emplace(&gru_node->input_at(2), R);
+
+    // Bias: fall back to an all-zero constant of shape B_shape.
+    if (B.empty())
+    {
+        std::vector<float> zeros(xt::compute_size(B_shape), 0.f);
+        auto zero_bias = graph_.emplace<constant>(input_type, B_shape, zeros);
+        gru_node->b().connect(zero_bias->output());
+    }
+    else
+    {
+        input_tensors_.emplace(&gru_node->input_at(3), B);
+    }
+
+    // Initial hidden state: same fallback, otherwise wired to IR input slot 4.
+    if (initial_h.empty())
+    {
+        std::vector<float> zeros(xt::compute_size(initial_shape), 0.f);
+        auto zero_state = graph_.emplace<constant>(input_type, initial_shape, zeros);
+        gru_node->initial_h().connect(zero_state->output());
+    }
+    else
+    {
+        input_tensors_.emplace(&gru_node->input_at(4), initial_h);
+    }
+
+    // Register only the outputs the ONNX node actually names.
+    if (!output.empty())
+        output_tensors_.emplace(output, &gru_node->output());
+
+    if (!output_h.empty())
+        output_tensors_.emplace(output_h, &gru_node->output_h());
+}
diff --git a/src/importer/onnx/ops/instancenorm.cpp b/src/importer/onnx/ops/instancenorm.cpp
index 80719fcffa..79d35bf23c 100644
--- a/src/importer/onnx/ops/instancenorm.cpp
+++ b/src/importer/onnx/ops/instancenorm.cpp
@@ -19,6 +19,7 @@
 #include <nncase/ir/ops/binary.h>
 #include <nncase/ir/ops/bitcast.h>
 #include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/instancenorm.h>
 #include <nncase/ir/ops/reduce.h>
 #include <nncase/ir/ops/unary.h>
 
@@ -58,67 +59,85 @@ void onnx_importer::convert_op_InstanceNormalization(const NodeProto &node)
     auto bias_constant = graph_.emplace<constant>(get_datatype<float>(), bias_new_shape, bias_value);
     bias_constant->name(op_name + ".bias(InstanceNormalization)");
 
-    // mean
-    axis_t axes;
-    for (size_t i = 2; i < input_shape.size(); i++)
-    {
-        axes.push_back(i);
-    }
-    float init_value = 0.f;
-    bool keepdims = true;
-    auto mean = graph_.emplace<reduce>(reduce_mean, input_type, input_shape, axes, init_value, keepdims);
-    mean->name(op_name + ".reduce_mean(InstanceNormalization)");
-
-    // x - mean
-    auto sub = graph_.emplace<binary>(binary_sub, input_type, input_shape, mean->output().shape(), value_range<float>::full());
-    sub->name(op_name + ".sub(InstanceNormalization)");
-
-    // scale * (x - mean)
-    auto mul = graph_.emplace<binary>(binary_mul, input_type, scale_new_shape, sub->output().shape(), value_range<float>::full());
-    mul->name(op_name + ".mul(InstanceNormalization)");
-
-    // variance
-    auto square = graph_.emplace<unary>(unary_square, sub->output().shape());
-    square->name(op_name + ".square(InstanceNormalization)");
-    auto variance = graph_.emplace<reduce>(reduce_mean, input_type, square->output().shape(), axes, init_value, keepdims);
-    variance->name(op_name + ".reduce(InstanceNormalization)");
-
-    // sqrt(variance + epsilon)
     auto epsilon_attr = get_attribute<float>(node, "epsilon");
     auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f;
-    auto eps_constant = graph_.emplace<constant>(epsilon);
-    eps_constant->name(op_name + ".eps(InstanceNormalization)");
-    auto add_eps = graph_.emplace<binary>(binary_add, input_type, variance->output().shape(), eps_constant->output().shape(), value_range<float>::full());
-    add_eps->name(op_name + ".add(InstanceNormalization)");
-    auto sqrt = graph_.emplace<unary>(unary_sqrt, add_eps->output().shape());
-    sqrt->name(op_name + ".sqrt(InstanceNormalization)");
 
-    // scale * (x - mean) / sqrt(variance + epsilon) + B
-    auto div = graph_.emplace<binary>(binary_div, input_type, mul->output().shape(), sqrt->output().shape(), value_range<float>::full());
-    div->name(op_name + ".scale(InstanceNormalization)");
-    auto add_bias = graph_.emplace<binary>(binary_add, input_type, div->output().shape(), bias_new_shape, value_range<float>::full());
-    add_bias->name(op_name + ".bias(InstanceNormalization)");
-
-    sub->input_b().connect(mean->output());
-
-    mul->input_a().connect(scale_constant->output());
-    mul->input_b().connect(sub->output());
-
-    square->input().connect(sub->output());
-    variance->input().connect(square->output());
-
-    add_eps->input_a().connect(variance->output());
-    add_eps->input_b().connect(eps_constant->output());
-
-    sqrt->input().connect(add_eps->output());
-
-    div->input_a().connect(mul->output());
-    div->input_b().connect(sqrt->output());
-
-    add_bias->input_a().connect(div->output());
-    add_bias->input_b().connect(bias_constant->output());
-
-    input_tensors_.emplace(&mean->input(), input);
-    input_tensors_.emplace(&sub->input_a(), input);
-    output_tensors_.emplace(output, &add_bias->output());
+    auto instance_norm = graph_.emplace<instancenorm>(input_type, input_shape, epsilon);
+    instance_norm->scale().connect(scale_constant->output());
+    instance_norm->bias().connect(bias_constant->output());
+    input_tensors_.emplace(&instance_norm->input(), input);
+    output_tensors_.emplace(output, &instance_norm->output());
+
+    // // mean
+    // axis_t axes;
+    // for (size_t i = 2; i < input_shape.size(); i++)
+    // {
+    //     axes.push_back(i);
+    // }
+    // float init_value = 0.f;
+    // bool keepdims = true;
+    // auto mean = graph_.emplace<reduce>(reduce_mean, input_type, input_shape, axes, init_value, keepdims);
+    // mean->attributes(mean->attributes() | node_attributes::node_attr_skip_quantize);
+    // mean->name(op_name + ".reduce_mean(InstanceNormalization)");
+
+    // // x - mean
+    // auto sub = graph_.emplace<binary>(binary_sub, input_type, input_shape, mean->output().shape(), value_range<float>::full());
+    // sub->attributes(sub->attributes() | node_attributes::node_attr_skip_quantize);
+    // sub->name(op_name + ".sub(InstanceNormalization)");
+
+    // // scale * (x - mean)
+    // auto mul = graph_.emplace<binary>(binary_mul, input_type, scale_new_shape, sub->output().shape(), value_range<float>::full());
+    // mul->attributes(mul->attributes() | node_attributes::node_attr_skip_quantize);
+    // mul->name(op_name + ".mul(InstanceNormalization)");
+
+    // // variance
+    // auto square = graph_.emplace<unary>(unary_square, sub->output().shape());
+    // square->attributes(square->attributes() | node_attributes::node_attr_skip_quantize);
+    // square->name(op_name + ".square(InstanceNormalization)");
+    // auto variance = graph_.emplace<reduce>(reduce_mean, input_type, square->output().shape(), axes, init_value, keepdims);
+    // variance->attributes(variance->attributes() | node_attributes::node_attr_skip_quantize);
+    // variance->name(op_name + ".reduce(InstanceNormalization)");
+
+    // // sqrt(variance + epsilon)
+    // auto epsilon_attr = get_attribute<float>(node, "epsilon");
+    // auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f;
+    // auto eps_constant = graph_.emplace<constant>(epsilon);
+    // eps_constant->name(op_name + ".eps(InstanceNormalization)");
+    // auto add_eps = graph_.emplace<binary>(binary_add, input_type, variance->output().shape(), eps_constant->output().shape(), value_range<float>::full());
+    // add_eps->attributes(add_eps->attributes() | node_attributes::node_attr_skip_quantize);
+    // add_eps->name(op_name + ".add(InstanceNormalization)");
+    // auto sqrt = graph_.emplace<unary>(unary_sqrt, add_eps->output().shape());
+    // sqrt->attributes(sqrt->attributes() | node_attributes::node_attr_skip_quantize);
+    // sqrt->name(op_name + ".sqrt(InstanceNormalization)");
+
+    // // scale * (x - mean) / sqrt(variance + epsilon) + B
+    // auto div = graph_.emplace<binary>(binary_div, input_type, mul->output().shape(), sqrt->output().shape(), value_range<float>::full());
+    // div->attributes(div->attributes() | node_attributes::node_attr_skip_quantize);
+    // div->name(op_name + ".scale(InstanceNormalization)");
+    // auto add_bias = graph_.emplace<binary>(binary_add, input_type, div->output().shape(), bias_new_shape, value_range<float>::full());
+    // add_bias->attributes(add_bias->attributes() | node_attributes::node_attr_skip_quantize);
+    // add_bias->name(op_name + ".bias(InstanceNormalization)");
+
+    // sub->input_b().connect(mean->output());
+
+    // mul->input_a().connect(scale_constant->output());
+    // mul->input_b().connect(sub->output());
+
+    // square->input().connect(sub->output());
+    // variance->input().connect(square->output());
+
+    // add_eps->input_a().connect(variance->output());
+    // add_eps->input_b().connect(eps_constant->output());
+
+    // sqrt->input().connect(add_eps->output());
+
+    // div->input_a().connect(mul->output());
+    // div->input_b().connect(sqrt->output());
+
+    // add_bias->input_a().connect(div->output());
+    // add_bias->input_b().connect(bias_constant->output());
+
+    // input_tensors_.emplace(&mean->input(), input);
+    // input_tensors_.emplace(&sub->input_a(), input);
+    // output_tensors_.emplace(output, &add_bias->output());
 }
diff --git a/src/importer/onnx/ops/layernorm.cpp b/src/importer/onnx/ops/layernorm.cpp
new file mode 100644
index 0000000000..239cbc053a
--- /dev/null
+++ b/src/importer/onnx/ops/layernorm.cpp
@@ -0,0 +1,72 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nncase/ir/ops/layernorm.h"
+#include "../onnx_importer.h"
+#include "nncase/ir/ir_types.h"
+#include <cassert>
+#include <nncase/ir/graph.h>
+#include <nncase/ir/ops/constant.h>
+
+using namespace nncase;
+using namespace nncase::importer;
+using namespace nncase::ir;
+using namespace onnx;
+
+// Imports ONNX LayerNormalization as one layernorm node whose scale and bias inputs are constants.
+void onnx_importer::convert_op_LayerNormalization(const NodeProto &node)
+{
+    assert(node.input().size() >= 2); // X and Scale are required, B is optional
+
+    const auto &op_name { generate_name(node) };
+    const auto &input = node.input()[0];
+    const auto &scale = node.input()[1];
+    const auto &output = node.output()[0]; // bound by reference like the other importers, no string copy
+
+    auto input_shape = get_shape(input);
+    const auto input_type = get_datatype(input).value();
+
+    // Scale is read from an initializer when one exists, otherwise from constant input data.
+    auto scale_initializer = get_initializer(scale);
+    std::vector<float> scale_value = scale_initializer ? to<std::vector<float>>(scale_initializer.value()) : get_constant_input_data<float>(scale).value();
+    auto scale_shape = get_shape(scale);
+    auto scale_constant = graph_.emplace<constant>(get_datatype<float>(), scale_shape, scale_value);
+    scale_constant->name(op_name + ".scale(LayerNormalization)");
+
+    // Without B the bias is all zeros shaped like Scale; with B both its data and its own shape are used.
+    auto bias_shape = scale_shape;
+    std::vector<float> bias_value(xt::compute_size(scale_shape), 0.f);
+    if (node.input().size() > 2)
+    {
+        const auto &bias = node.input()[2];
+        auto bias_initializer = get_initializer(bias);
+        bias_value = bias_initializer ? to<std::vector<float>>(bias_initializer.value()) : get_constant_input_data<float>(bias).value();
+        bias_shape = get_shape(bias); // keep the constant's shape consistent with the data just loaded
+    }
+    auto bias_constant = graph_.emplace<constant>(get_datatype<float>(), bias_shape, bias_value);
+    bias_constant->name(op_name + ".bias(LayerNormalization)");
+
+    auto axis_attr = get_attribute<int>(node, "axis");
+    int32_t axis = axis_attr ? axis_attr.value() : -1; // -1 when the attribute is absent
+    auto epsilon_attr = get_attribute<float>(node, "epsilon");
+    auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f; // 1e-5 when the attribute is absent
+
+    auto ln = graph_.emplace<layernorm>(input_type, input_shape, axis, epsilon);
+    ln->name(op_name + ".layer_norm(LayerNormalization)");
+    input_tensors_.emplace(&ln->input(), input);
+    ln->scale().connect(scale_constant->output());
+    ln->bias().connect(bias_constant->output());
+    output_tensors_.emplace(output, &ln->output());
+}
diff --git a/src/importer/onnx/ops/pool.cpp b/src/importer/onnx/ops/pool.cpp
index 41c01d8002..804af97961 100644
--- a/src/importer/onnx/ops/pool.cpp
+++ b/src/importer/onnx/ops/pool.cpp
@@ -74,6 +74,7 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce
     const auto &output = node.output()[0];
 
     auto input_shape = get_shape(input);
+    auto output_shape = get_shape(output);
     padding_mode pad_mode = padding_mode::notset;
 
     const auto &auto_pad_attr = get_attribute<std::string>(node, "auto_pad");
@@ -82,6 +83,13 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce
         pad_mode = parse_padding_mode(auto_pad_attr.value());
     }
 
+    int ceil_mode = 0;
+    const auto &ceil_mode_attr = get_attribute<int>(node, "ceil_mode");
+    if (ceil_mode_attr)
+    {
+        ceil_mode = static_cast<int>(ceil_mode_attr.value());
+    }
+
     bool count_include_pad = false;
     const auto &count_include_pad_attr = get_attribute<int>(node, "count_include_pad");
     if (count_include_pad_attr)
@@ -133,6 +141,17 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce
     }
     }
 
+    if (ceil_mode)
+    {
+        auto get_input_size = [](int output_size, int k, int s, int p) { return (output_size - 1) * s + k - p; };
+        auto extra_paddg_h = get_input_size(output_shape[2], kernel_shape[0], strides[0], pads[0].sum()) - input_shape[2];
+        if (extra_paddg_h > 0)
+            pads[0].after += extra_paddg_h;
+        auto extra_paddg_w = get_input_size(output_shape[3], kernel_shape[1], strides[1], pads[1].sum()) - input_shape[3];
+        if (extra_paddg_w > 0)
+            pads[1].after += extra_paddg_w;
+    }
+
     auto op = graph_.emplace<reduce_window2d>(reduce_op, move(input_shape), init_value, kernel_shape[0], kernel_shape[1],
         pads[0], pads[1], strides[0], strides[1], dilations[0], dilations[1], value_range<float>::full(), false, count_include_pad);
 
diff --git a/src/importer/onnx/ops/slice.cpp b/src/importer/onnx/ops/slice.cpp
index 3684e56394..ffade78561 100644
--- a/src/importer/onnx/ops/slice.cpp
+++ b/src/importer/onnx/ops/slice.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <nncase/ir/graph.h>
 #include <nncase/ir/ops/slice.h>
+#include <nncase/runtime/datatypes.h>
 #include <vector>
 
 using namespace nncase;
@@ -31,6 +32,23 @@ void onnx_importer::convert_op_Slice(const NodeProto &node)
     const shape_t &input_shape = get_shape(input);
     auto ndim = input_shape.size();
 
+#define GET_ATTRIBUTE(index, dst)                                             \
+    {                                                                         \
+        const std::string &name = node.input()[index];                        \
+        const datatype_t type = get_datatype(name).value();                   \
+                                                                              \
+        if (type == datatype_t::dt_int32)                                     \
+        {                                                                     \
+            auto vec = get_constant_value<int, int32_t>(node.input()[index]); \
+            dst.assign(vec.begin(), vec.end());                               \
+        }                                                                     \
+        else                                                                  \
+        {                                                                     \
+            auto vec = get_constant_value<int, int64_t>(node.input()[index]); \
+            dst.assign(vec.begin(), vec.end());                               \
+        }                                                                     \
+    }
+
     // starts/stops
     axis_t starts, stops;
     bool use_opset_1 = node.input().size() == 1;
@@ -43,11 +61,9 @@ void onnx_importer::convert_op_Slice(const NodeProto &node)
     else
     {
         // opset 10/11/13
-        auto vec = get_constant_value<int, int64_t>(node.input()[1]);
-        starts.assign(vec.begin(), vec.end());
+        GET_ATTRIBUTE(1, starts)
 
-        vec = get_constant_value<int, int64_t>(node.input()[2]);
-        stops.assign(vec.begin(), vec.end());
+        GET_ATTRIBUTE(2, stops)
     }
     assert(starts.size() == stops.size());
     assert(starts.size() <= ndim);
@@ -63,8 +79,7 @@ void onnx_importer::convert_op_Slice(const NodeProto &node)
     }
     else if (node.input().size() >= 4)
     {
-        auto vec = get_constant_value<int, int64_t>(node.input()[3]);
-        axes.assign(vec.begin(), vec.end());
+        GET_ATTRIBUTE(3, axes)
     }
 
     if (axes.empty())
@@ -77,8 +92,7 @@ void onnx_importer::convert_op_Slice(const NodeProto &node)
     axis_t steps;
     if (node.input().size() >= 5)
     {
-        auto vec = get_constant_value<int, int64_t>(node.input()[4]);
-        steps.assign(vec.begin(), vec.end());
+        GET_ATTRIBUTE(4, steps);
         assert(steps.size() == axes.size());
     }
 
diff --git a/src/importer/onnx/ops/unary.cpp b/src/importer/onnx/ops/unary.cpp
index f2b6a32c55..724a037da1 100644
--- a/src/importer/onnx/ops/unary.cpp
+++ b/src/importer/onnx/ops/unary.cpp
@@ -129,11 +129,21 @@ void onnx_importer::convert_op_Sqrt(const onnx::NodeProto &node)
     convert_unary(node, unary_sqrt);
 }
 
+void onnx_importer::convert_op_Rsqrt(const onnx::NodeProto &node)
+{
+    convert_unary(node, unary_rsqrt); // delegates to the shared unary lowering with the rsqrt opcode
+}
+
 void onnx_importer::convert_op_Tanh(const onnx::NodeProto &node)
 {
     convert_unary(node, unary_tanh);
 }
 
+void onnx_importer::convert_op_Erf(const onnx::NodeProto &node)
+{
+    convert_unary(node, unary_erf); // delegates to the shared unary lowering with the erf opcode
+}
+
 void onnx_importer::convert_unary(const onnx::NodeProto &node, const unary_op_t unary_op)
 {
     assert(node.input().size() == 1);
diff --git a/src/importer/onnx/ops/where.cpp b/src/importer/onnx/ops/where.cpp
index 49c66cfcf3..f86985e5cb 100644
--- a/src/importer/onnx/ops/where.cpp
+++ b/src/importer/onnx/ops/where.cpp
@@ -35,10 +35,9 @@ void onnx_importer::convert_op_Where(const onnx::NodeProto &node)
     const auto &input_c = node.input()[2];
     const auto &output = node.output()[0];
 
-    quant_param_t qparam { 0, 1.f };
     datatype_t dtype = dt_float32;
-    auto deq_a = graph_.emplace<dequantize>(get_datatype(input_a).value(), get_shape(input_a), dtype, qparam);
-    deq_a->name(op_name + "/deq_a");
+    auto deq_a = graph_.emplace<convert>(get_datatype(input_a).value(), get_shape(input_a), dtype);
+    deq_a->name(op_name + "/cvt");
 
     auto op = graph_.emplace<ternary>(dtype, get_datatype(input_b).value(), deq_a->output().shape(), get_shape(input_b), get_shape(input_c));
     op->name(op_name + "/ternary");
diff --git a/src/importer/tflite/CMakeLists.txt b/src/importer/tflite/CMakeLists.txt
index 1be871d3c6..1439d623d9 100644
--- a/src/importer/tflite/CMakeLists.txt
+++ b/src/importer/tflite/CMakeLists.txt
@@ -37,7 +37,7 @@ set(SRCS tflite_importer.cpp
          ops/split.cpp)
 
 add_library(tflite_importer OBJECT ${SRCS})
-target_link_libraries(tflite_importer PUBLIC ir flatbuffers)
+target_link_libraries(tflite_importer PUBLIC ir flatbuffers::flatbuffers)
 target_include_directories(tflite_importer PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_dependencies(tflite_importer TFLITE_FB)
 set_target_properties(tflite_importer PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/src/importer/tflite/ops/custom.cpp b/src/importer/tflite/ops/custom.cpp
index 2637db2557..3ef635d077 100644
--- a/src/importer/tflite/ops/custom.cpp
+++ b/src/importer/tflite/ops/custom.cpp
@@ -15,6 +15,7 @@
 #include "../tflite_importer.h"
 #include <flatbuffers/flexbuffers.h>
 #include <nncase/ir/ops/random_uniform.h>
+#include <nncase/ir/ops/tflite_detection_postprocess.h>
 
 using namespace nncase;
 using namespace nncase::importer;
@@ -43,6 +44,53 @@ DEFINE_TFLITE_LOWER(CUSTOM)
         node->name(output.name()->string_view());
         link_output_tensor(op.outputs()->Get(0), &node->output());
     }
+    else if (custom_code == "TFLite_Detection_PostProcess")
+    {
+        auto &input_decoded_boxes = get_tensor(op.inputs(), 0);
+        auto &input_scores = get_tensor(op.inputs(), 1);
+        auto &input_anchors = get_tensor(op.inputs(), 2);
+
+        // NOTE: get_shape(output_x.shape()) returns incorrect shapes for these outputs; that is ignored here and fixed later in a separate transform
+        auto &output_locations = get_tensor(op.outputs(), 0); //detection_boxes   (1, num_detected_boxes, 4)
+        auto &output_classes = get_tensor(op.outputs(), 1); //detection_classes (1, num_detected_boxes)
+        auto &output_scores = get_tensor(op.outputs(), 2); //detection_scores  (1, num_detected_boxes)
+        auto &output_num_detections = get_tensor(op.outputs(), 3); //num_detections    (1)
+
+        auto custom_options = op.custom_options();
+
+        const auto &m = flexbuffers::GetRoot(custom_options->data(), custom_options->size()).AsMap();
+        auto max_detections = m["max_detections"].AsInt32();
+        auto max_classes_per_detection = m["max_classes_per_detection"].AsInt32();
+
+        int32_t detections_per_class = 100;
+        if (!m["detections_per_class"].IsNull())
+            detections_per_class = m["detections_per_class"].AsInt32();
+
+        bool use_regular_non_max_suppression = false;
+        if (!m["use_regular_nms"].IsNull())
+            use_regular_non_max_suppression = m["use_regular_nms"].AsBool();
+
+        auto non_max_suppression_score_threshold = m["nms_score_threshold"].AsFloat();
+        auto intersection_over_union_threshold = m["nms_iou_threshold"].AsFloat();
+        auto num_classes = m["num_classes"].AsInt32();
+        auto y = m["y_scale"].AsFloat();
+        auto x = m["x_scale"].AsFloat();
+        auto h = m["h_scale"].AsFloat();
+        auto w = m["w_scale"].AsFloat();
+
+        auto node = graph_.emplace<tflite_detection_postprocess>(get_shape(input_decoded_boxes.shape()), get_shape(input_scores.shape()), get_shape(input_anchors.shape()),
+            get_shape(output_locations.shape()), get_shape(output_classes.shape()), get_shape(output_scores.shape()), get_shape(output_num_detections.shape()),
+            max_detections, max_classes_per_detection, detections_per_class, use_regular_non_max_suppression, non_max_suppression_score_threshold,
+            intersection_over_union_threshold, num_classes, y, x, h, w);
+
+        link_input_tensor(&node->boxes(), op.inputs()->Get(0));
+        link_input_tensor(&node->scores(), op.inputs()->Get(1));
+        link_input_tensor(&node->anchors(), op.inputs()->Get(2));
+        link_output_tensor(op.outputs()->Get(0), &node->output_locations());
+        link_output_tensor(op.outputs()->Get(1), &node->output_classes());
+        link_output_tensor(op.outputs()->Get(2), &node->output_scores());
+        link_output_tensor(op.outputs()->Get(3), &node->output_num_detections());
+    }
     else
     {
         throw std::runtime_error(std::string("Unsupported tflite CUSTOM code: ") + custom_code);
diff --git a/src/importer/tflite/ops/quantize.cpp b/src/importer/tflite/ops/quantize.cpp
index 9c446193e8..46fa70f2de 100644
--- a/src/importer/tflite/ops/quantize.cpp
+++ b/src/importer/tflite/ops/quantize.cpp
@@ -27,30 +27,12 @@ DEFINE_TFLITE_LOWER(QUANTIZE)
     auto &input = get_tensor(op.inputs(), 0);
     auto &output = get_tensor(op.outputs(), 0);
 
-    [[maybe_unused]] dequantize *deq;
-
-    auto tp1 = graph_.emplace<transpose>(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 });
-    tp1->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/pre_trans");
-    auto mid_output = &tp1->output();
-    if (input.type() != tflite::TensorType_FLOAT32)
-    {
-        deq = graph_.emplace<dequantize>(tp1->output().type(), tp1->output().shape(), dt_float32,
-            to_quant_param(input.quantization()));
-        deq->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/deq");
-        mid_output = &deq->output();
-        deq->input().connect(tp1->output());
-    }
-
-    auto q = graph_.emplace<quantize>(dt_float32, mid_output->shape(), to_data_type(output.type()),
+    auto q = graph_.emplace<quantize>(to_data_type(input.type()), get_shape(input.shape()), to_data_type(output.type()),
         to_quant_param(output.quantization()));
     q->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/q");
-    auto tp2 = graph_.emplace<transpose>(q->output().type(), q->output().shape(), axis_t { 0, 2, 3, 1 });
-    tp2->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/trans");
 
-    q->input().connect(*mid_output);
-    tp2->input().connect(q->output());
-    link_input_tensor(&tp1->input(), op.inputs()->Get(0));
-    link_output_tensor(op.outputs()->Get(0), &tp2->output());
+    link_input_tensor(&q->input(), op.inputs()->Get(0));
+    link_output_tensor(op.outputs()->Get(0), &q->output());
 }
 
 DEFINE_TFLITE_LOWER(FAKE_QUANT)
@@ -71,34 +53,12 @@ DEFINE_TFLITE_LOWER(DEQUANTIZE)
     auto &input = get_tensor(op.inputs(), 0);
     auto &output = get_tensor(op.outputs(), 0);
 
-    [[maybe_unused]] dequantize *deq;
-    [[maybe_unused]] quantize *q;
-
-    auto tp1 = graph_.emplace<transpose>(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 });
-    tp1->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/pre_trans");
-    auto mid_output = &tp1->output();
-    //    auto mid_input = &tp1->output();
-    if (input.type() != tflite::TensorType_FLOAT32)
+    if (op.outputs()->size() != 0)
     {
-        deq = graph_.emplace<dequantize>(tp1->output().type(), tp1->output().shape(), dt_float32,
+        auto deq = graph_.emplace<dequantize>(to_data_type(input.type()), get_shape(input.shape()), to_data_type(output.type()),
             to_quant_param(input.quantization()));
         deq->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/deq");
-        mid_output = &deq->output();
-        deq->input().connect(tp1->output());
+        link_input_tensor(&deq->input(), op.inputs()->Get(0));
+        link_output_tensor(op.outputs()->Get(0), &deq->output());
     }
-
-    if (output.type() != tflite::TensorType_FLOAT32)
-    {
-        q = graph_.emplace<quantize>(dt_float32, mid_output->shape(), to_data_type(output.type()),
-            to_quant_param(output.quantization()));
-        q->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/q");
-        mid_output = &q->output();
-        q->input().connect(tp1->output());
-    }
-    auto tp2 = graph_.emplace<transpose>(mid_output->type(), mid_output->shape(), axis_t { 0, 2, 3, 1 });
-    tp2->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/trans");
-
-    tp2->input().connect(*mid_output);
-    link_input_tensor(&tp1->input(), op.inputs()->Get(0));
-    link_output_tensor(op.outputs()->Get(0), &tp2->output());
 }
\ No newline at end of file
diff --git a/src/importer/tflite/ops/space_to_batch.cpp b/src/importer/tflite/ops/space_to_batch.cpp
index ae9e1edad0..caba5f32a3 100644
--- a/src/importer/tflite/ops/space_to_batch.cpp
+++ b/src/importer/tflite/ops/space_to_batch.cpp
@@ -86,6 +86,7 @@ DEFINE_TFLITE_LOWER(SPACE_TO_BATCH_ND)
     else
     {
         block_size_w = block_shape.data()[1];
+        real_block_size_w = block_shape.data()[1];
         tp1 = graph_.emplace<transpose>(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 });
         input_conn = &tp1->input();
     }
diff --git a/src/importer/tflite/ops/unary.cpp b/src/importer/tflite/ops/unary.cpp
index d52e9eba9a..2722b6c648 100644
--- a/src/importer/tflite/ops/unary.cpp
+++ b/src/importer/tflite/ops/unary.cpp
@@ -63,22 +63,7 @@ DEFINE_TFLITE_LOWER(ROUND)
 
 DEFINE_TFLITE_LOWER(RSQRT)
 {
-    auto &input = get_tensor(op.inputs(), 0);
-
-    auto one = graph_.emplace<constant>(1.f);
-    auto sqrt = graph_.emplace<unary>(unary_sqrt, get_shape(input.shape()));
-    auto div = graph_.emplace<binary>(binary_div, to_data_type(input.type()), one->output().shape(), sqrt->output().shape(), value_range<float>::full());
-
-    auto name = std::string(get_tensor(op.outputs(), 0).name()->string_view());
-    one->name(name);
-    sqrt->name(name);
-    div->name(name);
-
-    div->input_a().connect(one->output());
-    div->input_b().connect(sqrt->output());
-
-    link_input_tensor(&sqrt->input(), op.inputs()->Get(0));
-    link_output_tensor(op.outputs()->Get(0), &div->output());
+    convert_unary(op, unary_rsqrt);
 }
 
 DEFINE_TFLITE_LOWER(SIN)
diff --git a/src/ir/graph.cpp b/src/ir/graph.cpp
index e0efc5b462..310a5b3d0e 100644
--- a/src/ir/graph.cpp
+++ b/src/ir/graph.cpp
@@ -130,7 +130,7 @@ void graph::dce()
     nodes_.erase(end, std::end(nodes_));
 }
 
-split_graph_result graph::split_subgraph(std::span<node *const> nodes)
+split_graph_result graph::split_subgraph(std::span<node *const> nodes, bool reorder_input)
 {
     split_graph_result result;
     result.subgraph = std::make_unique<graph>(nodes.front()->module_type());
@@ -148,9 +148,38 @@ split_graph_result graph::split_subgraph(std::span<node *const> nodes)
         }
     }
 
+#define ADD_INODE                                                               \
+    auto inode = result.subgraph->emplace<input_node>(in->type(), in->shape()); \
+    inode->name(in->connection()->owner().name());                              \
+    inode->module_type(in->owner().module_type());                              \
+    result.inputs.emplace(inode, in->connection());                             \
+    inputs.emplace(in->connection(), inode);                                    \
+    in->connect(inode->output());
+
+#define ADD_ONODE                                                                  \
+    auto onode = result.subgraph->emplace<output_node>(out->type(), out->shape()); \
+    onode->name(out->owner().name());                                              \
+    onode->module_type(out->owner().module_type());                                \
+                                                                                   \
+    for (auto in : dup(conns))                                                     \
+    {                                                                              \
+        if (!subgraph_nodes.contains(&in->owner()))                                \
+        {                                                                          \
+            result.outputs[onode].emplace_back(in);                                \
+            in->clear_connection();                                                \
+        }                                                                          \
+    }                                                                              \
+                                                                                   \
+    out->connect(onode->input());
+
     // 2. Find in/out connectors
     std::unordered_set<output_connector *> outputs;
     std::unordered_map<output_connector *, input_node *> inputs;
+    std::vector<input_connector *> graph_inputs;
+    std::vector<input_connector *> remained_inputs;
+    std::vector<size_t> input_order;
+    std::vector<output_connector *> graph_outputs;
+    std::vector<size_t> output_order;
     for (auto node : nodes)
     {
         for (auto in : node->inputs())
@@ -159,16 +188,22 @@ split_graph_result graph::split_subgraph(std::span<node *const> nodes)
             {
                 if (outputs.emplace(in->connection()).second)
                 {
-                    auto inode = result.subgraph->emplace<input_node>(in->type(), in->shape());
-                    inode->name(in->connection()->owner().name());
-                    inode->module_type(node->module_type());
-                    result.inputs.emplace(inode, in->connection());
-                    inputs.emplace(in->connection(), inode);
-                    in->connect(inode->output());
+                    if (reorder_input && node_cast<input_node>(in->connection()->owner()))
+                    {
+                        graph_inputs.push_back(in);
+                        input_order.push_back(std::distance(inputs_.begin(), std::find(inputs_.begin(), inputs_.end(), node_cast<input_node>(in->connection()->owner()))));
+                    }
+                    else
+                    {
+                        ADD_INODE
+                    }
                 }
                 else
                 {
-                    in->connect(inputs.at(in->connection())->output());
+                    if (reorder_input && node_cast<input_node>(in->connection()->owner()))
+                        remained_inputs.push_back(in);
+                    else
+                        in->connect(inputs.at(in->connection())->output());
                 }
             }
         }
@@ -178,24 +213,49 @@ split_graph_result graph::split_subgraph(std::span<node *const> nodes)
             auto conns = out->connections();
             if (std::any_of(conns.begin(), conns.end(), [&](input_connector *in) { return !subgraph_nodes.contains(&in->owner()); }))
             {
-                auto onode = result.subgraph->emplace<output_node>(out->type(), out->shape());
-                onode->name(out->owner().name());
-                onode->module_type(node->module_type());
-
-                for (auto in : dup(conns))
+                auto it = std::find_if(conns.begin(), conns.end(), [&](input_connector *in) { return node_cast<output_node>(in->owner()); });
+                if (it != conns.end())
                 {
-                    if (!subgraph_nodes.contains(&in->owner()))
-                    {
-                        result.outputs[onode].emplace_back(in);
-                        in->clear_connection();
-                    }
+                    graph_outputs.push_back(out);
+                    output_order.push_back(std::distance(outputs_.begin(), std::find(outputs_.begin(), outputs_.end(), node_cast<output_node>((*it)->owner()))));
+                }
+                else
+                {
+                    ADD_ONODE
                 }
-
-                out->connect(onode->input());
             }
         }
     }
 
+    auto sort_indexes = [](const std::vector<size_t> &v) {
+        std::vector<size_t> idx(v.size());
+        iota(idx.begin(), idx.end(), 0);
+
+        stable_sort(idx.begin(), idx.end(),
+            [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; });
+
+        return idx;
+    };
+
+    auto ordered_in_indexes = sort_indexes(input_order);
+    for (auto idx : ordered_in_indexes)
+    {
+        auto in = graph_inputs[idx];
+        ADD_INODE
+    }
+    for (auto &in : remained_inputs)
+        in->connect(inputs.at(in->connection())->output());
+
+    auto ordered_out_indexes = sort_indexes(output_order);
+    for (auto idx : ordered_out_indexes)
+    {
+        auto out = graph_outputs[idx];
+        auto conns = out->connections();
+        ADD_ONODE
+    }
+
+#undef ADD_INODE // both helper macros are local to split_subgraph; undefine them so neither leaks into the rest of the file
+#undef ADD_ONODE
     return result;
 }
 
diff --git a/src/ir/graph.partition.cpp b/src/ir/graph.partition.cpp
index c48f79e695..faea8c21eb 100644
--- a/src/ir/graph.partition.cpp
+++ b/src/ir/graph.partition.cpp
@@ -17,6 +17,7 @@
 #include <nncase/ir/ops/constant.h>
 #include <nncase/ir/visitor.h>
 #include <nncase/runtime/stackvm/runtime_module.h>
+#include <queue>
 #include <unordered_set>
 
 using namespace nncase;
@@ -32,6 +33,7 @@ struct region
     std::unordered_set<node *> nodes_set;
     std::unordered_set<input_connector *> region_inputs;
     std::unordered_set<output_connector *> outputs;
+    std::unordered_map<output_connector *, int> need_remove_outputs;
 
     region(module_type_t module_type)
         : module_type(module_type)
@@ -47,14 +49,33 @@ struct region
                 region_inputs.emplace(in);
             for (auto out : n.outputs())
                 outputs.emplace(out);
+
             for (auto it = region_inputs.begin(); it != region_inputs.end();)
             {
                 if (outputs.contains((*it)->connection()))
+                {
+                    if (need_remove_outputs.find((*it)->connection()) != need_remove_outputs.end())
+                        need_remove_outputs.at((*it)->connection()) -= 1;
+                    else
+                        need_remove_outputs.emplace((*it)->connection(),
+                            (*it)->connection()->connections().size() - 1);
                     it = region_inputs.erase(it);
+                }
                 else
                     ++it;
             }
 
+            for (auto it = need_remove_outputs.begin(); it != need_remove_outputs.end();)
+            {
+                if (it->second == 0)
+                {
+                    outputs.erase(it->first);
+                    it = need_remove_outputs.erase(it);
+                }
+                else
+                    it++;
+            }
+
             if (is_all_noaction && n.attributes() & node_attr_action)
                 is_all_noaction = false;
             return true;
@@ -101,6 +122,115 @@ struct region
     }
 };
 
+typedef struct Region_node
+{
+    std::list<region>::iterator node; // region this tree node stands for
+    Region_node *parent = nullptr; // node that create_tree attached this one to
+    Region_node *child = nullptr; // subtree of the first producer region found by create_tree
+    Region_node *bro = nullptr; // sibling link
+} Region_node, *Region_Tree; // Region_Tree is an alias for Region_node *
+
+// Tree of producer regions explored upstream from a start region (itb) while looking for a target region (ita).
+class Region_tree
+{
+public:
+    Region_tree(std::list<region> &rg)
+        : regions_(rg) { }
+    // Builds the subtree for new_node: one node for it plus one subtree per region producing one of its region_inputs.
+    Region_node *create_tree(std::list<region>::iterator new_node, int depth)
+    {
+        Region_node *root = create_node();
+        root->node = new_node;
+
+        // found a path from itb --> ita: remember where it ends
+        if (new_node == target_region_)
+        {
+            leaves_.push_back(root);
+            return root;
+        }
+
+        // limit tree depth
+        if (depth >= 10)
+        {
+            skip_ = true;
+            return root;
+        }
+
+        for (auto it : new_node->region_inputs)
+        {
+            for (auto itb = regions_.begin(); itb != regions_.end(); itb++)
+            {
+                if (itb->outputs.contains(it->connection()))
+                {
+                    if (root->child == nullptr)
+                    {
+                        root->child = create_tree(itb, depth + 1);
+                        root->child->parent = root;
+                    }
+                    else
+                    {
+                        root->bro = create_tree(itb, depth);
+                        root->bro->parent = root;
+                        root->bro = root->bro->bro; // root keeps no link to the new subtree; its nodes still point back via parent
+                    }
+                }
+            }
+        }
+
+        return root;
+    }
+
+    // Returns false if the depth limit was hit or a leaf-to-root path crosses a stackvm region that is not all no-action.
+    bool not_have_circle()
+    {
+        // if tree depth > 10, ignore merge itb--> ita
+        if (skip_)
+            return false;
+        // each leaf has only one path to root.
+        // if all the paths of leaves to root don't have CPU op ,itb can merge to ita.
+        for (auto it : leaves_)
+        {
+            auto condition_ptr = it->parent;
+            if (condition_ptr->node == start_region_)
+                continue;
+            while (condition_ptr != nullptr)
+            {
+                if (condition_ptr->node->module_type == runtime::stackvm::stackvm_module_type && !condition_ptr->node->is_all_noaction)
+                    return false;
+                condition_ptr = condition_ptr->parent;
+            }
+        }
+        return true;
+    }
+
+    void set_label_region(std::list<region>::iterator ita, std::list<region>::iterator itb)
+    {
+        start_region_ = itb; // region the tree is rooted at
+        target_region_ = ita; // region whose reachability is being tested
+    }
+
+    // Releases every node created so far, including subtrees no longer reachable through child/bro links.
+    void free_tree([[maybe_unused]] Region_node *root)
+    {
+        leaves_.clear(); // leaves point into nodes_, so drop them first
+        nodes_.clear();
+    }
+
+private:
+    Region_node *create_node()
+    {
+        nodes_.emplace_back();
+        return &nodes_.back();
+    }
+
+    std::list<region>::iterator start_region_;
+    std::list<region>::iterator target_region_;
+    std::vector<Region_node *> leaves_;
+    std::list<Region_node> nodes_; // owns every node; std::list keeps their addresses stable
+    bool skip_ = false; // set once create_tree hits the depth limit; was read uninitialised before
+    std::list<region> &regions_;
+};
+
 class graph_merger
 {
 public:
@@ -133,6 +263,13 @@ class graph_merger
             for (auto in : node.inputs())
             {
                 auto &conn = in->connection()->owner();
+
+                if (conn.runtime_opcode() == op_constant)
+                {
+                    last_region = nullptr;
+                    break;
+                }
+
                 auto it = node_to_region_.find(&conn);
                 if (it != node_to_region_.end())
                 {
@@ -181,7 +318,44 @@ class graph_merger
             changed |= merge_child_region();
             changed |= merge_parent_region();
             changed |= merge_same_input_region();
+
         } while (changed);
+
+        do
+        {
+            changed = false;
+            changed |= merge_child_region_stage_2();
+        } while (changed);
+    }
+
+    bool check_circle(std::list<region>::iterator ita, std::list<region>::iterator itb)
+    {
+        // Returns true when itb may be merged into ita (despite the name, true means "no circle").
+        // Fast path 1: every consumer of every ita output is one of itb's region inputs.
+        bool all_consumed_by_itb = true;
+        for (auto out : ita->outputs)
+        {
+            const bool consumed = std::all_of(out->connections().begin(), out->connections().end(),
+                [&](input_connector *consumer) {
+                    return itb->region_inputs.contains(consumer);
+                });
+            if (!consumed)
+                all_consumed_by_itb = false;
+        }
+        if (all_consumed_by_itb)
+            return true;
+
+        // Fast path 2: itb has exactly one region input.
+        if (itb->region_inputs.size() == 1)
+            return true;
+
+        // Slow path: build the region tree starting at itb and let it decide.
+        auto tree = std::make_shared<Region_tree>(regions_);
+        tree->set_label_region(ita, itb);
+        auto root = tree->create_tree(itb, 0);
+        const auto no_circle = tree->not_have_circle();
+        tree->free_tree(root); // explicitly delete the nodes reachable from root
+        return no_circle;
     }
 
     bool merge_child_region()
@@ -224,6 +398,47 @@ class graph_merger
         return ever_changed;
     }
 
+    bool merge_child_region_stage_2()
+    {
+        bool ever_changed = false; // becomes true once any merge happened during this call
+        bool changed;
+        do
+        {
+            changed = false;
+            for (auto ita = regions_.begin(); ita != regions_.end(); ++ita)
+            {
+                std::vector<std::list<region>::iterator> to_be_merge; // regions selected to be folded into ita
+                for (auto itb = regions_.begin(); itb != regions_.end(); ++itb)
+                {
+                    // Skip self-pairs and pairs in which both regions are stackvm regions.
+                    if (ita == itb
+                        || (ita->module_type == runtime::stackvm::stackvm_module_type
+                            && itb->module_type == runtime::stackvm::stackvm_module_type))
+                        continue;
+
+                    // Candidate: same module type (or itb is all no-action), itb consumes an ita output, and check_circle() approves.
+                    if ((ita->module_type == itb->module_type || itb->is_all_noaction)
+                        && std::any_of(itb->region_inputs.begin(), itb->region_inputs.end(), [&](input_connector *in) { return ita->outputs.contains(in->connection()); })
+                        && check_circle(ita, itb))
+                        to_be_merge.emplace_back(itb);
+                }
+
+                if (!to_be_merge.empty())
+                {
+                    for (auto region : to_be_merge) // note: this local shadows the type name `region`
+                    {
+                        ita->merge(*region);
+                        regions_.erase(region); // erasing from the list; ita is never in to_be_merge (ita == itb is skipped above)
+                    }
+
+                    changed = ever_changed = true;
+                    break; // leave the ita loop; the enclosing do/while rescans from regions_.begin()
+                }
+            }
+        } while (changed);
+        return ever_changed;
+    }
+
     bool merge_parent_region()
     {
         bool ever_changed = false;
@@ -314,6 +529,8 @@ class graph_merger
 
     void add_node_to_region(region &region, node &node)
     {
+        if (node.module_type() != runtime::stackvm::stackvm_module_type) // a non-stackvm node overwrites the region's module type; stackvm nodes leave it as is
+            region.module_type = node.module_type();
         region.add_node(node);
         node_to_region_.emplace(&node, &region);
     }
@@ -337,7 +554,7 @@ void graph::merge_module_regions()
         if (region.module_type == runtime::stackvm::stackvm_module_type)
             continue;
 
-        auto split = split_subgraph(region.nodes);
+        auto split = split_subgraph(region.nodes, true);
         auto &subg = add_subgraph(std::move(split.subgraph));
         auto c = emplace<call>(subg);
         c->name(std::string(region.module_type.data()) + "_" + std::to_string(subids[region.module_type.data()]++));
diff --git a/src/ir/ops/CMakeLists.txt b/src/ir/ops/CMakeLists.txt
index 82829287c6..f8125e4e57 100644
--- a/src/ir/ops/CMakeLists.txt
+++ b/src/ir/ops/CMakeLists.txt
@@ -1,4 +1,4 @@
-﻿cmake_minimum_required (VERSION 3.13)
+﻿cmake_minimum_required(VERSION 3.13)
 
 target_sources(ir PRIVATE
     call.cpp
@@ -45,4 +45,11 @@ target_sources(ir PRIVATE
     softmax.cpp
     ternary.cpp
     topk.cpp
-    trilu.cpp)
+    trilu.cpp
+    gru.cpp
+    tflite_detection_postprocess.cpp
+    gather_elements.cpp
+    layernorm.cpp
+    compress.cpp
+    instancenorm.cpp
+)
diff --git a/src/ir/ops/compress.cpp b/src/ir/ops/compress.cpp
new file mode 100644
index 0000000000..065aad6187
--- /dev/null
+++ b/src/ir/ops/compress.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/compress.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+compress::compress(datatype_t type, shape_t input_shape, shape_t condition_shape, shape_t output_shape, int32_t axis)
+    : axis_(normalize_axis(input_shape, axis)) // axis is stored after normalize_axis() against input_shape
+{
+    add_input("input", type, input_shape);
+    add_input("condition", dt_uint8, condition_shape); // the condition input is always registered as dt_uint8
+    // The output reuses the input element type; its shape is supplied by the caller.
+    add_output("output", type, output_shape);
+}
+
+bool compress::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<compress &>(other); // unchecked downcast, as in the other ops
+    return rhs.axis() == axis(); // axis is the only property compared
+}
diff --git a/src/ir/ops/fused_unary.cpp b/src/ir/ops/fused_unary.cpp
index e600ef9bb1..56d80b93dd 100644
--- a/src/ir/ops/fused_unary.cpp
+++ b/src/ir/ops/fused_unary.cpp
@@ -140,6 +140,9 @@ void fused_unary::compile_graph(const std::vector<fused_unary_op> &subgraph, cod
             case unary_logical_not:
                 builder.emit_logical_not();
                 break;
+            case unary_erf:
+                builder.emit_erf();
+                break;
             default:
                 throw std::invalid_argument("Unsupported unary op for nnil: " + (std::string)magic_enum::enum_name(op.unary.unary_op));
             }
diff --git a/src/ir/ops/gather_elements.cpp b/src/ir/ops/gather_elements.cpp
new file mode 100644
index 0000000000..a678ceac4f
--- /dev/null
+++ b/src/ir/ops/gather_elements.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/gather_elements.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+gather_elements::gather_elements(datatype_t in_type, datatype_t indices_type, shape_t input_shape, shape_t indices_shape, shape_t output_shape, int32_t axis)
+    : axis_(axis) // NOTE(review): stored as given, without normalize_axis() — confirm negative axes are handled by consumers
+{
+    add_input("input", in_type, input_shape);
+    add_input("indices", indices_type, indices_shape); // indices carry their own element type
+    add_output("output", in_type, output_shape); // output has the input's element type and the caller-provided shape
+}
+
+bool gather_elements::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<gather_elements &>(other); // unchecked downcast
+    return rhs.axis() == axis(); // axis is the only property compared
+}
diff --git a/src/ir/ops/gru.cpp b/src/ir/ops/gru.cpp
new file mode 100644
index 0000000000..334ed8a109
--- /dev/null
+++ b/src/ir/ops/gru.cpp
@@ -0,0 +1,40 @@
+/* Copyright 2019-2020 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/gru.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+gru::gru(shape_t input_shape, shape_t w_shape, shape_t r_shape, shape_t b_shape, shape_t output_shape,
+    shape_t output_h_shape, lstm_direction direction, std::string framework, bool linear_before_reset)
+    : direction_(direction), framework_(framework), linear_before_reset_(linear_before_reset)
+{
+    add_input("input", dt_float32, input_shape); // every port of this op is registered as dt_float32
+    add_input("w", dt_float32, w_shape);
+    add_input("r", dt_float32, r_shape);
+    add_input("b", dt_float32, b_shape);
+    add_input("initial_h", dt_float32, output_h_shape); // initial_h shares its shape with the output_h output
+    // Outputs: "output" uses output_shape, "output_h" uses output_h_shape.
+    add_output("output", dt_float32, output_shape);
+    add_output("output_h", dt_float32, output_h_shape);
+}
+
+bool gru::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<gru &>(other); // unchecked downcast
+    return rhs.direction() == direction() && rhs.framework() == framework() && rhs.linear_before_reset() == linear_before_reset();
+}
diff --git a/src/ir/ops/instancenorm.cpp b/src/ir/ops/instancenorm.cpp
new file mode 100644
index 0000000000..7d20bb0add
--- /dev/null
+++ b/src/ir/ops/instancenorm.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/instancenorm.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+instancenorm::instancenorm(datatype_t input_type, shape_t input_shape, float epsilon)
+    : epsilon_(epsilon)
+{
+    add_input("input", input_type, input_shape);
+    add_input("scale", input_type, get_instancenorm_const_shape(input_shape)); // scale and bias shapes are derived from input_shape by the helper
+    add_input("bias", input_type, get_instancenorm_const_shape(input_shape));
+    add_output("output", input_type, input_shape); // output mirrors the input's type and shape
+}
+
+bool instancenorm::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<instancenorm &>(other); // unchecked downcast
+    return rhs.epsilon() == epsilon(); // exact float comparison of the only property
+}
diff --git a/src/ir/ops/layernorm.cpp b/src/ir/ops/layernorm.cpp
new file mode 100644
index 0000000000..373e117eb9
--- /dev/null
+++ b/src/ir/ops/layernorm.cpp
@@ -0,0 +1,36 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nncase/ir/ir_types.h"
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/layernorm.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+layernorm::layernorm(datatype_t input_type, shape_t input_shape, int32_t axis, float epsilon)
+    : axis_(normalize_axis(input_shape, axis)), epsilon_(epsilon) // axis is stored after normalize_axis() against input_shape
+{
+    add_input("input", input_type, input_shape);
+    add_input("scale", input_type, shape_t { input_shape.begin() + axis_, input_shape.end() }); // scale shape = input dims from axis_ to the end
+    add_input("bias", input_type, shape_t { input_shape.begin() + axis_, input_shape.end() }); // bias has the same shape as scale
+    add_output("output", input_type, input_shape); // output mirrors the input's type and shape
+}
+
+bool layernorm::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<layernorm &>(other); // unchecked downcast
+    return rhs.axis() == axis() && rhs.epsilon() == epsilon(); // both stored properties must match exactly
+}
diff --git a/src/ir/ops/tflite_detection_postprocess.cpp b/src/ir/ops/tflite_detection_postprocess.cpp
new file mode 100644
index 0000000000..78b91ff94f
--- /dev/null
+++ b/src/ir/ops/tflite_detection_postprocess.cpp
@@ -0,0 +1,57 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/op_utils.h>
+#include <nncase/ir/ops/tflite_detection_postprocess.h>
+#include <xtensor/xarray.hpp>
+
+using namespace nncase;
+using namespace nncase::ir;
+
+tflite_detection_postprocess::tflite_detection_postprocess(
+    shape_t boxes_shape, shape_t scores_shape, shape_t anchors_shape,
+    shape_t output_shape_0, shape_t output_shape_1, shape_t output_shape_2, shape_t output_shape_3,
+    int32_t max_detections,
+    int32_t max_classes_per_detection,
+    int32_t detections_per_class,
+    bool use_regular_non_max_suppression,
+    float nms_score_threshold,
+    float nms_iou_threshold,
+    int32_t num_classes,
+    float y_scale,
+    float x_scale,
+    float h_scale,
+    float w_scale)
+    : max_detections_(max_detections), max_classes_per_detection_(max_classes_per_detection), detections_per_class_(detections_per_class), use_regular_non_max_suppression_(use_regular_non_max_suppression), nms_score_threshold_(nms_score_threshold), nms_iou_threshold_(nms_iou_threshold), num_classes_(num_classes), y_scale_(y_scale), x_scale_(x_scale), h_scale_(h_scale), w_scale_(w_scale)
+{
+    add_input("boxes", dt_float32, boxes_shape); // all three inputs are registered as dt_float32
+    add_input("scores", dt_float32, scores_shape);
+    add_input("anchors", dt_float32, anchors_shape);
+    add_output("output_locations", dt_float32, output_shape_0); // output_shape_0..3 map to the four outputs in this order
+    add_output("output_classes", dt_float32, output_shape_1);
+    add_output("output_scores", dt_float32, output_shape_2);
+    add_output("output_num_detections", dt_float32, output_shape_3); // also dt_float32, like the other outputs
+}
+
+bool tflite_detection_postprocess::properties_equal(node &other) const
+{
+    const auto &rhs = static_cast<tflite_detection_postprocess &>(other); // unchecked downcast
+    return rhs.max_detections() == max_detections()
+        && rhs.max_classes_per_detection() == max_classes_per_detection()
+        && rhs.detections_per_class() == detections_per_class()
+        && rhs.use_regular_non_max_suppression() == use_regular_non_max_suppression()
+        && rhs.nms_score_threshold() == nms_score_threshold()
+        && rhs.nms_iou_threshold() == nms_iou_threshold() && rhs.num_classes() == num_classes()
+        && rhs.y_scale() == y_scale() && rhs.x_scale() == x_scale() && rhs.h_scale() == h_scale() && rhs.w_scale() == w_scale();
+}
diff --git a/src/kernels/cpu/optimized/CMakeLists.txt b/src/kernels/cpu/optimized/CMakeLists.txt
index 80b4ff608b..e1c8569f8f 100644
--- a/src/kernels/cpu/optimized/CMakeLists.txt
+++ b/src/kernels/cpu/optimized/CMakeLists.txt
@@ -1,24 +1,28 @@
-cmake_minimum_required (VERSION 3.13)
+cmake_minimum_required(VERSION 3.13)
 
-if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") # bare name: if() dereferences it itself, so an empty/unset value cannot break the command
     set(ARCH riscv64)
 else()
     set(ARCH x86_64)
 endif()
 
 set(SRCS convolution.cpp
-         concat.cpp
-         slice.cpp
-         copy.cpp
-         dequantize.cpp
-         resize_image.cpp
-         gather.cpp
-         gather_nd.cpp
-         quantize.cpp
-         onehot.cpp
-         ${ARCH}/binary.cpp
-         ${ARCH}/unary.cpp
-         ${ARCH}/matmul.cpp
-         ${ARCH}/sigmoid.cpp
-         ${ARCH}/softmax.cpp)
-target_sources(kernels PRIVATE ${SRCS})
\ No newline at end of file
+    concat.cpp
+    slice.cpp
+    copy.cpp
+    dequantize.cpp
+    resize_image.cpp
+    gather.cpp
+    gather_nd.cpp
+    quantize.cpp
+    onehot.cpp
+    ${ARCH}/binary.cpp
+    ${ARCH}/unary.cpp
+    ${ARCH}/matmul.cpp
+    ${ARCH}/sigmoid.cpp
+    ${ARCH}/softmax.cpp
+    ${ARCH}/layernorm.cpp
+    ${ARCH}/ternary.cpp
+    ${ARCH}/reduce.cpp
+    ${ARCH}/instancenorm.cpp)
+target_sources(kernels PRIVATE ${SRCS})
diff --git a/src/kernels/cpu/optimized/riscv64/instancenorm.cpp b/src/kernels/cpu/optimized/riscv64/instancenorm.cpp
new file mode 100644
index 0000000000..d19dc3a894
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/instancenorm.cpp
@@ -0,0 +1,172 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+// #if __riscv_vector
+// #define RVV_LMUL 8
+// #define _STR(x) #x
+// #define STR(x) _STR(x)
+// #define _CONNECT(a, b) a##b
+// #define CONNECT(a, b) _CONNECT(a, b)
+// #define RVVSETVLI2(evl, avl, elen) "vsetvli " STR(evl) "," STR(avl) "," STR(elen) "," STR(CONNECT(m, RVV_LMUL)) ";"
+
+// static float get_mean(const float *data, int n)
+// {
+//     float ret;
+//     __asm volatile(
+//         "mv a0, %[avl];"
+//         "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;"
+//                                                         "XXXXXX%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v8, (a1);"
+//                                                                                              "sub a0,a0, t0;"
+//                                                                                              "slli t1, t0, 2;"
+//                                                                                              "vfredsum.vs v0,v8,v0;"
+
+//                                                                                              "add a1, a1, t1;"
+//                                                                                              "bnez a0, XXXXXX%=;"
+//                                                                                              "vfmv.f.s f0, v0;"
+//                                                                                              "fcvt.s.w f1, %[avl];"
+//                                                                                              "fdiv.s %[ret], f0, f1;"
+
+//         : [ret] "=f"(ret)
+//         : [avl] "r"(n), [input_ptr1] "r"(data)
+//         : "t0", "t1", "a0", "a1", "f0", "f1", "v0", "v8");
+//     return ret;
+// }
+
+// static float get_var(const float *data, int n, float mean)
+// {
+//     float ret;
+//     __asm volatile(
+
+//         "mv a0, %[avl];"
+//         "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;"
+
+//                                                         "vle32.v v8, (a1);"
+//                                                         "sub a0,a0, t0;"
+//                                                         "slli t1, t0, 2;"
+//                                                         "vfsub.vf v8, v8, %[mean];"
+//                                                         "vfmul.vv v8, v8, v8;"
+//                                                         "add a1, a1, t1;"
+//                                                         "beqz a0, X1_END%=;"
+//                                                         "X1_STRAT%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+//                                                                                                "sub a0,a0, t0;"
+//                                                                                                "slli t1, t0, 2;"
+//                                                                                                "vfsub.vf v16, v16, %[mean];"
+//                                                                                                "vfmacc.vv v8, v16, v16;"
+
+//                                                                                                "add a1, a1, t1;"
+//                                                                                                "bnez a0, X1_STRAT%=;"
+
+//                                                                                                "X1_END%=:"
+
+//                                                                                                "vfredsum.vs v0,v8,v0;"
+
+//                                                                                                "vfmv.f.s f0, v0;"
+//                                                                                                "fcvt.s.w f1, %[avl];"
+//                                                                                                "fdiv.s %[ret], f0, f1;"
+
+//         : [ret] "=f"(ret)
+//         : [avl] "r"(n), [input_ptr1] "r"(data), [mean] "f"(mean)
+//         : "t0", "t1", "a0", "a1", "v0", "v8", "v16", "f0", "f1");
+//     return ret;
+// }
+
+// static void layer_norm_update1(const float *data, float *out, int len, float mean, float var, float *r1, float e, float *b)
+// {
+//     float r_sqrt = 1.0f / sqrtf(var + e);
+//     __asm volatile(
+//         "mv a0, %[avl];"
+//         "mv a1, %[input_ptr1];"
+//         "mv a2, %[out];"
+//         "mv a3, %[scale];"
+//         "mv a4, %[b];"
+//         "layer_norm_update1%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+//                                                          "vle32.v v8, (a3);"
+//                                                          "sub a0,a0, t0;"
+//                                                          "slli t1, t0, 2;"
+//                                                          "vfsub.vf v16, v16, %[mean];"
+//                                                          "add a1, a1, t1;"
+//                                                          "vfmul.vf v16, v16, %[r_sqrt];"
+
+//                                                          "add a3, a3, t1;"
+//                                                          "vfmul.vv v16, v8, v16;"
+
+//                                                          "vle32.v v8, (a4);"
+//                                                          "vfadd.vv v16, v16, v8;"
+//                                                          "add a4, a4, t1;"
+
+//                                                          "vse32.v v16, (a2);"
+//                                                          "add a2, a2, t1;"
+//                                                          "bnez a0, layer_norm_update1%=;"
+
+//         :
+//         : [avl] "r"(len), [input_ptr1] "r"(data), [mean] "f"(mean), [r_sqrt] "f"(r_sqrt), [b] "r"(b), [out] "r"(out), [scale] "r"(r1)
+//         : "t0", "t1", "a0", "a1", "a2", "v0", "v16", "a3", "a4", "v8");
+// }
+
+// result<void> layernorm_impl(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon)
+// {
+//     if (axis < 0)
+//     {
+//         axis = (int)in_shape.size() + axis;
+//     }
+//     auto outer_size = 1;
+//     auto inner_size = 1;
+//     for (auto i = 0; i < axis; i++)
+//         outer_size *= in_shape[i];
+//     for (auto i = axis; i < static_cast<int>(in_shape.size()); i++)
+//         inner_size *= in_shape[i];
+
+//     for (int32_t batch = 0; batch < outer_size; batch++)
+//     {
+//         const float *src = input + batch * inner_size;
+//         float *dest = output + batch * inner_size;
+
+//         float mean = get_mean(src, inner_size);
+
+//         float var_data = get_var(src, inner_size, mean);
+
+//         layer_norm_update1(src, dest, inner_size, mean, var_data, scale, epsilon, bias);
+//     }
+//     return ok();
+// }
+// #endif
+
+template <>
+result<void> optimized::instancenorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept
+{
+    // The RVV fast path is disabled: no instancenorm_impl is defined in this file, and the
+    // disabled dispatch would have been, under #if __riscv_vector:
+    //     return instancenorm_impl(input, output, scale, bias, in_shape, epsilon);
+    // Until that exists, the float specialization always forwards to the reference kernel.
+    return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon);
+}
+
+template <typename T>
+result<void> optimized::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept
+{
+    return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon); // generic T: forwards every argument unchanged to the reference kernel
+}
diff --git a/src/kernels/cpu/optimized/riscv64/layernorm.cpp b/src/kernels/cpu/optimized/riscv64/layernorm.cpp
new file mode 100644
index 0000000000..2bde01bb03
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/layernorm.cpp
@@ -0,0 +1,173 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+#if __riscv_vector
+#define RVV_LMUL 8
+#define _STR(x) #x
+#define STR(x) _STR(x)
+#define _CONNECT(a, b) a##b
+#define CONNECT(a, b) _CONNECT(a, b)
+#define RVVSETVLI2(evl, avl, elen) "vsetvli " STR(evl) "," STR(avl) "," STR(elen) "," STR(CONNECT(m, RVV_LMUL)) ";"
+
+static float get_mean(const float *data, int n)
+{
+    float ret; // receives sum(data[0..n)) / n from the asm below
+    __asm volatile(
+        "mv a0, %[avl];" // a0 = elements still to process
+        "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;" // a1 = read cursor; v0[0] = 0 is the running sum
+        "XXXXXX%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v8, (a1);" // t0 = chunk length; load the chunk into v8
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;" // t1 = chunk size in bytes (4 per float)
+        "vfredsum.vs v0,v8,v0;" // v0[0] += sum of the chunk
+        // Advance the cursor and loop while elements remain.
+        "add a1, a1, t1;"
+        "bnez a0, XXXXXX%=;"
+        "vfmv.f.s f0, v0;"
+        "fcvt.s.w f1, %[avl];" // f1 = (float)n
+        "fdiv.s %[ret], f0, f1;"
+        // "memory": the asm reads *data via a1, which is not a memory operand. LMUL=8 makes v8 the group v8..v15.
+        : [ret] "=f"(ret)
+        : [avl] "r"(n), [input_ptr1] "r"(data)
+        : "t0", "t1", "a0", "a1", "f0", "f1", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory");
+    return ret;
+}
+
+static float get_var(const float *data, int n, float mean)
+{
+    float ret; // receives sum((data[i] - mean)^2) / n from the asm below
+    __asm volatile(
+        // v8 accumulates squared deviations lane by lane; the lanes are reduced once at X1_END.
+        "mv a0, %[avl];"
+        "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;"
+        "mv t2, t0;" // remember the first chunk length: it is the number of lanes that hold partial sums
+        "vle32.v v8, (a1);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfsub.vf v8, v8, %[mean];"
+        "vfmul.vv v8, v8, v8;" // v8 = (x - mean)^2 for the first chunk
+        "add a1, a1, t1;"
+        "beqz a0, X1_END%=;"
+        "X1_STRAT%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfsub.vf v16, v16, %[mean];"
+        "vfmacc.vv v8, v16, v16;" // v8 += (x - mean)^2 for this chunk
+        // Advance the cursor and loop while elements remain.
+        "add a1, a1, t1;"
+        "bnez a0, X1_STRAT%=;"
+        // Fix: a short last chunk leaves vl below t2, so reducing with it would drop lanes; restore vl first (relies on tail-undisturbed lanes).
+        "X1_END%=:"
+        RVVSETVLI2(t0, t2, e32)
+        "vfredsum.vs v0,v8,v0;"
+        // ret = reduced sum / (float)n
+        "vfmv.f.s f0, v0;"
+        "fcvt.s.w f1, %[avl];"
+        "fdiv.s %[ret], f0, f1;"
+        // "memory": the asm reads *data via a1. LMUL=8 makes v8 and v16 the groups v8..v15 and v16..v23.
+        : [ret] "=f"(ret)
+        : [avl] "r"(n), [input_ptr1] "r"(data), [mean] "f"(mean)
+        : "t0", "t1", "t2", "a0", "a1", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "f0", "f1", "memory");
+    return ret;
+}
+
+static void layer_norm_update1(const float *data, float *out, int len, float mean, float var, float *r1, float e, float *b)
+{
+    float r_sqrt = 1.0f / sqrtf(var + e); // 1 / sqrt(var + e), applied to every element below
+    __asm volatile(
+        "mv a0, %[avl];" // a0 = elements still to process
+        "mv a1, %[input_ptr1];" // a1 = input cursor
+        "mv a2, %[out];" // a2 = output cursor
+        "mv a3, %[scale];" // a3 = scale cursor (r1)
+        "mv a4, %[b];" // a4 = bias cursor
+        "layer_norm_update1%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+        "vle32.v v8, (a3);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;" // t1 = chunk size in bytes
+        "vfsub.vf v16, v16, %[mean];"
+        "add a1, a1, t1;"
+        "vfmul.vf v16, v16, %[r_sqrt];"
+        // v16 now holds (x - mean) * r_sqrt; multiply by the scale chunk in v8.
+        "add a3, a3, t1;"
+        "vfmul.vv v16, v8, v16;"
+        // Reuse v8 for the bias chunk and add it.
+        "vle32.v v8, (a4);"
+        "vfadd.vv v16, v16, v8;"
+        "add a4, a4, t1;"
+        // Store the chunk, advance the output cursor, loop while elements remain.
+        "vse32.v v16, (a2);"
+        "add a2, a2, t1;"
+        "bnez a0, layer_norm_update1%=;"
+        // "memory": the asm reads data/r1/b and writes out through plain pointers. LMUL=8 groups: v8..v15, v16..v23.
+        :
+        : [avl] "r"(len), [input_ptr1] "r"(data), [mean] "f"(mean), [r_sqrt] "f"(r_sqrt), [b] "r"(b), [out] "r"(out), [scale] "r"(r1)
+        : "t0", "t1", "a0", "a1", "a2", "a3", "a4", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory");
+}
+
+result<void> layernorm_impl(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon)
+{
+    // A negative axis counts from the back of in_shape.
+    const int rank = static_cast<int>(in_shape.size());
+    if (axis < 0)
+        axis += rank;
+    // rows = product of the dims before axis; row_len = product of the dims from axis onward.
+    int rows = 1;
+    int row_len = 1;
+    for (int d = 0; d < axis; d++)
+        rows *= in_shape[d];
+    for (int d = axis; d < rank; d++)
+        row_len *= in_shape[d];
+
+    for (int32_t row = 0; row < rows; row++)
+    {
+        const float *src = input + row * row_len;
+        float *dest = output + row * row_len;
+        // Three passes per row: mean, variance around that mean, then the normalize/scale/bias update.
+        // scale and bias are passed unchanged for every row.
+        float mean = get_mean(src, row_len);
+        float var_data = get_var(src, row_len, mean);
+        layer_norm_update1(src, dest, row_len, mean, var_data, scale, epsilon, bias);
+    }
+    return ok();
+}
+#endif
+
+template <>
+result<void> optimized::layernorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+#if __riscv_vector
+    return layernorm_impl(input, output, scale, bias, in_shape, axis, epsilon); // vector build: use the inline-asm implementation defined above
+#else
+    return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon); // otherwise forward to the reference kernel
+#endif
+}
+
+template <typename T>
+result<void> optimized::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+// Generic element types: forward every argument unchanged to the reference kernel.
+{
+    return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon);
+}
diff --git a/src/kernels/cpu/optimized/riscv64/reduce.cpp b/src/kernels/cpu/optimized/riscv64/reduce.cpp
new file mode 100644
index 0000000000..ef9d65b080
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/reduce.cpp
@@ -0,0 +1,132 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+result<void> reduce_mean_rvv(NNCASE_UNUSED float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    NNCASE_UNUSED const runtime_shape_t &in_strides, NNCASE_UNUSED const runtime_shape_t &out_shape, NNCASE_UNUSED const runtime_shape_t &out_strides) noexcept; // forward declaration; defined at the end of this file
+
+// float reduce: reduce_mean over one contiguous axis run touching dim 0 or the last dim uses reduce_mean_rvv; all else is reference.
+template <>
+result<void> optimized::reduce<float>(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
+{
+    // reduce_mean_rvv reads axis[0] and treats the axes as a single ascending block, so no other layout may reach it.
+    bool use_rvv_mean = reduce_mean == op && axis.size() != 0 && (axis[0] == 0 || axis[axis.size() - 1] + 1 == in_shape.size());
+    for (size_t i = 1; use_rvv_mean && i < axis.size(); ++i)
+        use_rvv_mean = axis[i] == axis[i - 1] + 1;
+    if (!use_rvv_mean)
+        return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+    auto out_shape = kernels::detail::get_reduced_shape(in_shape, axis, keep_dims);
+    return reduce_mean_rvv(init_value, input, output, in_shape, axis, in_strides, out_shape, out_strides);
+}
+
+template result<void> optimized::reduce<int32_t>(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; // explicit instantiation of the generic template for int32_t
+
+template result<void> optimized::reduce<int64_t>(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; // explicit instantiation of the generic template for int64_t
+
+template <typename T> // generic fallback: every call is forwarded unchanged to the reference kernel
+result<void> optimized::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
+{
+    return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+}
+
+// Mean reduction for densely packed float tensors; init_value, in_strides, out_shape and out_strides are ignored.
+//   axis[0] == 0 -> the reduced axes are taken to be the leading dims {0 .. axis.size() - 1}.
+//   otherwise    -> the reduced axes are taken to be the trailing axis.size() dims.
+// NOTE(review): axis must be non-empty and form one of those two runs; the caller has to guarantee it.
+result<void> reduce_mean_rvv(NNCASE_UNUSED float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    NNCASE_UNUSED const runtime_shape_t &in_strides, NNCASE_UNUSED const runtime_shape_t &out_shape, NNCASE_UNUSED const runtime_shape_t &out_strides) noexcept
+{
+    const size_t rank = in_shape.size();
+    const size_t reduced_rank = axis.size();
+    const size_t kept_rank = rank > reduced_rank ? rank - reduced_rank : 0;
+
+    if (axis[0] == 0)
+    {
+        // Leading axes: the input is an [inner_n x out_n] matrix and every output is a column mean.
+        size_t out_n = 1;
+        size_t inner_n = 1;
+        for (size_t i = 0; i < reduced_rank; ++i)
+            inner_n *= in_shape[i];
+        for (size_t i = reduced_rank; i < rank; ++i)
+            out_n *= in_shape[i];
+#if (!__riscv_vector)
+        for (size_t i = 0; i < out_n; ++i)
+        {
+            float sum = 0.0f;
+            for (size_t j = 0; j < inner_n; ++j)
+                sum += input[j * out_n + i];
+            output[i] = sum / inner_n;
+        }
+#else
+        // Vector path: sum vl neighbouring columns at a time, then scale by 1 / inner_n.
+        const float lr = 1.0f / inner_n;
+        size_t i = 0;
+        size_t n2 = out_n;
+        while (n2)
+        {
+            const size_t vl = vsetvl_e32m8(n2);
+            vfloat32m8_t _p = vle32_v_f32m8(input + i, vl);
+            for (size_t j = 1; j < inner_n; ++j)
+            {
+                vfloat32m8_t _p1 = vle32_v_f32m8(input + j * out_n + i, vl);
+                _p = vfadd_vv_f32m8(_p, _p1, vl);
+            }
+            _p = vfmul_vf_f32m8(_p, lr, vl);
+            vse32_v_f32m8(output + i, _p, vl);
+            i += vl;
+            n2 -= vl;
+        }
+#endif
+    }
+    else
+    {
+        // Trailing axes: the input is an [out_n x inner_n] matrix and every output is a row mean.
+        // out_n must span the kept leading dims and inner_n the reduced trailing dims; counting them the
+        // other way round is only correct when both groups happen to contain the same number of dims.
+        size_t out_n = 1;
+        size_t inner_n = 1;
+        for (size_t i = 0; i < kept_rank; ++i)
+            out_n *= in_shape[i];
+        for (size_t i = kept_rank; i < rank; ++i)
+            inner_n *= in_shape[i];
+
+        for (size_t i = 0; i < out_n; ++i)
+        {
+            float sum = 0.0f;
+            for (size_t j = 0; j < inner_n; ++j)
+                sum += input[i * inner_n + j];
+            output[i] = sum / inner_n;
+        }
+    }
+    return ok();
+}
\ No newline at end of file
diff --git a/src/kernels/cpu/optimized/riscv64/softmax.cpp b/src/kernels/cpu/optimized/riscv64/softmax.cpp
index b6f67bb805..0f96979ca3 100644
--- a/src/kernels/cpu/optimized/riscv64/softmax.cpp
+++ b/src/kernels/cpu/optimized/riscv64/softmax.cpp
@@ -32,6 +32,205 @@ namespace
 {
 #if __riscv_vector
 
+// Fast approximate exp(x) on a float vector, built from integer bit tricks instead of a polynomial:
+// bits = x * 12102203 + 1065414017 (12102203 ~= 2^23 / ln 2; 1065414017 = (127 << 23) + 60801),
+// and the resulting int32 bits are reinterpreted as the float result.
+static __inline __attribute__((__always_inline__)) vfloat32m8_t exp_ps2(vfloat32m8_t _p, size_t vl)
+{
+    // Inputs below -88 are clamped, which keeps the scaled-and-biased value positive.
+    vfloat32m8_t clamped = vfmax_vf_f32m8(_p, -88.0f, vl);
+    vfloat32m8_t scaled = vfmul_vf_f32m8(clamped, 12102203.0f, vl);
+    vfloat32m8_t biased = vfadd_vf_f32m8(scaled, 1065414017, vl);
+    vint32m8_t bits = vfcvt_x_f_v_i32m8(biased, vl);
+    return vreinterpret_v_i32m8_f32m8(bits);
+}
+
+// Parameterised form of exp_ps2: bits = c1 * max(x, c0) + c2, with the int32 bits reinterpreted as floats.
+vfloat32m8_t exp_ps2_opt(vfloat32m8_t _p, const float c0, const float c1, const float c2, size_t vl)
+{
+    vfloat32m8_t clamped = vfmax_vf_f32m8(_p, c0, vl);
+    // vfmadd with a scalar multiplier: clamped * c1 + splat(c2).
+    vfloat32m8_t bits_as_float = vfmadd_vf_f32m8(clamped, c1, vfmv_v_f_f32m8(c2, vl), vl);
+    // Convert to int32 and hand that bit pattern back as the float result.
+    return vreinterpret_v_i32m8_f32m8(vfcvt_x_f_v_i32m8(bits_as_float, vl));
+}
+
+// Softmax of `input` along `axis`: out = exp((x - max) * beta) / sum(exp((x - max) * beta)), max and sum taken along the axis.
+// When the axis is the last dimension the approximate exp_ps2_opt is used; otherwise exp_ps with per-position max/sum buffers.
+// NOTE(review): subtracting the maximum and clamping at -88 only bound the exponent for beta > 0 - confirm callers never pass beta <= 0.
+result<void> optimized_softmax_impl_opt(const float *input, float *output, const runtime_shape_t &in_shape, int32_t axis, float beta) noexcept
+{
+    size_t ndim = in_shape.size();
+    size_t positive_axis = axis < 0 ? ndim + axis : axis;
+    size_t axis_dim = in_shape[positive_axis];
+    size_t out_side = 1;
+    for (size_t i = 0; i < positive_axis; i++)
+        out_side *= in_shape[i];
+
+    size_t in_side = 1;
+    for (size_t i = positive_axis + 1; i < ndim; i++)
+        in_side *= in_shape[i];
+
+    // exp_ps2_opt computes bits = c1 * max(t, c0) + c2 for t = x - max. Matching exp_ps2(t * beta), i.e.
+    // 12102203 * max(t * beta, -88) + 1065414017, needs the clamp divided by beta, the slope multiplied
+    // by beta and the exponent bias left unscaled (all three equal the plain constants when beta == 1).
+    float c0 = -88.0f / beta;
+    float c1 = 12102203.0f * beta;
+    float c2 = 1065414017.0f;
+
+    // axis == -1
+    if (positive_axis == (ndim - 1))
+    {
+        const float *ptr_input = input;
+        float *ptr_output = output;
+        for (size_t i = 0; i < out_side; i++)
+        {
+            auto n = axis_dim;
+            const float *ptr_input_vl = ptr_input;
+            float *ptr_output_vl = ptr_output;
+
+            // pass 1: maximum of the row
+            float max = std::numeric_limits<float>::lowest();
+            auto s = vfmv_v_f_f32m1(max, vsetvl_e32m8(n));
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v = vle32_v_f32m8(ptr_input_vl, vl);
+                s = vfredmax_vs_f32m8_f32m1(s, v, s, vl);
+                ptr_input_vl += vl;
+                n -= vl;
+            }
+            max = vfmv_f_s_f32m1_f32(s);
+
+            // pass 2: exp((x - max) * beta) stored to output, and sum(exp)
+            float sum = 0.f;
+            ptr_input_vl = ptr_input;
+            n = axis_dim;
+            s = vfmv_v_f_f32m1(sum, vsetvl_e32m8(n));
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                auto v_out = exp_ps2_opt(vfsub_vf_f32m8(v_in, max, vl), c0, c1, c2, vl);
+                s = vfredusum_vs_f32m8_f32m1(s, v_out, s, vl);
+                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                ptr_input_vl += vl;
+                ptr_output_vl += vl;
+                n -= vl;
+            }
+            sum = vfmv_f_s_f32m1_f32(s);
+
+            // pass 3: normalise in place, multiplying by the reciprocal instead of dividing per element
+            ptr_output_vl = ptr_output;
+            n = axis_dim;
+            sum = 1.0f / sum;
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
+                v_out = vfmul_vf_f32m8(v_out, sum, vl);
+                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                ptr_output_vl += vl;
+                n -= vl;
+            }
+
+            ptr_input += axis_dim;
+            ptr_output += axis_dim;
+        }
+    }
+    else
+    {
+        runtime_shape_t axes { positive_axis };
+        auto reduced_shape = kernels::detail::get_reduced_shape(in_shape, axes, true);
+        auto reduced_size = compute_size(reduced_shape);
+        std::vector<float> max(reduced_size, std::numeric_limits<float>::lowest());
+        std::vector<float> sum(reduced_size, 0.f);
+
+        for (size_t i = 0; i < out_side; i++)
+        {
+            const float *ptr_input = input + i * axis_dim * in_side;
+            const float *ptr_input_vl = ptr_input;
+            float *ptr_output = output + i * axis_dim * in_side;
+            float *ptr_output_vl = ptr_output;
+            float *ptr_max = max.data() + i * in_side;
+            float *ptr_max_vl = ptr_max;
+            float *ptr_sum = sum.data() + i * in_side;
+            float *ptr_sum_vl = ptr_sum;
+
+            // max
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_max_vl = ptr_max;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                    auto v_max = vle32_v_f32m8(ptr_max_vl, vl);
+
+                    v_max = vfmax_vv_f32m8(v_in, v_max, vl);
+                    vse32_v_f32m8(ptr_max_vl, v_max, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_max_vl += vl;
+                    n -= vl;
+                }
+            }
+
+            // exp((x - max) * beta) and sum(exp)
+            ptr_input_vl = ptr_input;
+            ptr_output_vl = ptr_output;
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_max_vl = ptr_max;
+                ptr_sum_vl = ptr_sum;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                    auto v_max = vle32_v_f32m8(ptr_max_vl, vl);
+                    auto v_sum = vle32_v_f32m8(ptr_sum_vl, vl);
+
+                    auto v_out = exp_ps(vfmul_vf_f32m8(vfsub_vv_f32m8(v_in, v_max, vl), beta, vl), vl);
+                    vse32_v_f32m8(ptr_output_vl, v_out, vl);
+
+                    v_sum = vfadd_vv_f32m8(v_sum, v_out, vl);
+                    vse32_v_f32m8(ptr_sum_vl, v_sum, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_output_vl += vl;
+                    ptr_max_vl += vl;
+                    ptr_sum_vl += vl;
+                    n -= vl;
+                }
+            }
+
+            // div
+            ptr_output_vl = ptr_output;
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_sum_vl = ptr_sum;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
+                    auto v_sum = vle32_v_f32m8(ptr_sum_vl, vl);
+
+                    v_out = vfdiv_vv_f32m8(v_out, v_sum, vl);
+                    vse32_v_f32m8(ptr_output_vl, v_out, vl);
+
+                    ptr_output_vl += vl;
+                    ptr_sum_vl += vl;
+                    n -= vl;
+                }
+            }
+        }
+    }
+    return ok();
+}
+
 result<void> optimized_softmax_impl(const float *input, float *output, const runtime_shape_t &in_shape, int32_t axis, float beta) noexcept
 {
     size_t ndim = in_shape.size();
@@ -96,11 +295,12 @@ result<void> optimized_softmax_impl(const float *input, float *output, const run
             ptr_input_vl = ptr_input;
             ptr_output_vl = ptr_output;
             n = axis_dim;
+            sum = 1.0f / sum;
             while (n)
             {
                 auto vl = vsetvl_e32m8(n);
                 auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
-                v_out = vfdiv_vf_f32m8(v_out, sum, vl);
+                v_out = vfmul_vf_f32m8(v_out, sum, vl);
                 vse32_v_f32m8(ptr_output_vl, v_out, vl);
                 ptr_output_vl += vl;
                 n -= vl;
@@ -205,6 +405,7 @@ result<void> optimized_softmax_impl(const float *input, float *output, const run
     }
     return ok();
 }
+
 #endif
 }
 
diff --git a/src/kernels/cpu/optimized/riscv64/ternary.cpp b/src/kernels/cpu/optimized/riscv64/ternary.cpp
new file mode 100644
index 0000000000..59ec58170b
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/ternary.cpp
@@ -0,0 +1,165 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+#if __riscv_vector
+
+// Vectorised select: out[i] = (mask[i % mask_len] != 0) ? b : c[i], where b is the single value input_b[0]
+// when input_b_len != input_a_len, and input_b[i % mask_len] when the two lengths are equal.
+// The mask input_a is replayed out_len / input_a_len times while input_c and out are walked once.
+// The mask test (vmsne.vx against x0) is an integer compare of the raw float bits with zero.
+void ternary_vec(const float *input_a, int input_a_len, const float *input_b, int input_b_len, const float *input_c,
+    [[maybe_unused]] int input_c_len, float *out, int out_len)
+{
+    // Both loops count down to exactly zero, so an empty mask or an output shorter than the mask would never terminate.
+    if (input_a_len <= 0 || out_len < input_a_len)
+        return;
+    __asm volatile(
+        // a4 = number of mask replays, a2 / a3 = running c / dst pointers
+        "div a4, %[dst_len], %[mask_len];"
+        "mv a2, %[c];"
+        "mv a3, %[dst];"
+        "beq %[mask_len], %[b_len], B_IS_VECTOR%=;"
+        // scalar b: input_b[0] is kept in ft0 and merged in with vfmerge
+        "flw ft0, (%[b]);"
+        "TERNARY_RVV%=:;"
+        "mv a0, %[mask_len];"
+        "mv a1, %[mask];"
+        // one strip of up to VLMAX (e32, m8) mask elements per iteration
+        "XXXXXX%=:"
+        "vsetvli t0, a0, e32, m8;"
+        "vle32.v v8, (a1);"
+        "vle32.v v16,(a2);"
+        "vmsne.vx v0, v8, x0;"
+        "vfmerge.vfm v8, v16, ft0, v0;"
+        "vse32.v v8, (a3);"
+        // t1 = bytes consumed by this strip (t0 elements * 4)
+        "slli t1, t0, 2;"
+        "sub a0, a0, t0; "
+        "add a1, a1, t1;"
+        "add a2, a2, t1;"
+        "add a3, a3, t1;"
+        "bnez a0, XXXXXX%=;"
+
+        "addi a4, a4, -1;"
+        "bnez a4, TERNARY_RVV%=;"
+        "j END%=;"
+
+        // vector b: input_b (a5) is replayed together with the mask
+        "B_IS_VECTOR%=:;"
+        "TERNARY_RVV2%=:;"
+        "mv a0, %[mask_len];"
+        "mv a1, %[mask];"
+        "mv a5, %[b];"
+        "XXXXXX2%=:"
+        "vsetvli t0, a0, e32, m8;"
+        "vle32.v v8, (a1);"
+        "vle32.v v16,(a2);"
+        "vle32.v v24, (a5);"
+        "vmsne.vx v0, v8, x0;"
+        "vmerge.vvm v8, v16, v24, v0;"
+        "vse32.v v8, (a3);"
+
+        "slli t1, t0, 2;"
+        "sub a0, a0, t0; "
+        "add a1, a1, t1;"
+        "add a2, a2, t1;"
+        "add a3, a3, t1;"
+        "add a5, a5, t1;"
+        "bnez a0, XXXXXX2%=;"
+
+        "addi a4, a4, -1;"
+        "bnez a4, TERNARY_RVV2%=;"
+        "END%=:;"
+        :
+        : [mask] "r"(input_a), [mask_len] "r"(input_a_len), [b] "r"(input_b), [b_len] "r"(input_b_len), [c] "r"(input_c), [dst] "r"(out), [dst_len] "r"(out_len)
+        : "t0", "t1", "a0", "a1", "a2", "a3", "a4", "a5", "ft0", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory"); // m8 ops touch whole 8-register groups; "memory" because the asm stores through `out`
+}
+
+// Shape front-end for ternary_vec. The vector kernel replays `a` (and `b`, unless b has one element) linearly
+// over a full-size `c`, so it is used only for shapes matching that pattern; anything else goes to the reference kernel.
+result<void> tenary_impl(const float *input_a, const float *input_b, const float *input_c, float *output,
+    const runtime_shape_t &in_a_shape, [[maybe_unused]] const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    [[maybe_unused]] const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, [[maybe_unused]] const runtime_shape_t &in_c_strides,
+    [[maybe_unused]] const runtime_shape_t &out_strides)
+{
+    const auto out_shape = kernels::detail::get_binary_output_shape(kernels::detail::get_binary_output_shape(in_a_shape, in_b_shape), in_c_shape);
+    auto count = [](const runtime_shape_t &shape) {
+        int n = 1;
+        for (int i = 0; i < (int)shape.size(); ++i)
+            n *= shape[i];
+        return n;
+    };
+    // True when `shape`, ignoring its leading size-1 dims, equals the trailing dims of out_shape (i.e. it tiles linearly).
+    auto tiles_output = [&out_shape](const runtime_shape_t &shape) {
+        size_t lead = 0;
+        while (lead < shape.size() && shape[lead] == 1)
+            ++lead;
+        bool matches = shape.size() - lead <= out_shape.size();
+        for (size_t i = 0; matches && i < shape.size() - lead; ++i)
+            matches = shape[shape.size() - 1 - i] == out_shape[out_shape.size() - 1 - i];
+        return matches;
+    };
+    const int len_a = count(in_a_shape), len_b = count(in_b_shape), len_c = count(in_c_shape), len_out = count(out_shape);
+    if (len_a <= 0 || len_out <= 0 || len_c != len_out || !tiles_output(in_a_shape) || (len_b != 1 && (len_b != len_a || !tiles_output(in_b_shape))))
+        return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+    ternary_vec(input_a, len_a, input_b, len_b, input_c, len_c, output, len_out);
+    return ok();
+}
+
+#endif
+
+template <> // float specialisation of optimized::ternary
+result<void> optimized::ternary<float>(const float *input_a, const float *input_b, const float *input_c, float *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept
+{
+#if __riscv_vector
+    return tenary_impl(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides); // RVV build: kernel defined above in this file
+#else
+    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides); // no vector extension: reference kernel
+#endif
+}
+
+template result<void> optimized::ternary<int64_t>(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept; // explicit instantiation of the generic template for int64_t
+
+template result<void> optimized::ternary<int32_t>(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept; // explicit instantiation of the generic template for int32_t
+
+template <typename T> // generic fallback: every call is forwarded unchanged to the reference kernel
+result<void> optimized::ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept
+{
+    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+}
diff --git a/src/kernels/cpu/optimized/riscv64/unary.cpp b/src/kernels/cpu/optimized/riscv64/unary.cpp
index 80e9583ee3..16f607a655 100644
--- a/src/kernels/cpu/optimized/riscv64/unary.cpp
+++ b/src/kernels/cpu/optimized/riscv64/unary.cpp
@@ -43,7 +43,7 @@ struct unary_op_abs_rvv
 
 struct unary_op_ceil_rvv
 {
-    vfloat32m8_t operator()(const vfloat32m8_t &x, const word_type &vl) const
+    vfloat32m8_t operator()(const vfloat32m8_t &x, const size_t &vl) const
     {
         vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
         vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
@@ -61,7 +61,7 @@ struct unary_op_cos_rvv
 
 struct unary_op_exp_rvv
 {
-    vfloat32m8_t operator()(const vfloat32m8_t &x, const word_type &vl) const
+    vfloat32m8_t operator()(const vfloat32m8_t &x, const size_t &vl) const
     {
         return exp_ps(x, vl);
     }
diff --git a/src/kernels/cpu/optimized/riscv64/utils.h b/src/kernels/cpu/optimized/riscv64/utils.h
index 967ee57db8..2e147793d6 100644
--- a/src/kernels/cpu/optimized/riscv64/utils.h
+++ b/src/kernels/cpu/optimized/riscv64/utils.h
@@ -31,7 +31,7 @@
 #define c_cephes_log_q2 0.693359375
 
 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN)                                                                              \
-    static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl)                                    \
+    static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl)                                       \
     {                                                                                                                \
         x = vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */                            \
         vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl);                                    \
@@ -117,7 +117,7 @@ _RVV_FLOAT32_LOG_OP(8, 4)
 #define c_cephes_exp_p5 5.0000001201E-1
 
 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN)                                                   \
-    static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl)         \
+    static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl)            \
     {                                                                                     \
         vfloat32m##LMUL##_t tmp, fx;                                                      \
                                                                                           \
@@ -183,7 +183,7 @@ _RVV_FLOAT32_EXP_OP(8, 4)
 #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
 
 #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN)                                                                                          \
-    static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t *ysin, vfloat32m##LMUL##_t *ycos, word_type vl)         \
+    static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t *ysin, vfloat32m##LMUL##_t *ycos, size_t vl)            \
     {                                                                                                                               \
         /* any x */                                                                                                                 \
         vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y;                                                                                    \
@@ -256,12 +256,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16)
 _RVV_FLOAT32_SINCOS_OP(4, 8)
 _RVV_FLOAT32_SINCOS_OP(8, 4)
 
-#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN)                                           \
-    static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \
-    {                                                                             \
-        vfloat32m##LMUL##_t ysin, ycos;                                           \
-        sincos_ps(x, &ysin, &ycos, vl);                                           \
-        return ysin;                                                              \
+#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN)                                        \
+    static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \
+    {                                                                          \
+        vfloat32m##LMUL##_t ysin, ycos;                                        \
+        sincos_ps(x, &ysin, &ycos, vl);                                        \
+        return ysin;                                                           \
     }
 
 _RVV_FLOAT32_SIN_OP(1, 32)
@@ -269,12 +269,12 @@ _RVV_FLOAT32_SIN_OP(2, 16)
 _RVV_FLOAT32_SIN_OP(4, 8)
 _RVV_FLOAT32_SIN_OP(8, 4)
 
-#define _RVV_FLOAT32_COS_OP(LMUL, MLEN)                                           \
-    static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \
-    {                                                                             \
-        vfloat32m##LMUL##_t ysin, ycos;                                           \
-        sincos_ps(x, &ysin, &ycos, vl);                                           \
-        return ycos;                                                              \
+#define _RVV_FLOAT32_COS_OP(LMUL, MLEN)                                        \
+    static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \
+    {                                                                          \
+        vfloat32m##LMUL##_t ysin, ycos;                                        \
+        sincos_ps(x, &ysin, &ycos, vl);                                        \
+        return ycos;                                                           \
     }
 
 _RVV_FLOAT32_COS_OP(1, 32)
@@ -292,7 +292,7 @@ _RVV_FLOAT32_COS_OP(8, 4)
 #define c_cephes_tanh_p4 -3.33332819422E-1
 
 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN)                                                                                              \
-    static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl)                                                    \
+    static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl)                                                       \
     {                                                                                                                                 \
         vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl);                                                                    \
                                                                                                                                       \
@@ -340,11 +340,11 @@ _RVV_FLOAT32_TANH_OP(2, 16)
 _RVV_FLOAT32_TANH_OP(4, 8)
 _RVV_FLOAT32_TANH_OP(8, 4)
 
-#define _RVV_FLOAT32_POW_OP(LMUL, MLEN)                                                                  \
-    static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \
-    {                                                                                                    \
-        /* pow(x, m) = exp(m * log(x)) */                                                                \
-        return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl);                                    \
+#define _RVV_FLOAT32_POW_OP(LMUL, MLEN)                                                               \
+    static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \
+    {                                                                                                 \
+        /* pow(x, m) = exp(m * log(x)) */                                                             \
+        return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl);                                 \
     }
 
 _RVV_FLOAT32_POW_OP(1, 32)
diff --git a/src/kernels/cpu/optimized/x86_64/avx_mathfun.h b/src/kernels/cpu/optimized/x86_64/avx_mathfun.h
new file mode 100644
index 0000000000..b5743ea4f2
--- /dev/null
+++ b/src/kernels/cpu/optimized/x86_64/avx_mathfun.h
@@ -0,0 +1,827 @@
+/*
+   AVX implementation of sin, cos, sincos, exp and log
+
+   Based on "sse_mathfun.h", by Julien Pommier
+   http://gruntthepeon.free.fr/ssemath/
+
+   Copyright (C) 2012 Giovanni Garberoglio
+   Interdisciplinary Laboratory for Computational Science (LISC)
+   Fondazione Bruno Kessler and University of Trento
+   via Sommarive, 18
+   I-38123 Trento (Italy)
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#ifndef AVX_MATHFUN_H
+#define AVX_MATHFUN_H
+
+#include <emmintrin.h>
+#include <math.h>
+#if __AVX__
+#include <immintrin.h>
+#if __XOP__
+#ifdef _MSC_VER
+#include <ammintrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+#endif
+
+#ifdef _MSC_VER /* Visual C++: alignment is requested by the leading macro */
+#define ALIGN32_BEG __declspec(align(32))
+#define ALIGN32_END
+#else /* gcc or icc: alignment is requested by the trailing macro */
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
+#endif // ALIGN32_BEG / ALIGN32_END bracket a declaration to request 32-byte alignment
+
+#ifdef _MSC_VER // CAN_FORCEINLINE: the strongest always-inline spelling the compiler offers
+#define CAN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+#define CAN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang predefines __clang__ (lowercase); reached only when __GNUC__ is not defined
+#if __has_attribute(__always_inline__)
+#define CAN_FORCEINLINE inline __attribute__((__always_inline__))
+#else
+#define CAN_FORCEINLINE inline
+#endif
+#else // unknown compiler: fall back to a plain inline hint
+#define CAN_FORCEINLINE inline
+#endif
+
+#ifndef __FMA__ // no FMA instructions: emulate with a separate multiply followed by add/sub
+static CAN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c)
+{
+    return _mm256_add_ps(_mm256_mul_ps(_a, _b), _c); // a * b + c
+}
+static CAN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c)
+{
+    return _mm256_sub_ps(_c, _mm256_mul_ps(_a, _b)); // c - a * b
+}
+#else // __FMA__ defined: same two operations mapped onto the fused intrinsics
+static CAN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c)
+{
+    return _mm256_fmadd_ps(_a, _b, _c); // a * b + c
+}
+static CAN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c)
+{
+    // return -a * b + c
+    return _mm256_fnmadd_ps(_a, _b, _c);
+}
+#endif // __FMA__
+
+static CAN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256 &a, const __m256 &b, float c)
+{ // returns a + b * c per lane, with the scalar c broadcast to all 8 lanes
+    return _mm256_comp_fmadd_ps(b, _mm256_set1_ps(c), a);
+}
+
+static CAN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(const __m256 &a, const __m256 &b, float c)
+{
+    // returns a - b * c per lane, with the scalar c broadcast to all 8 lanes
+    return _mm256_comp_fnmadd_ps(b, _mm256_set1_ps(c), a);
+}
+
+static CAN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x)
+{ // horizontal sum: adds the 8 lanes of x and returns the scalar total
+    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
+    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
+    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
+    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
+    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ); shuffle 0x55 broadcasts lane 1 so it can be added to lane 0 */
+    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
+    /* Conversion to float is a no-op on x86-64 */
+    return _mm_cvtss_f32(x32);
+}
+
+static CAN_FORCEINLINE float _mm256_reduce_max_ps(__m256 x)
+{ // horizontal maximum: same 8 -> 4 -> 2 -> 1 folding as _mm256_reduce_add_ps, using max instead of add
+    const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); // upper half vs lower half
+    const __m128 x64 = _mm_max_ps(x128, _mm_movehl_ps(x128, x128)); // lanes 2,3 vs lanes 0,1
+    const __m128 x32 = _mm_max_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); // lane 1 vs lane 0
+    return _mm_cvtss_f32(x32); // lane 0 now holds the maximum of all 8 inputs
+}
+
+#define _PI32AVX_CONST(Name, Val) \
+    static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+
+_PI32AVX_CONST(1, 1); // 4-lane int tables; the non-AVX2 code paths load them through *(__m128i *)
+_PI32AVX_CONST(inv1, ~1); // every bit set except bit 0
+_PI32AVX_CONST(2, 2);
+_PI32AVX_CONST(4, 4);
+
+/* 8-lane constant tables; the functions below reinterpret them as __m256 / __m256i through pointer casts */
+#define _PS256_CONST(Name, Val) \
+    static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PI32_CONST256(Name, Val) \
+    static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PS256_CONST_TYPE(Name, Type, Val) \
+    static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
+
+_PS256_CONST(1, 1.0f);
+_PS256_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); // bits 23..30 set (the float32 exponent field, despite the name)
+_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); // complement: clears bits 23..30, keeps sign and fraction bits
+
+_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); // bit 31 only
+_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); // everything except bit 31; AND-ing with it takes the absolute value
+
+_PI32_CONST256(0, 0);
+_PI32_CONST256(1, 1);
+_PI32_CONST256(inv1, ~1); // every bit set except bit 0
+_PI32_CONST256(2, 2);
+_PI32_CONST256(4, 4);
+_PI32_CONST256(0x7f, 0x7f); // 127, added to / subtracted from exponent fields in exp256_ps / log256_ps
+
+_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f); // sqrt(0.5)
+_PS256_CONST(cephes_log_p0, 7.0376836292E-2f); // p0..p8: polynomial coefficients consumed by log256_ps
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1f);
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1f);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1f);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1f);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1f);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1f);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1f);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1f);
+_PS256_CONST(cephes_log_q1, -2.12194440e-4f); // q1 + q2 is approximately ln(2), split into a small and a large part
+_PS256_CONST(cephes_log_q2, 0.693359375f);
+#ifndef __AVX2__ // no AVX2: run each 256-bit integer op as two 128-bit SSE2 ops
+typedef union imm_xmm_union // views one __m256i as its two __m128i halves
+{
+    __m256i imm;
+    __m128i xmm[2];
+} imm_xmm_union;
+
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)      \
+    {                                            \
+        ALIGN32_BEG imm_xmm_union u ALIGN32_END; \
+        u.imm = imm_;                            \
+        xmm0_ = u.xmm[0];                        \
+        xmm1_ = u.xmm[1];                        \
+    }
+
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)      \
+    {                                            \
+        ALIGN32_BEG imm_xmm_union u ALIGN32_END; \
+        u.xmm[0] = xmm0_;                        \
+        u.xmm[1] = xmm1_;                        \
+        imm_ = u.imm;                            \
+    }
+
+#define AVX2_BITOP_USING_SSE2(fn)                                     \
+    static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, int a) \
+    {                                                                 \
+        /* use SSE2 instruction to perform the bitop AVX2 */          \
+        __m128i x1, x2;                                               \
+        __m256i ret;                                                  \
+        COPY_IMM_TO_XMM(x, x1, x2);                                   \
+        x1 = _mm_##fn(x1, a);                                         \
+        x2 = _mm_##fn(x2, a);                                         \
+        COPY_XMM_TO_IMM(x1, x2, ret);                                 \
+        return (ret);                                                 \
+    }
+#define AVX2_INTOP_USING_SSE2(fn)                                         \
+    static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, __m256i y) \
+    {                                                                     \
+        /* use SSE2 instructions to perform the AVX2 integer operation */ \
+        __m128i x1, x2;                                                   \
+        __m128i y1, y2;                                                   \
+        __m256i ret;                                                      \
+        COPY_IMM_TO_XMM(x, x1, x2);                                       \
+        COPY_IMM_TO_XMM(y, y1, y2);                                       \
+        x1 = _mm_##fn(x1, y1);                                            \
+        x2 = _mm_##fn(x2, y2);                                            \
+        COPY_XMM_TO_IMM(x1, x2, ret);                                     \
+        return (ret);                                                     \
+    }
+#else // __AVX2__ defined: the same wrapper names forward straight to the native _mm256_* intrinsic
+#define AVX2_BITOP_USING_SSE2(fn)                                     \
+    static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, int a) \
+    {                                                                 \
+        return _mm256_##fn(x, a);                                     \
+    }
+#define AVX2_INTOP_USING_SSE2(fn)                                         \
+    static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, __m256i y) \
+    {                                                                     \
+        return _mm256_##fn(x, y);                                         \
+    }
+#endif // __AVX2__
+
+AVX2_BITOP_USING_SSE2(slli_epi32) // defines _mm256_comp_slli_epi32
+AVX2_BITOP_USING_SSE2(srli_epi32) // defines _mm256_comp_srli_epi32
+AVX2_INTOP_USING_SSE2(cmpeq_epi32) // defines _mm256_comp_cmpeq_epi32
+AVX2_INTOP_USING_SSE2(sub_epi32) // defines _mm256_comp_sub_epi32
+AVX2_INTOP_USING_SSE2(add_epi32) // defines _mm256_comp_add_epi32
+
+// Replace 256 bit operations with 128 bit ones when AVX2 is disabled
+#ifndef __AVX2__
+AVX2_INTOP_USING_SSE2(and_si128) // defines _mm256_comp_and_si128
+AVX2_INTOP_USING_SSE2(andnot_si128) // defines _mm256_comp_andnot_si128
+#endif
+
+/* Natural logarithm of 8 packed floats, using the cephes_log_* coefficients.
+   Lanes with x <= 0 come back as NaN (all bits set through invalid_mask).
+*/
+static CAN_FORCEINLINE __m256 log256_ps(__m256 x)
+{
+    __m256i imm0;
+    __m256 one = *(__m256 *)_ps256_1;
+
+    //__m256 invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+    __m256 invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); // all-ones in lanes where x <= 0
+
+    x = _mm256_max_ps(x, *(__m256 *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+
+    // can be done with AVX2
+    imm0 = _mm256_comp_srli_epi32(_mm256_castps_si256(x), 23); // shift the exponent bits down to bit 0
+
+    /* keep only the fractional part */
+    x = _mm256_and_ps(x, *(__m256 *)_ps256_inv_mant_mask);
+    x = _mm256_or_ps(x, *(__m256 *)_ps256_0p5); // give the kept bits the exponent of 0.5
+
+    // this is again another AVX2 instruction
+    imm0 = _mm256_comp_sub_epi32(imm0, *(__m256i *)_pi32_256_0x7f); // subtract 0x7f from the extracted exponent
+    __m256 e = _mm256_cvtepi32_ps(imm0);
+
+    e = _mm256_add_ps(e, one);
+
+    /* part2:
+       if( x < SQRTHF ) {
+         e -= 1;
+         x = x + x - 1.0;
+       } else { x = x - 1.0; }
+    */
+    //__m256 mask = _mm256_cmplt_ps(x, *(__m256*)_ps256_cephes_SQRTHF);
+    __m256 mask = _mm256_cmp_ps(x, *(__m256 *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+    __m256 tmp = _mm256_and_ps(x, mask); // x in lanes where x < SQRTHF, 0 elsewhere
+    x = _mm256_sub_ps(x, one);
+    e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); // e -= 1 only in the masked lanes
+    x = _mm256_add_ps(x, tmp); // masked lanes end up with x + x - 1
+
+    __m256 z = _mm256_mul_ps(x, x);
+
+    __m256 y = *(__m256 *)_ps256_cephes_log_p0; // Horner evaluation of the p0..p8 polynomial in x
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p1);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p2);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p3);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p4);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p5);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p6);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p7);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p8);
+    y = _mm256_mul_ps(y, x);
+
+    y = _mm256_mul_ps(y, z); // y = poly(x) * x^3
+
+    y = _mm256_comp_fmadd_ps(e, *(__m256 *)_ps256_cephes_log_q1, y); // y += e * q1
+
+    //y = -z * 0.5 + y
+    y = _mm256_comp_fnmadd_ps(z, *(__m256 *)_ps256_0p5, y);
+
+    x = _mm256_add_ps(x, y);
+    x = _mm256_comp_fmadd_ps(e, *(__m256 *)_ps256_cephes_log_q2, x); // x += e * q2
+    y = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+    return y;
+}
+
+_PS256_CONST(exp_hi, 88.3762626647949f); // exp256_ps clamps its argument to [exp_lo, exp_hi]
+_PS256_CONST(exp_lo, -88.3762626647949f);
+
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f); // 1 / ln(2)
+_PS256_CONST(cephes_exp_C1, 0.693359375f); // C1 + C2 is approximately ln(2), split into a large and a small part
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4f);
+
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f); // p0..p5: polynomial coefficients consumed by exp256_ps
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f);
+
+static CAN_FORCEINLINE __m256 exp256_ps(__m256 x)
+{ // exponential of 8 packed floats: exp(x) = 2^n * exp(g), with n = fx and g the reduced argument
+    __m256 tmp = _mm256_setzero_ps(), fx;
+    __m256i imm0;
+    __m256 one = *(__m256 *)_ps256_1;
+
+    x = _mm256_min_ps(x, *(__m256 *)_ps256_exp_hi);
+    x = _mm256_max_ps(x, *(__m256 *)_ps256_exp_lo);
+
+    /* express exp(x) as exp(g + n*log(2)) */
+    fx = _mm256_comp_fmadd_ps(x, *(__m256 *)_ps256_cephes_LOG2EF, *(__m256 *)_ps256_0p5); // fx = x * LOG2EF + 0.5
+
+    /* older truncation-based floor, kept for reference; _mm256_floor_ps below is what is used */
+    //imm0 = _mm256_cvttps_epi32(fx);
+    //tmp  = _mm256_cvtepi32_ps(imm0);
+
+    tmp = _mm256_floor_ps(fx);
+
+    /* if greater, subtract 1 (tmp comes from a true floor, so this correction is not expected to fire) */
+    //__m256 mask = _mm256_cmpgt_ps(tmp, fx);
+    __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+    mask = _mm256_and_ps(mask, one);
+    fx = _mm256_sub_ps(tmp, mask);
+
+    // x = x - fx * exp_C1
+    x = _mm256_comp_fnmadd_ps(fx, *(__m256 *)_ps256_cephes_exp_C1, x);
+    // x = x - fx * exp_C2
+    x = _mm256_comp_fnmadd_ps(fx, *(__m256 *)_ps256_cephes_exp_C2, x);
+
+    tmp = _mm256_mul_ps(x, x); // tmp is reused here as x^2
+
+    __m256 y = *(__m256 *)_ps256_cephes_exp_p0; // Horner evaluation of the p0..p5 polynomial in x
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p1);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p2);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p3);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p4);
+    y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p5);
+    y = _mm256_comp_fmadd_ps(y, tmp, x); // y = poly(x) * x^2 + x
+    y = _mm256_add_ps(y, one);
+
+    /* build 2^n */
+    imm0 = _mm256_cvttps_epi32(fx);
+    // another two AVX2 instructions
+    imm0 = _mm256_comp_add_epi32(imm0, *(__m256i *)_pi32_256_0x7f); // n + 0x7f
+    imm0 = _mm256_comp_slli_epi32(imm0, 23); // move it into bits 23 and up, so the bit pattern reads as a float
+    __m256 pow2n = _mm256_castsi256_ps(imm0);
+    y = _mm256_mul_ps(y, pow2n);
+    return y;
+}
+
+_PS256_CONST(tanh_hi, 9.0f); // tanh256_ps clamps its argument to [tanh_lo, tanh_hi]
+_PS256_CONST(tanh_lo, -9.0f);
+
+_PS256_CONST(cephes_tanh_p0, -2.76076847742355E-16f); // p0..p6: numerator coefficients
+_PS256_CONST(cephes_tanh_p1, 2.00018790482477E-13f);
+_PS256_CONST(cephes_tanh_p2, -8.60467152213735E-11f);
+_PS256_CONST(cephes_tanh_p3, 5.12229709037114E-08f);
+_PS256_CONST(cephes_tanh_p4, 1.48572235717979E-05f);
+_PS256_CONST(cephes_tanh_p5, 6.37261928875436E-04f);
+_PS256_CONST(cephes_tanh_p6, 4.89352455891786E-03f);
+
+_PS256_CONST(cephes_tanh_p7, 1.19825839466702e-06f); // p7..p9 (plus p6 again): denominator coefficients
+_PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f);
+_PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f);
+
+// tanh approximated as a ratio p / q of polynomials in x^2; NOTE(review): plain inline, unlike the CAN_FORCEINLINE siblings
+static inline __m256 tanh256_ps(const __m256 x)
+{
+    __m256 value = x;
+    value = _mm256_max_ps(*(__m256 *)_ps256_tanh_lo, value);
+    value = _mm256_min_ps(*(__m256 *)_ps256_tanh_hi, value);
+
+    __m256 value_squared = _mm256_mul_ps(value, value);
+
+    __m256 p; // numerator: Horner form in value_squared over p0..p6, then multiplied by value
+    p = _mm256_comp_fmadd_ps(value_squared, *(__m256 *)_ps256_cephes_tanh_p0, *(__m256 *)_ps256_cephes_tanh_p1);
+    p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p2);
+    p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p3);
+    p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p4);
+    p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p5);
+    p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p6);
+    p = _mm256_mul_ps(p, value);
+
+    __m256 q; // denominator: Horner form in value_squared over p7, p8, p9, with p6 reused as the constant term
+    q = _mm256_comp_fmadd_ps(value_squared, *(__m256 *)_ps256_cephes_tanh_p7, *(__m256 *)_ps256_cephes_tanh_p8);
+    q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256 *)_ps256_cephes_tanh_p9);
+    q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256 *)_ps256_cephes_tanh_p6);
+
+    __m256 dst = _mm256_div_ps(p, q);
+    return dst;
+}
+
+_PS256_CONST(minus_cephes_DP1, -0.78515625f); // DP1 + DP2 + DP3 is approximately Pi/4, stored negated in three parts
+_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
+_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
+_PS256_CONST(sincof_p0, -1.9515295891E-4f); // sincof_*: coefficients of the polynomial masked in by poly_mask
+_PS256_CONST(sincof_p1, 8.3321608736E-3f);
+_PS256_CONST(sincof_p2, -1.6666654611E-1f);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005f); // coscof_*: coefficients of the polynomial used where poly_mask is clear
+_PS256_CONST(coscof_p1, -1.388731625493765E-003f);
+_PS256_CONST(coscof_p2, 4.166664568298827E-002f);
+_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+/* evaluation of 8 sines at once using AVX intrinsics
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+*/
+static CAN_FORCEINLINE __m256 sin256_ps(__m256 x)
+{ // any x
+    __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
+    __m256i imm0, imm2;
+
+#ifndef __AVX2__
+    __m128i imm0_1, imm0_2;
+    __m128i imm2_1, imm2_2;
+#endif
+
+    sign_bit = x;
+    /* take the absolute value */
+    x = _mm256_and_ps(x, *(__m256 *)_ps256_inv_sign_mask);
+    /* extract the sign bit (upper one) */
+    sign_bit = _mm256_and_ps(sign_bit, *(__m256 *)_ps256_sign_mask);
+
+    /* scale by 4/Pi */
+    y = _mm256_mul_ps(x, *(__m256 *)_ps256_cephes_FOPI);
+
+    /*
+      Here we start a series of integer operations, which are in the
+      realm of AVX2.
+      If we don't have AVX2, let's perform them using SSE2 instructions
+    */
+
+#ifdef __AVX2__
+    /* store the integer part of y in imm2 */
+    imm2 = _mm256_cvttps_epi32(y);
+    /* j=(j+1) & (~1) (see the cephes sources) */
+    // another two AVX2 instructions
+    imm2 = _mm256_comp_add_epi32(imm2, *(__m256i *)_pi32_256_1);
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_inv1);
+    y = _mm256_cvtepi32_ps(imm2);
+
+    /* get the swap sign flag */
+    imm0 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_4);
+    imm0 = _mm256_comp_slli_epi32(imm0, 29); // moves bit 2 (value 4) up to bit 31, the float sign position
+    /* get the polynom selection mask
+       there is one polynom for 0 <= x <= Pi/4
+       and another one for Pi/4<x<=Pi/2
+
+       Both branches will be computed.
+    */
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_2);
+    imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i *)_pi32_256_0); // all-ones in lanes where (j & 2) == 0
+#else
+    /* we use SSE2 routines to perform the integer ops */
+    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i *)_pi32avx_1); // j = (j + 1) & ~1, one 128-bit half at a time
+    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i *)_pi32avx_1);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_inv1);
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_inv1);
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+    y = _mm256_cvtepi32_ps(imm2);
+
+    imm0_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_4); // swap-sign flag: bit 2 of j
+    imm0_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_4);
+
+    imm0_1 = _mm_slli_epi32(imm0_1, 29); // moved up to bit 31
+    imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_2); // polynomial selection: (j & 2) == 0
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_2);
+
+    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+    __m256 swap_sign_bit = _mm256_castsi256_ps(imm0);
+    __m256 poly_mask = _mm256_castsi256_ps(imm2);
+    sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
+
+    /* The magic pass: "Extended precision modular arithmetic"
+       x = ((x - y * DP1) - y * DP2) - y * DP3; */
+    xmm1 = *(__m256 *)_ps256_minus_cephes_DP1;
+    xmm2 = *(__m256 *)_ps256_minus_cephes_DP2;
+    xmm3 = *(__m256 *)_ps256_minus_cephes_DP3;
+    x = _mm256_comp_fmadd_ps(y, xmm1, x); // the constants are stored negated, so each fmadd subtracts
+    x = _mm256_comp_fmadd_ps(y, xmm2, x);
+    x = _mm256_comp_fmadd_ps(y, xmm3, x);
+
+    /* Evaluate the first polynomial (coscof_* coefficients); kept where poly_mask is clear */
+    y = *(__m256 *)_ps256_coscof_p0;
+    __m256 z = _mm256_mul_ps(x, x);
+
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p1);
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p2);
+    y = _mm256_mul_ps(y, z);
+    y = _mm256_mul_ps(y, z);
+    // y = y - z * 0.5
+    y = _mm256_comp_fnmadd_ps(z, *(__m256 *)_ps256_0p5, y);
+    y = _mm256_add_ps(y, *(__m256 *)_ps256_1);
+
+    /* Evaluate the second polynomial (sincof_* coefficients); kept where poly_mask is set */
+
+    __m256 y2 = *(__m256 *)_ps256_sincof_p0;
+    y2 = _mm256_comp_fmadd_ps(y2, z, *(__m256 *)_ps256_sincof_p1);
+    y2 = _mm256_comp_fmadd_ps(y2, z, *(__m256 *)_ps256_sincof_p2);
+    y2 = _mm256_mul_ps(y2, z);
+    y2 = _mm256_comp_fmadd_ps(y2, x, x); // y2 = y2 * x + x
+
+    /* select the correct result from the two polynoms */
+    xmm3 = poly_mask;
+    y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
+    y = _mm256_andnot_ps(xmm3, y);
+    y = _mm256_add_ps(y, y2); // the two masked values are disjoint per lane, so the add acts as a select
+    /* update the sign */
+    y = _mm256_xor_ps(y, sign_bit);
+
+    return y;
+}
+
+/* cosine of 8 packed floats; almost the same as sin256_ps */
+static CAN_FORCEINLINE __m256 cos256_ps(__m256 x)
+{ // any x
+    __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
+    __m256i imm0, imm2;
+
+#ifndef __AVX2__
+    __m128i imm0_1, imm0_2;
+    __m128i imm2_1, imm2_2;
+#endif
+
+    /* take the absolute value */
+    x = _mm256_and_ps(x, *(__m256 *)_ps256_inv_sign_mask);
+
+    /* scale by 4/Pi */
+    y = _mm256_mul_ps(x, *(__m256 *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+    /* store the integer part of y in imm2 */
+    imm2 = _mm256_cvttps_epi32(y);
+    /* j=(j+1) & (~1) (see the cephes sources) */
+    imm2 = _mm256_comp_add_epi32(imm2, *(__m256i *)_pi32_256_1);
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_inv1);
+    y = _mm256_cvtepi32_ps(imm2);
+    imm2 = _mm256_comp_sub_epi32(imm2, *(__m256i *)_pi32_256_2); // j -= 2 before the flags are derived (not done in sin256_ps)
+
+    /* get the swap sign flag */
+    imm0 = _mm256_andnot_si256(imm2, *(__m256i *)_pi32_256_4); // (~j) & 4
+    imm0 = _mm256_comp_slli_epi32(imm0, 29);
+    /* get the polynom selection mask */
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_2);
+    imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i *)_pi32_256_0);
+#else
+
+    /* we use SSE2 routines to perform the integer ops */
+    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i *)_pi32avx_1); // j = (j + 1) & ~1, one 128-bit half at a time
+    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i *)_pi32avx_1);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_inv1);
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_inv1);
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+    y = _mm256_cvtepi32_ps(imm2);
+
+    imm2_1 = _mm_sub_epi32(imm2_1, *(__m128i *)_pi32avx_2); // j -= 2
+    imm2_2 = _mm_sub_epi32(imm2_2, *(__m128i *)_pi32avx_2);
+
+    imm0_1 = _mm_andnot_si128(imm2_1, *(__m128i *)_pi32avx_4); // swap-sign flag: (~j) & 4
+    imm0_2 = _mm_andnot_si128(imm2_2, *(__m128i *)_pi32avx_4);
+
+    imm0_1 = _mm_slli_epi32(imm0_1, 29); // moved up to bit 31
+    imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_2); // polynomial selection: (j & 2) == 0
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_2);
+
+    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+
+    __m256 sign_bit = _mm256_castsi256_ps(imm0);
+    __m256 poly_mask = _mm256_castsi256_ps(imm2);
+
+    /* The magic pass: "Extended precision modular arithmetic"
+       x = ((x - y * DP1) - y * DP2) - y * DP3; */
+    xmm1 = *(__m256 *)_ps256_minus_cephes_DP1;
+    xmm2 = *(__m256 *)_ps256_minus_cephes_DP2;
+    xmm3 = *(__m256 *)_ps256_minus_cephes_DP3;
+    x = _mm256_comp_fmadd_ps(y, xmm1, x); // the constants are stored negated, so each fmadd subtracts
+    x = _mm256_comp_fmadd_ps(y, xmm2, x);
+    x = _mm256_comp_fmadd_ps(y, xmm3, x);
+
+    /* Evaluate the first polynomial (coscof_* coefficients); kept where poly_mask is clear */
+    y = *(__m256 *)_ps256_coscof_p0;
+    __m256 z = _mm256_mul_ps(x, x);
+
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p1);
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p2);
+    y = _mm256_mul_ps(y, z);
+    y = _mm256_mul_ps(y, z);
+    // y = y - z * 0.5
+    y = _mm256_comp_fnmadd_ps(z, *(__m256 *)_ps256_0p5, y);
+    y = _mm256_add_ps(y, *(__m256 *)_ps256_1);
+
+    /* Evaluate the second polynomial (sincof_* coefficients); kept where poly_mask is set */
+
+    __m256 y2 = *(__m256 *)_ps256_sincof_p0; // same polynomial as in sin256_ps, written with separate mul/add steps
+    y2 = _mm256_mul_ps(y2, z);
+    y2 = _mm256_add_ps(y2, *(__m256 *)_ps256_sincof_p1);
+    y2 = _mm256_mul_ps(y2, z);
+    y2 = _mm256_add_ps(y2, *(__m256 *)_ps256_sincof_p2);
+    y2 = _mm256_mul_ps(y2, z);
+    y2 = _mm256_mul_ps(y2, x);
+    y2 = _mm256_add_ps(y2, x);
+
+    /* select the correct result from the two polynoms */
+    xmm3 = poly_mask;
+    y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
+    y = _mm256_andnot_ps(xmm3, y);
+    y = _mm256_add_ps(y, y2); // the two masked values are disjoint per lane, so the add acts as a select
+    /* update the sign */
+    y = _mm256_xor_ps(y, sign_bit);
+
+    return y;
+}
+
+/* Computes sine and cosine of the 8 lanes of x in one pass, storing them to *s and *c.
+   sin256_ps and cos256_ps share almost all of this work, so the extra output is nearly free. */
+static CAN_FORCEINLINE void sincos256_ps(__m256 x, __m256 *s, __m256 *c)
+{
+    __m256 xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
+    __m256i imm0, imm2, imm4;
+
+#ifndef __AVX2__
+    __m128i imm0_1, imm0_2;
+    __m128i imm2_1, imm2_2;
+    __m128i imm4_1, imm4_2;
+#endif
+
+    sign_bit_sin = x;
+    /* take the absolute value */
+    x = _mm256_and_ps(x, *(__m256 *)_ps256_inv_sign_mask);
+    /* extract the sign bit (upper one) */
+    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps256_sign_mask);
+
+    /* scale by 4/Pi */
+    y = _mm256_mul_ps(x, *(__m256 *)_ps256_cephes_FOPI);
+
+#ifdef __AVX2__
+    /* store the integer part of y in imm2 */
+    imm2 = _mm256_cvttps_epi32(y);
+
+    /* j=(j+1) & (~1) (see the cephes sources) */
+    imm2 = _mm256_comp_add_epi32(imm2, *(__m256i *)_pi32_256_1);
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_inv1);
+
+    y = _mm256_cvtepi32_ps(imm2);
+    imm4 = imm2; // keep j for the cosine sign computed further down
+
+    /* get the swap sign flag for the sine */
+    imm0 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_4);
+    imm0 = _mm256_comp_slli_epi32(imm0, 29);
+    //__m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+
+    /* get the polynom selection mask for the sine*/
+    imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_2);
+    imm2 = _mm256_cmpeq_epi32(imm2, *(__m256i *)_pi32_256_0);
+    //__m256 poly_mask = _mm256_castsi256_ps(imm2);
+#else
+    /* we use SSE2 routines to perform the integer ops */
+    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
+
+    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i *)_pi32avx_1); // j = (j + 1) & ~1, one 128-bit half at a time
+    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i *)_pi32avx_1);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_inv1);
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_inv1);
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+    y = _mm256_cvtepi32_ps(imm2);
+
+    imm4_1 = imm2_1; // keep j for the cosine sign computed further down
+    imm4_2 = imm2_2;
+
+    imm0_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_4); // sine swap-sign flag: bit 2 of j
+    imm0_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_4);
+
+    imm0_1 = _mm_slli_epi32(imm0_1, 29); // moved up to bit 31
+    imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+
+    imm2_1 = _mm_and_si128(imm2_1, *(__m128i *)_pi32avx_2); // polynomial selection: (j & 2) == 0
+    imm2_2 = _mm_and_si128(imm2_2, *(__m128i *)_pi32avx_2);
+
+    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+#endif
+    __m256 swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+    __m256 poly_mask = _mm256_castsi256_ps(imm2);
+
+    /* The magic pass: "Extended precision modular arithmetic"
+       x = ((x - y * DP1) - y * DP2) - y * DP3; */
+    xmm1 = *(__m256 *)_ps256_minus_cephes_DP1;
+    xmm2 = *(__m256 *)_ps256_minus_cephes_DP2;
+    xmm3 = *(__m256 *)_ps256_minus_cephes_DP3;
+    x = _mm256_comp_fmadd_ps(y, xmm1, x); // the constants are stored negated, so each fmadd subtracts
+    x = _mm256_comp_fmadd_ps(y, xmm2, x);
+    x = _mm256_comp_fmadd_ps(y, xmm3, x);
+
+#ifdef __AVX2__
+    imm4 = _mm256_comp_sub_epi32(imm4, *(__m256i *)_pi32_256_2); // cosine sign: (~(j - 2)) & 4, moved to bit 31
+    imm4 = _mm256_andnot_si256(imm4, *(__m256i *)_pi32_256_4);
+    imm4 = _mm256_comp_slli_epi32(imm4, 29);
+#else
+    imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i *)_pi32avx_2); // cosine sign: (~(j - 2)) & 4, moved to bit 31
+    imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i *)_pi32avx_2);
+
+    imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i *)_pi32avx_4);
+    imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i *)_pi32avx_4);
+
+    imm4_1 = _mm_slli_epi32(imm4_1, 29);
+    imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+    COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+#endif
+
+    __m256 sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+    /* Evaluate the first polynomial (coscof_* coefficients) */
+    __m256 z = _mm256_mul_ps(x, x);
+    y = *(__m256 *)_ps256_coscof_p0;
+
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p1);
+    y = _mm256_comp_fmadd_ps(y, z, *(__m256 *)_ps256_coscof_p2);
+    y = _mm256_mul_ps(y, z);
+    y = _mm256_mul_ps(y, z);
+    // y = y - z * 0.5
+    y = _mm256_comp_fnmadd_ps(z, *(__m256 *)_ps256_0p5, y);
+    y = _mm256_add_ps(y, *(__m256 *)_ps256_1);
+
+    /* Evaluate the second polynomial (sincof_* coefficients) */
+
+    __m256 y2 = *(__m256 *)_ps256_sincof_p0;
+    y2 = _mm256_comp_fmadd_ps(y2, z, *(__m256 *)_ps256_sincof_p1);
+    y2 = _mm256_comp_fmadd_ps(y2, z, *(__m256 *)_ps256_sincof_p2);
+    y2 = _mm256_mul_ps(y2, z);
+    y2 = _mm256_comp_fmadd_ps(y2, x, x); // y2 = y2 * x + x
+
+    /* select the correct result from the two polynoms */
+    xmm3 = poly_mask;
+    __m256 ysin2 = _mm256_and_ps(xmm3, y2); // y2 where poly_mask is set
+    __m256 ysin1 = _mm256_andnot_ps(xmm3, y); // y where poly_mask is clear
+    y2 = _mm256_sub_ps(y2, ysin2); // what remains of y2: the lanes the sine did not take
+    y = _mm256_sub_ps(y, ysin1); // what remains of y: the lanes the sine did not take
+
+    xmm1 = _mm256_add_ps(ysin1, ysin2); // sine magnitude
+    xmm2 = _mm256_add_ps(y, y2); // cosine magnitude, built from the complementary lanes
+
+    /* update the sign */
+    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
+    *c = _mm256_xor_ps(xmm2, sign_bit_cos);
+}
+
+static CAN_FORCEINLINE __m256 tan256_ps(__m256 x)
+{ // tangent of 8 packed floats, computed as sine / cosine from sincos256_ps
+    __m256 ysin, ycos;
+    __m256 eps = _mm256_set1_ps(1E-8f);
+    sincos256_ps(x, &ysin, &ycos);
+    __m256 mask = _mm256_cmp_ps(ycos, _mm256_setzero_ps(), _CMP_EQ_OS); // lanes whose cosine compares equal to zero
+    __m256 _tmp = _mm256_and_ps(eps, mask); // 1e-8 in those lanes, 0 elsewhere
+    ycos = _mm256_add_ps(ycos, _tmp); // nudge exact zeros so the division below has a nonzero divisor
+    __m256 ytan = _mm256_div_ps(ysin, ycos);
+    return ytan;
+}
+
+static CAN_FORCEINLINE __m256 pow256_ps(__m256 a, __m256 b)
+{
+    // pow(a, b) = exp(b * log(a)); NOTE(review): log256_ps yields NaN for a <= 0 -- confirm callers only pass positive bases
+    return exp256_ps(_mm256_mul_ps(b, log256_ps(a)));
+}
+
+#endif // AVX_MATHFUN_H
diff --git a/src/kernels/cpu/optimized/x86_64/instancenorm.cpp b/src/kernels/cpu/optimized/x86_64/instancenorm.cpp
new file mode 100644
index 0000000000..b80b8d7f6b
--- /dev/null
+++ b/src/kernels/cpu/optimized/x86_64/instancenorm.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+template result<void> optimized::instancenorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept;
+// The line above explicitly instantiates the float version; the template forwards every argument unchanged to cpu::reference::instancenorm.
+template <typename T>
+result<void> optimized::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept
+{
+    return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon);
+}
\ No newline at end of file
diff --git a/src/kernels/cpu/optimized/x86_64/layernorm.cpp b/src/kernels/cpu/optimized/x86_64/layernorm.cpp
new file mode 100644
index 0000000000..fbe05bead3
--- /dev/null
+++ b/src/kernels/cpu/optimized/x86_64/layernorm.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+// No x86_64-specific layer-norm kernel exists in this file: every call is forwarded unchanged to cpu::reference::layernorm.
+template <typename T>
+result<void> optimized::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+    return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon);
+}
+template result<void> optimized::layernorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept;
\ No newline at end of file
diff --git a/src/kernels/cpu/optimized/x86_64/reduce.cpp b/src/kernels/cpu/optimized/x86_64/reduce.cpp
new file mode 100644
index 0000000000..01869f4ed6
--- /dev/null
+++ b/src/kernels/cpu/optimized/x86_64/reduce.cpp
@@ -0,0 +1,43 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+// nncase::kernels::cpu::optimized
+
+// No x86_64-specific reduce kernel exists in this file: the call is delegated unchanged to
+// cpu::reference::reduce. The element types emitted from this file follow the definition.
+template <typename T>
+result<void> optimized::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
+{
+    return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+}
+
+template result<void> optimized::reduce<float>(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+template result<void> optimized::reduce<int32_t>(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+template result<void> optimized::reduce<int64_t>(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
\ No newline at end of file
diff --git a/src/kernels/cpu/optimized/x86_64/ternary.cpp b/src/kernels/cpu/optimized/x86_64/ternary.cpp
new file mode 100644
index 0000000000..1ec3d48414
--- /dev/null
+++ b/src/kernels/cpu/optimized/x86_64/ternary.cpp
@@ -0,0 +1,55 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <nncase/kernels/cpu/optimized/tensor_compute.h>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+// No x86_64-specific ternary kernel exists in this file: every call is delegated unchanged to
+// cpu::reference::ternary. input_a is always float; T is the type of input_b, input_c and output.
+template <typename T>
+result<void> optimized::ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept
+{
+    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+}
+
+// Element types emitted from this translation unit.
+template result<void> optimized::ternary<float>(const float *input_a, const float *input_b, const float *input_c, float *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+template result<void> optimized::ternary<int64_t>(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+template result<void> optimized::ternary<int32_t>(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+// NOTE(review): an instantiation for `long` used to sit here commented out; presumably it
+// clashes with one of the fixed-width types above on some targets — confirm before re-adding.
\ No newline at end of file
diff --git a/src/kernels/cpu/optimized/x86_64/unary.cpp b/src/kernels/cpu/optimized/x86_64/unary.cpp
index c5752e5487..c5f65e92be 100644
--- a/src/kernels/cpu/optimized/x86_64/unary.cpp
+++ b/src/kernels/cpu/optimized/x86_64/unary.cpp
@@ -12,6 +12,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include <nncase/kernels/cpu/optimized/tensor_compute.h>
 #include <nncase/kernels/cpu/reference/tensor_compute.h>
 #include <nncase/kernels/kernel_utils.h>
@@ -23,8 +24,762 @@ using namespace nncase::kernels;
 using namespace nncase::kernels::cpu;
 using namespace nncase::kernels::cpu::optimized;
 
+#if defined(X86_64_SIMD_ON)
+
+#include "avx_mathfun.h"
+// AVX path: rounds each a[i] to the nearest integer value and stores it in b[i], i in [0, n).
+// _mm256_round_ps with _MM_FROUND_TO_NEAREST_INT resolves ties to the even neighbour, while
+// roundf resolves them away from zero, so the scalar tail uses nearbyintf (ties to even in
+// the default rounding mode); an x.5 input no longer rounds differently depending on its index.
+static void round_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3; // number of full 8-float groups
+    const int tail = n & 7; // elements left after the full groups
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_round_ps(in, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = nearbyintf(a[i]);
+}
+
+// AVX path: b[i] = ceil(a[i]) for i in [0, n).
+// Full groups of 8 floats go through _mm256_round_ps with _MM_FROUND_TO_POS_INF (floating-point
+// exceptions suppressed by _MM_FROUND_NO_EXC); the n & 7 elements that do not fill a group
+// are handled one by one with ceilf.
+static void ceil_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_round_ps(in, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = ceilf(a[i]);
+}
+
+// AVX path: b[i] = floor(a[i]) for i in [0, n).
+// Full groups of 8 floats go through _mm256_round_ps with _MM_FROUND_TO_NEG_INF (floating-point
+// exceptions suppressed by _MM_FROUND_NO_EXC); the n & 7 elements that do not fill a group
+// are handled one by one with floorf.
+static void floor_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_round_ps(in, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = floorf(a[i]);
+}
+
+// AVX path: b[i] = sqrt(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with _mm256_sqrt_ps using unaligned loads and
+// stores; the n & 7 elements that do not fill a group are handled one by one
+// with sqrtf.
+static void sqrt_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_sqrt_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = sqrtf(a[i]);
+}
+
+// AVX path: b[i] = 1 / sqrt(a[i]) for i in [0, n).
+// The vector body divides 1.0f by _mm256_sqrt_ps instead of calling _mm256_rsqrt_ps: the latter is
+// only an approximation, which made results differ from the 1.0f / sqrtf tail for the same input.
+static void rsqrt_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3; // number of full 8-float groups
+    const int tail = n & 7; // elements left after the full groups
+    const __m256 one = _mm256_set1_ps(1.0f);
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_div_ps(one, _mm256_sqrt_ps(in)));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = 1.0f / sqrtf(a[i]);
+}
+
+// AVX path: b[i] = exp(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with exp256_ps (from avx_mathfun.h) using
+// unaligned loads and stores; the n & 7 elements that do not fill a group are handled
+// one by one with expf.
+static void exp_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, exp256_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = expf(a[i]);
+}
+
+// AVX path: b[i] = log(a[i]) (natural logarithm) for i in [0, n).
+// Full groups of 8 floats are processed with log256_ps (from avx_mathfun.h) using
+// unaligned loads and stores; the n & 7 elements that do not fill a group are handled
+// one by one with logf.
+static void log_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, log256_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = logf(a[i]);
+}
+
+// AVX path: b[i] = cos(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with cos256_ps (from avx_mathfun.h) using
+// unaligned loads and stores; the n & 7 elements that do not fill a group are handled
+// one by one with cosf.
+static void cos_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, cos256_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = cosf(a[i]);
+}
+
+// AVX path: b[i] = sin(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with sin256_ps (from avx_mathfun.h) using
+// unaligned loads and stores; the n & 7 elements that do not fill a group are handled
+// one by one with sinf.
+static void sin_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, sin256_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = sinf(a[i]);
+}
+
+// AVX path: b[i] = -a[i] for i in [0, n).
+// The vector body flips the sign bit with XOR, exactly what unary minus does in the scalar tail.
+// Computing 0 - x instead returns +0.0f for x == +0.0f, whereas -x is -0.0f.
+static void negative_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3; // number of full 8-float groups
+    const int tail = n & 7; // elements left after the full groups
+    const __m256 sign_bit = _mm256_set1_ps(-0.0f); // only the sign bit is set
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_xor_ps(in, sign_bit));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = -(a[i]);
+}
+
+// AVX path: b[i] = (a[i] == 0.0f) ? 1.0f : 0.0f for i in [0, n), i.e. the float value of !a[i].
+// The vector body uses a floating-point compare so that -0.0f counts as zero exactly like it
+// does in the scalar tail; an integer compare on the raw bits misses it (sign bit set).
+// NaN lanes compare unequal (ordered, quiet predicate) and yield 0.0f, the same as !NaN.
+static void logical_not_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3; // number of full 8-float groups
+    const int tail = n & 7; // elements left after the full groups
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 one = _mm256_set1_ps(1.0f);
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        // All-ones mask where the lane equals zero; AND with 1.0f turns it into 1.0f / 0.0f.
+        const __m256 is_zero = _mm256_cmp_ps(_mm256_loadu_ps(a), zero, _CMP_EQ_OQ);
+        _mm256_storeu_ps(b, _mm256_and_ps(is_zero, one));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = !a[i];
+}
+
+// AVX path: b[i] = |a[i]| for i in [0, n).
+// The vector body treats each float as a 32-bit integer and ANDs it with 0x7FFFFFFF, which
+// clears the sign bit and leaves the remaining bits untouched. The n & 7 elements that
+// do not fill a group of 8 are handled one by one with fabs.
+static void abs_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+
+    // The same mask value in all eight 32-bit lanes.
+    const __m256i magnitude_mask = _mm256_set1_epi32(0x7FFFFFFF);
+
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256i bits = _mm256_loadu_si256((__m256i const *)a);
+        _mm256_storeu_si256((__m256i *)b, _mm256_and_si256(bits, magnitude_mask));
+    }
+
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = fabs(a[i]);
+}
+
+// AVX path: b[i] = tanh(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with tanh256_ps (from avx_mathfun.h) using
+// unaligned loads and stores; the n & 7 elements that do not fill a group are handled
+// one by one with tanhf.
+static void tanh_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, tanh256_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = tanhf(a[i]);
+}
+
+// Absolute value of a single float.
+//
+// This helper used to be split by compiler: MSVC called fabsf, while every other compiler
+// cleared the sign bit with an inline-assembly integer `and` applied to a float operand.
+// The assembly is compiler-specific, hides the operation from the optimizer and computes
+// the same thing as fabsf, so one standard-library implementation now serves every
+// compiler.
+//
+// x: input value (any float, including infinities and NaN).
+// Returns x with its sign bit cleared.
+static CAN_FORCEINLINE float abs_f32(float x)
+{
+    return fabsf(x);
+}
+
+// Computes acos(x) for eight floats at once.
+//
+// With t = 2 - 2 * |x| and s = sqrt(t), a polynomial p(t) (nine coefficients, evaluated in
+// Horner form with one FMA per coefficient) gives r = s + s * t * p(t). Lanes with x < 0
+// return pi0 * pi1 - r, the product of the two constants being approximately pi; all other
+// lanes return r. Requires FMA support because of _mm256_fmadd_ps.
+static CAN_FORCEINLINE __m256 _mm256_can_acos_ps(__m256 x)
+{
+    // Polynomial coefficients, highest order first (decimal value in the trailing comment).
+    const float coeffs[9] = {
+        0x1.c86000p-22f, //  4.25032340e-7
+        -0x1.0258fap-19f, // -1.92483935e-6
+        0x1.90c5c4p-18f, //  5.97197595e-6
+        -0x1.55668cp-19f, // -2.54363249e-6
+        0x1.c3f78ap-16f, //  2.69393295e-5
+        0x1.e8f446p-14f, //  1.16575764e-4
+        0x1.6df072p-11f, //  6.97973708e-4
+        0x1.3332a6p-8f, //  4.68746712e-3
+        0x1.555550p-5f //  4.16666567e-2
+    };
+    const __m256 zero = _mm256_set1_ps(0.0f);
+    const __m256 two = _mm256_set1_ps(2.0f);
+    const __m256 pi0 = _mm256_set1_ps(0x1.ddcb02p+0f); //  1.86637890e+0
+    const __m256 pi1 = _mm256_set1_ps(0x1.aee9d6p+0f); //  1.68325555e+0
+
+    // Lanes with a negative input: decides the multiplier below and the final select.
+    const __m256 negative = _mm256_cmp_ps(x, zero, _CMP_LT_OQ);
+    const __m256 factor = _mm256_blendv_ps(_mm256_set1_ps(-2.0f), two, negative);
+    const __m256 t = _mm256_fmadd_ps(x, factor, two); // 2 - 2 * |x|
+    const __m256 s = _mm256_sqrt_ps(t);
+
+    __m256 r = _mm256_set1_ps(coeffs[0]);
+    for (int k = 1; k < 9; ++k)
+        r = _mm256_fmadd_ps(r, t, _mm256_set1_ps(coeffs[k]));
+    r = _mm256_mul_ps(r, t);
+    r = _mm256_fmadd_ps(r, s, s);
+
+    const __m256 reflected = _mm256_fmadd_ps(pi0, pi1, _mm256_sub_ps(zero, r));
+    return _mm256_blendv_ps(r, reflected, negative);
+}
+
+// erf branch intended for |a| > 0.921875 (the caller decides per lane whether to keep it).
+// a: input lanes; t: |a|; s: a * a; r0..r6: polynomial coefficients; n_flag: bit mask ANDed
+// with the result (the caller in this file passes 0x7fffffff, i.e. all bits but the sign).
+// Builds a polynomial q from t and s, evaluates 1 - exp(-q), then ORs in the bits of a
+// that n_flag masks out, which gives the result the sign of a.
+static CAN_FORCEINLINE __m256 erf_core_ps1(__m256 a, __m256 t, __m256 s, __m256 r0, __m256 r1, __m256 r2,
+    __m256 r3, __m256 r4, __m256 r5, __m256 r6, __m256i n_flag)
+{
+    const __m256 hi = _mm256_fmadd_ps(r0, t, r1);
+    const __m256 lo = _mm256_fmadd_ps(r2, t, r3);
+    __m256 q = _mm256_fmadd_ps(hi, s, lo);
+    q = _mm256_fmadd_ps(q, t, r4);
+    q = _mm256_fmadd_ps(q, t, r5);
+    q = _mm256_fmadd_ps(q, t, r6);
+    q = _mm256_fmadd_ps(q, t, t);
+
+    const __m256 decay = exp256_ps(_mm256_sub_ps(_mm256_setzero_ps(), q));
+    const __m256i magnitude = _mm256_and_si256(n_flag, _mm256_castps_si256(_mm256_sub_ps(_mm256_set1_ps(1.0f), decay)));
+    // andnot computes (~n_flag) & a, i.e. the bits of a that n_flag does not keep.
+    const __m256i sign = _mm256_andnot_si256(n_flag, _mm256_castps_si256(a));
+    return _mm256_castsi256_ps(_mm256_or_si256(sign, magnitude));
+}
+
+// erf branch intended for |a| <= 0.921875: a + a * p(s), with p evaluated in Horner form
+// from the coefficients r1 (highest order) to r6. a: input lanes; s: a * a.
+static CAN_FORCEINLINE __m256 erf_core_ps2(__m256 a, __m256 s, __m256 r1, __m256 r2,
+    __m256 r3, __m256 r4, __m256 r5, __m256 r6)
+{
+    const __m256 lower_order[] = { r2, r3, r4, r5, r6 };
+    __m256 p = r1;
+    for (const __m256 &c : lower_order)
+        p = _mm256_fmadd_ps(p, s, c);
+    // The last step folds in the leading "a +" term: p * a + a.
+    return _mm256_fmadd_ps(p, a, a);
+}
+
+// AVX path: b[i] = erf(a[i]) for i in [0, n).
+//
+// Every group of 8 lanes is run through both polynomial branches (erf_core_ps1 and
+// erf_core_ps2); lanes with |a| < 0.921875 keep the erf_core_ps2 result and all other lanes
+// keep the erf_core_ps1 result. The n & 7 elements that do not fill a group are handled
+// one by one with erff.
+static void erf_f32_vec(const float *a, float *b, int n)
+{
+    // Coefficients handed to erf_core_ps2, highest order first.
+    const __m256 p1 = _mm256_set1_ps(-0x1.3a1a82p-11f);
+    const __m256 p2 = _mm256_set1_ps(0x1.473f48p-08f);
+    const __m256 p3 = _mm256_set1_ps(-0x1.b68bd2p-06f);
+    const __m256 p4 = _mm256_set1_ps(0x1.ce1a46p-04f);
+    const __m256 p5 = _mm256_set1_ps(-0x1.8126e0p-02f);
+    const __m256 p6 = _mm256_set1_ps(0x1.06eba6p-03f);
+
+    // Coefficients handed to erf_core_ps1.
+    const __m256 q0 = _mm256_set1_ps(0x1.222900p-16f);
+    const __m256 q1 = _mm256_set1_ps(-0x1.91d2ccp-12f);
+    const __m256 q2 = _mm256_set1_ps(0x1.fd1336p-09f);
+    const __m256 q3 = _mm256_set1_ps(-0x1.8d6300p-06f);
+    const __m256 q4 = _mm256_set1_ps(0x1.b55cb0p-4f);
+    const __m256 q5 = _mm256_set1_ps(0x1.450aa0p-1f);
+    const __m256 q6 = _mm256_set1_ps(0x1.079d0cp-3f);
+
+    const __m256 threshold = _mm256_set1_ps(0.921875f);
+    // Clears the sign bit when ANDed with a float's bit pattern.
+    const __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
+
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 x = _mm256_loadu_ps(a);
+        const __m256 x_sq = _mm256_mul_ps(x, x);
+        const __m256 x_abs = _mm256_castsi256_ps(_mm256_and_si256(_mm256_castps_si256(x), abs_mask));
+
+        const __m256 res_large = erf_core_ps1(x, x_abs, x_sq, q0, q1, q2, q3, q4, q5, q6, abs_mask);
+        const __m256 res_small = erf_core_ps2(x, x_sq, p1, p2, p3, p4, p5, p6);
+
+        // Lanes with |x| below the threshold take res_small, the others res_large.
+        const __m256 use_small = _mm256_cmp_ps(x_abs, threshold, _CMP_LT_OQ);
+        _mm256_storeu_ps(b, _mm256_blendv_ps(res_large, res_small, use_small));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = erff(a[i]);
+}
+
+// AVX path: b[i] = sign of a[i] as a float (1.0f, -1.0f or 0.0f) for i in [0, n).
+// Comparison masks are all-ones (integer -1) in matching lanes, so subtracting the
+// "positive" mask from the "negative" mask as 32-bit integers gives +1 / -1 / 0, which is
+// then converted to float. NaN lanes fail both ordered compares and produce 0.0f.
+static void sign_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    const __m256 zero = _mm256_setzero_ps();
+
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 x = _mm256_loadu_ps(a);
+        const __m256i positive = _mm256_castps_si256(_mm256_cmp_ps(zero, x, _CMP_LT_OQ));
+        const __m256i negative = _mm256_castps_si256(_mm256_cmp_ps(x, zero, _CMP_LT_OQ));
+        // negative - positive: 0 - (-1) = 1 for x > 0, (-1) - 0 = -1 for x < 0.
+        const __m256i as_int = _mm256_sub_epi32(negative, positive);
+        _mm256_storeu_ps(b, _mm256_cvtepi32_ps(as_int));
+    }
+
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = (0.f < a[i]) - (a[i] < 0.f);
+}
+
+// AVX path: b[i] = acos(a[i]) for i in [0, n).
+// Full groups of 8 floats are processed with _mm256_can_acos_ps (defined earlier in this
+// file) using unaligned loads and stores; the n & 7 elements that do not fill a group are
+// handled one by one with acosf.
+static void acos_f32_vec(const float *a, float *b, int n)
+{
+    const int groups = n >> 3;
+    const int tail = n & 7;
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 in = _mm256_loadu_ps(a);
+        _mm256_storeu_ps(b, _mm256_can_acos_ps(in));
+    }
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = acosf(a[i]);
+}
+
+// Polynomial core of asin: returns a + a * sq * p(sq) with sq = a * a and p evaluated in
+// Horner form from r0 (highest order) down to r4.
+CAN_FORCEINLINE __m256 asinf_core_ps(__m256 a, __m256 r0, __m256 r1, __m256 r2, __m256 r3, __m256 r4)
+{
+    const __m256 sq = _mm256_mul_ps(a, a);
+    const __m256 lower_order[] = { r1, r2, r3, r4 };
+    __m256 p = r0;
+    for (const __m256 &c : lower_order)
+        p = _mm256_fmadd_ps(p, sq, c);
+    p = _mm256_mul_ps(p, sq);
+    return _mm256_fmadd_ps(p, a, a);
+}
+
+// asin evaluated through the reduction
+//     asin(|a|) = half_pi - 2 * asin_poly(sqrt((1 - |a|) / 2)),
+// with the sign of a removed first and put back on the result at the end.
+//
+// a:            input lanes
+// r0..r4:       polynomial coefficients, r0 being the highest order
+// one_256:      constant the magnitude is subtracted from
+// half_one_256: constant used both to halve (multiply) and to double (divide)
+// half_pi_256:  constant the doubled polynomial result is subtracted from
+// abs_flag:     bit mask that extracts the magnitude bits of a
+// sign_flag:    bit mask that extracts the sign bit of a
+// NOTE(review): the caller in this file passes 1.0f, 0.5f, pi / 2, 0x7FFFFFFF and
+// 0x80000000 for the last five parameters; the formula above assumes those values.
+CAN_FORCEINLINE __m256 asinf_core2_ps(__m256 a, __m256 r0, __m256 r1, __m256 r2, __m256 r3, __m256 r4, __m256 one_256, __m256 half_one_256, __m256 half_pi_256,
+    __m256i abs_flag, __m256i sign_flag)
+{
+    // Split the input into its sign bit and its magnitude.
+    const __m256i a_bits = _mm256_castps_si256(a);
+    const __m256i sign_bits = _mm256_and_si256(a_bits, sign_flag);
+    const __m256 magnitude = _mm256_castsi256_ps(_mm256_and_si256(a_bits, abs_flag));
+
+    // Reduced argument: sqrt((1 - |a|) / 2).
+    const __m256 reduced = _mm256_sqrt_ps(_mm256_mul_ps(half_one_256, _mm256_sub_ps(one_256, magnitude)));
+
+    // Same polynomial as asinf_core_ps, applied to the reduced argument.
+    const __m256 sq = _mm256_mul_ps(reduced, reduced);
+    const __m256 lower_order[] = { r1, r2, r3, r4 };
+    __m256 p = r0;
+    for (const __m256 &c : lower_order)
+        p = _mm256_fmadd_ps(p, sq, c);
+    p = _mm256_mul_ps(p, sq);
+    p = _mm256_fmadd_ps(p, reduced, reduced);
+
+    // half_pi - p / 0.5, i.e. half_pi minus twice the polynomial result.
+    const __m256 unsigned_result = _mm256_sub_ps(half_pi_256, _mm256_div_ps(p, half_one_256));
+
+    // Put the original sign bit back.
+    return _mm256_castsi256_ps(_mm256_or_si256(_mm256_castps_si256(unsigned_result), sign_bits));
+}
+
+// AVX path: b[i] = asin(a[i]) for i in [0, n).
+//
+// Each group of 8 lanes is evaluated twice: directly with the polynomial (asinf_core_ps) and
+// through the pi/2 - 2 * asin(sqrt((1 - |x|) / 2)) reduction (asinf_core2_ps). Lanes with
+// |x| > 0.5 keep the reduced result, all other lanes keep the direct one. The n & 7
+// elements that do not fill a group are handled one by one with asinf.
+//
+// Fixes relative to the previous version:
+//  - The two results were combined by AND-ing each with its own "not less than" mask and
+//    OR-ing the masked values. Both masks were set for |x| == 0.5 (and for NaN), so those
+//    lanes stored the bitwise OR of two different floats. A single mask and a blend now
+//    select exactly one result per lane.
+//  - pi / 2 was derived from the literal 3.1415926f, which rounds to the float just below
+//    pi; the correctly rounded float value of pi / 2 is used instead.
+void asinf_f32_vec(const float *a, float *b, int n)
+{
+    // Polynomial coefficients shared by both evaluation paths, highest order first.
+    const __m256 r0 = _mm256_set1_ps(0x1.a7f260p-5f);
+    const __m256 r1 = _mm256_set1_ps(0x1.29a5cep-6f);
+    const __m256 r2 = _mm256_set1_ps(0x1.7f0842p-5f);
+    const __m256 r3 = _mm256_set1_ps(0x1.329256p-4f);
+    const __m256 r4 = _mm256_set1_ps(0x1.555728p-3f);
+
+    const __m256 one_256 = _mm256_set1_ps(1.0f);
+    const __m256 half_one_256 = _mm256_set1_ps(0.5f);
+    // pi / 2 rounded to the nearest float.
+    const __m256 half_pi_256 = _mm256_set1_ps(0x1.921fb6p+0f);
+
+    // Masks selecting the magnitude bits and the sign bit of a float.
+    const __m256i abs_flag = _mm256_set1_epi32(0x7FFFFFFF);
+    const __m256i sign_flag = _mm256_castps_si256(_mm256_set1_ps(-0.0f));
+
+    const int groups = n >> 3; // number of full 8-float groups
+    const int tail = n & 7; // elements left after the full groups
+    for (int g = 0; g < groups; ++g, a += 8, b += 8)
+    {
+        const __m256 x = _mm256_loadu_ps(a);
+        const __m256 x_abs = _mm256_castsi256_ps(_mm256_and_si256(_mm256_castps_si256(x), abs_flag));
+
+        // Direct polynomial, kept for |x| <= 0.5.
+        const __m256 direct = asinf_core_ps(x, r0, r1, r2, r3, r4);
+        // Range-reduced evaluation, kept for |x| > 0.5.
+        const __m256 reduced = asinf_core2_ps(x, r0, r1, r2, r3, r4,
+            one_256, half_one_256, half_pi_256, abs_flag, sign_flag);
+
+        // Ordered compare: NaN lanes are false and keep the direct result, which is NaN
+        // as well because it is computed from x itself.
+        const __m256 use_reduced = _mm256_cmp_ps(x_abs, half_one_256, _CMP_GT_OQ);
+        _mm256_storeu_ps(b, _mm256_blendv_ps(direct, reduced, use_reduced));
+    }
+
+    // a and b now address the first element not covered by a full group.
+    for (int i = 0; i < tail; ++i)
+        b[i] = asinf(a[i]);
+}
+#else // X86_64_SIMD_ON
+
+// Portable path (X86_64_SIMD_ON not defined): stores roundf(a[i]) in b[i] for
+// every i in [0, n).
+static void round_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = roundf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores ceilf(a[i]) in b[i] for
+// every i in [0, n).
+static void ceil_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = ceilf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores floorf(a[i]) in b[i] for
+// every i in [0, n).
+static void floor_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = floorf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores sqrtf(a[i]) in b[i] for
+// every i in [0, n).
+static void sqrt_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = sqrtf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores 1.0f / sqrtf(a[i]) in b[i]
+// for every i in [0, n).
+static void rsqrt_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = 1.0f / sqrtf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores expf(a[i]) in b[i] for
+// every i in [0, n).
+static void exp_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = expf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores logf(a[i]) in b[i] for
+// every i in [0, n).
+static void log_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = logf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores cosf(a[i]) in b[i] for
+// every i in [0, n).
+static void cos_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = cosf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores sinf(a[i]) in b[i] for
+// every i in [0, n).
+// The input and output arrays are walked with the same index.
+static void sin_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = sinf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores -a[i] in b[i] for
+// every i in [0, n).
+static void negative_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = -(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores !a[i] in b[i] for every
+// i in [0, n); the bool result converts to 1.0f when a[i] compares equal to
+// zero and to 0.0f otherwise.
+static void logical_not_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = !a[i];
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores fabs(a[i]) in b[i] for
+// every i in [0, n).
+static void abs_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = fabs(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores tanhf(a[i]) in b[i] for
+// every i in [0, n).
+static void tanh_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = tanhf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores erff(a[i]) in b[i] for
+// every i in [0, n).
+static void erf_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = erff(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores the sign of a[i] (1, -1 or
+// 0, converted to float) in b[i] for every i in [0, n).
+static void sign_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = (0.f < a[i]) - (a[i] < 0.f);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores acosf(a[i]) in b[i] for
+// every i in [0, n).
+static void acos_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = acosf(a[i]);
+}
+
+// Portable path (X86_64_SIMD_ON not defined): stores asinf(a[i]) in b[i] for
+// every i in [0, n).
+static void asinf_f32_vec(const float *a, float *b, int n)
+{
+    for (int i = 0; i < n; i++)
+        b[i] = asinf(a[i]);
+}
+#endif // X86_64_SIMD_ON
+
 result<void> optimized::unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept
 {
-    return cpu::reference::unary(op, input, output, shape, in_strides, out_strides, context);
+    // Ops with a flat-array kernel in this file are dispatched to the matching *_f32_vec
+    // helper; every other op is forwarded to the reference implementation.
+    //
+    // NOTE(review): the helpers are given only the two pointers and the element count from
+    // compute_size(shape); in_strides and out_strides are not consulted on that path, so it
+    // presumably expects both buffers to be densely packed — confirm with the callers.
+    // NOTE(review): the element count is narrowed to int — confirm tensors cannot exceed
+    // INT_MAX elements.
+    using vec_kernel_t = void (*)(const float *, float *, int);
+    const int len = (int)compute_size(shape);
+
+    vec_kernel_t kernel = nullptr;
+    switch (op)
+    {
+    case unary_round:
+        kernel = round_f32_vec;
+        break;
+    case unary_ceil:
+        kernel = ceil_f32_vec;
+        break;
+    case unary_floor:
+        kernel = floor_f32_vec;
+        break;
+    case unary_sqrt:
+        kernel = sqrt_f32_vec;
+        break;
+    case unary_rsqrt:
+        kernel = rsqrt_f32_vec;
+        break;
+    case unary_exp:
+        kernel = exp_f32_vec;
+        break;
+    case unary_log:
+        kernel = log_f32_vec;
+        break;
+    case unary_cos:
+        kernel = cos_f32_vec;
+        break;
+    case unary_sin:
+        kernel = sin_f32_vec;
+        break;
+    case unary_neg:
+        kernel = negative_f32_vec;
+        break;
+    case unary_abs:
+        kernel = abs_f32_vec;
+        break;
+    case unary_logical_not:
+        kernel = logical_not_f32_vec;
+        break;
+    case unary_tanh:
+        kernel = tanh_f32_vec;
+        break;
+    case unary_erf:
+        kernel = erf_f32_vec;
+        break;
+    case unary_sign:
+        kernel = sign_f32_vec;
+        break;
+    case unary_acos:
+        kernel = acos_f32_vec;
+        break;
+    case unary_asin:
+        kernel = asinf_f32_vec;
+        break;
+    default:
+        break; // no flat-array kernel for this op
+    }
+
+    if (kernel == nullptr)
+    {
+        return cpu::reference::unary(op, input, output, shape, in_strides, out_strides, context);
+    }
+
+    kernel(input, output, len);
+    return ok();
 }
diff --git a/src/kernels/cpu/reference/CMakeLists.txt b/src/kernels/cpu/reference/CMakeLists.txt
index c24e49e55d..0c14cde7c8 100644
--- a/src/kernels/cpu/reference/CMakeLists.txt
+++ b/src/kernels/cpu/reference/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SRCS batch_to_space.cpp
          binary.cpp
          broadcast.cpp
          compare.cpp
+         compress.cpp
          concat.cpp
          convolution.cpp
          convert.cpp
@@ -11,7 +12,9 @@ set(SRCS batch_to_space.cpp
          cumsum.cpp
          dequantize.cpp
          gather.cpp
+         gather_elements.cpp
          gather_nd.cpp
+         gru.cpp
          hardmax.cpp
          lut1d.cpp
          matmul.cpp
@@ -29,9 +32,13 @@ set(SRCS batch_to_space.cpp
          sigmoid.cpp
          softmax.cpp
          slice.cpp
+         space_to_batch.cpp
          ternary.cpp
          topk.cpp
          transpose.cpp
          trilu.cpp
-         unary.cpp)
+         tflite_detection_postprocess.cpp
+         unary.cpp
+         layernorm.cpp
+         instancenorm.cpp)
 target_sources(kernels PRIVATE ${SRCS})
diff --git a/src/kernels/cpu/reference/compress.cpp b/src/kernels/cpu/reference/compress.cpp
new file mode 100644
index 0000000000..1bccc3a0be
--- /dev/null
+++ b/src/kernels/cpu/reference/compress.cpp
@@ -0,0 +1,61 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <chrono>
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+template result<void> reference::compress<float>(const float *input, const uint8_t *condition, float *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept;
+
+// Copies to `output`, in input order, the elements (axis == rank: input read as
+// a flat array) or the slices along `axis` whose `condition` byte is non-zero.
+// Only condition_shape[0] bytes of `condition` are consulted; positions along
+// `axis` at or past that count are dropped.
+template <class T>
+result<void> reference::compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept
+{
+    if (axis == (int)input_shape.size())
+    {
+        // Flat mode: condition[i] gates input[i].
+        for (size_t i = 0; i < condition_shape[0]; i++)
+        {
+            if (condition[i] != 0)
+                *output++ = input[i];
+        }
+        return ok();
+    }
+
+    // Element count of one slice along `axis` (product of the trailing dims).
+    size_t slice_size = 1;
+    for (size_t d = (size_t)axis + 1; d < input_shape.size(); d++)
+        slice_size *= input_shape[d];
+
+    // Index math stays in size_t so large tensors are not truncated to int.
+    const size_t total = kernels::detail::compute_size(input_shape);
+    for (size_t j = 0; j < total; j++)
+    {
+        // Coordinate of element j on `axis`; it selects the condition byte.
+        const size_t cond_index = (j / slice_size) % input_shape[axis];
+        if (cond_index < condition_shape[0] && condition[cond_index] != 0)
+            *output++ = input[j];
+    }
+    return ok();
+}
\ No newline at end of file
diff --git a/src/kernels/cpu/reference/convert.cpp b/src/kernels/cpu/reference/convert.cpp
index b5c6d26168..0de06d8c4c 100644
--- a/src/kernels/cpu/reference/convert.cpp
+++ b/src/kernels/cpu/reference/convert.cpp
@@ -30,7 +30,10 @@ result<void> convert_impl(const TInput *input, TOutput *output, const runtime_sh
 {
     return apply(in_shape, [&](const runtime_shape_t &index) -> result<void> {
         auto value = input[offset(in_strides, index)];
-        output[offset(out_strides, index)] = static_cast<TOutput>(value);
+        if (to_datatype<TOutput>() == dt_bfloat16)
+            output[offset(out_strides, index)] = bfloat16::round_to_bfloat16(static_cast<float>(value));
+        else
+            output[offset(out_strides, index)] = static_cast<TOutput>(value);
         return ok();
     });
 }
@@ -72,6 +75,7 @@ result<void> convert_f32_to_fp16_impl(const float *input, half *output, const ru
         CONVERT_IMPL_LV2(input_t, int32_t);  \
         CONVERT_IMPL_LV2(input_t, int64_t);  \
         CONVERT_IMPL_LV2(input_t, float);    \
+        CONVERT_IMPL_LV2(input_t, bfloat16); \
     }
 
 result<void> reference::convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output,
diff --git a/src/kernels/cpu/reference/gather_elements.cpp b/src/kernels/cpu/reference/gather_elements.cpp
new file mode 100644
index 0000000000..736ecd925a
--- /dev/null
+++ b/src/kernels/cpu/reference/gather_elements.cpp
@@ -0,0 +1,71 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+using namespace std;
+
+template result<void> reference::gather_elements(const float *input, const int64_t *indices, float *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept;
+
+// Gathers along `axis`: each output element keeps its own coordinates except on
+// `axis`, where the coordinate is taken from `indices` (negative values wrap).
+template <typename TI, typename TK>
+result<void> reference::gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept
+{
+    // indices_shape == output_shape
+    // out[i][j][k] = input[index[i][j][k]][j][k] if axis = 0,
+    // out[i][j][k] = input[i][index[i][j][k]][k] if axis = 1,
+    // out[i][j][k] = input[i][j][index[i][j][k]] if axis = 2,
+    // Per-dim element steps (product of the trailing dims) for indices and input.
+    const auto rank = indices_shape.size();
+    std::vector<int> out_steps(rank, 1);
+    std::vector<int> in_steps(rank, 1);
+    for (int d = static_cast<int>(rank) - 2; d >= 0; d--)
+    {
+        out_steps[d] = indices_shape[d + 1] * out_steps[d + 1];
+        in_steps[d] = in_shape[d + 1] * in_steps[d + 1];
+    }
+
+    const size_t count = compute_size(indices_shape);
+    for (size_t flat = 0; flat < count; flat++)
+    {
+        std::vector<int> coord;
+        get_gather_index(out_steps, coord, flat, axis, 0);
+        // offset of this element inside `indices`
+        int indices_offset = 0;
+        for (size_t t = 0; t < coord.size(); t++)
+            indices_offset += out_steps[t] * coord[t];
+
+        // replace the coordinate on `axis`, wrapping negative values
+        const auto picked = indices[indices_offset];
+        coord[axis] = picked < 0 ? picked + in_shape[axis] : picked;
+
+        // offset of the source element inside `input`
+        int input_offset = 0;
+        for (size_t t = 0; t < coord.size(); t++)
+            input_offset += in_steps[t] * coord[t];
+        output[flat] = input[input_offset];
+    }
+
+    return ok();
+}
diff --git a/src/kernels/cpu/reference/gru.cpp b/src/kernels/cpu/reference/gru.cpp
new file mode 100644
index 0000000000..d61b0bedb1
--- /dev/null
+++ b/src/kernels/cpu/reference/gru.cpp
@@ -0,0 +1,184 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+template result<void>
+reference::gru<float>(const float *input, const float *w, const float *r, const float *b, float *initial_h,
+    float *output, float *output_h, const runtime_shape_t &input_shape,
+    const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept;
+
+// Reference GRU: for every direction, walks the time steps and writes each step's
+// hidden state to `output` and the state after the last visited step to `output_h`.
+// `initial_h` is used as the running state and is overwritten in place.
+template <typename T>
+result<void> reference::gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h,
+    const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode,
+    bool linear_before_reset) noexcept
+{
+    const int seq_length = input_shape[0];
+    const int batch_size = input_shape[1];
+    const int input_size = input_shape[2];
+    const int num_direction = w_shape[0];
+    const int hidden_size = w_shape[1] / 3; // w stacks the z, r and h gate weights along this dim
+
+    auto sigmoid = [&](float x) {
+        return 1 / (1 + std::exp(-x));
+    };
+    auto tanh = [&](float x) {
+        return std::tanh(x);
+    };
+    runtime_shape_t out_shape { (size_t)seq_length, (size_t)num_direction, (size_t)batch_size, (size_t)hidden_size }; // layout of `output`; not read below
+
+    auto x_gate_size = batch_size * input_size; // input elements per time step
+    auto w_gate_size = 3 * hidden_size * input_size; // w elements per direction
+    auto h_t_size = batch_size * hidden_size; // hidden-state elements per direction
+    auto r_gate_size = 3 * hidden_size * hidden_size; // r elements per direction
+
+    auto tmp_a = std::vector<float>(batch_size * hidden_size, 0.f);
+    auto tmp_b = std::vector<float>(batch_size * hidden_size, 0.f);
+    auto gate_z = std::vector<float>(batch_size * hidden_size, 0.f);
+    auto gate_r = std::vector<float>(batch_size * hidden_size, 0.f);
+    auto gate_h = std::vector<float>(batch_size * hidden_size, 0.f);
+
+    std::vector<int> seq_len_loop;
+    for (int l = 0; l < seq_length; l++)
+        seq_len_loop.push_back(l);
+    if (mode == lstm_direction::kReverse)
+        std::reverse(seq_len_loop.begin(), seq_len_loop.end());
+    auto x_i = input;
+    auto h_t = initial_h;
+    auto w_i = w;
+    auto r_i = r;
+    auto b_i = b;
+    for (int d = 0; d < num_direction; d++)
+    {
+        h_t = initial_h + d * h_t_size;
+        w_i = w + d * w_gate_size;
+        r_i = r + d * r_gate_size;
+        b_i = b + d * 6 * hidden_size; // 3 input-side bias blocks, then 3 recurrent-side ones
+        if (d == 1) // the second direction visits the steps in the opposite order
+            std::reverse(seq_len_loop.begin(), seq_len_loop.end());
+        for (auto i : seq_len_loop)
+        {
+            x_i = input + i * x_gate_size;
+            // reset the gate accumulators for this time step
+            std::fill(gate_z.begin(), gate_z.end(), 0.f);
+            std::fill(gate_r.begin(), gate_r.end(), 0.f);
+            std::fill(gate_h.begin(), gate_h.end(), 0.f);
+
+            // reset the input-side (tmp_a) and recurrent-side (tmp_b) partial sums
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_z = x_i * w_i_z + b_w_z + h_t * r_i_z + b_r_z
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hs * input_size + is];
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[hs];
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hs * hidden_size + rs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[3 * hidden_size + hs];
+                    gate_z[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                }
+            }
+            // gate_z = sigmoid(gate_z);
+            std::transform(gate_z.begin(), gate_z.end(), gate_z.begin(), sigmoid);
+
+            // clear tmp_a tmp_b
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_r = x_i * w_i_r + b_w_r + h_t * r_i_r + b_r_r
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hidden_size * input_size + hs * input_size + is];
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[hidden_size + hs];
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hidden_size * hidden_size + hs * hidden_size + rs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[4 * hidden_size + hs];
+                    gate_r[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                }
+            }
+            // gate_r = sigmoid(gate_r);
+            std::transform(gate_r.begin(), gate_r.end(), gate_r.begin(), sigmoid);
+
+            // clear tmp_a tmp_b
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_h = x_i * w_i_h + b_w_h + (gate_r * h_t) * r_i_h + b_r_h; with linear_before_reset: ... + gate_r * (h_t * r_i_h + b_r_h)
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[2 * hidden_size * input_size + hs * input_size + is];
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[2 * hidden_size + hs];
+
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        if (!linear_before_reset)
+                            tmp_b[bs * hidden_size + hs] += gate_r[bs * hidden_size + rs] * h_t[bs * hidden_size + rs] * r_i[2 * hidden_size * hidden_size + hs * hidden_size + rs];
+                        else
+                            tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[2 * hidden_size * hidden_size + hs * hidden_size + rs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[5 * hidden_size + hs];
+
+                    if (!linear_before_reset)
+                        gate_h[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                    else
+                        gate_h[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + gate_r[bs * hidden_size + hs] * tmp_b[bs * hidden_size + hs];
+                }
+            }
+            // gate_h = tanh(gate_h);
+            std::transform(gate_h.begin(), gate_h.end(), gate_h.begin(), tanh);
+
+            for (int k = 0; k < batch_size * hidden_size; k++)
+            {
+                h_t[k] = (1 - gate_z[k]) * gate_h[k] + gate_z[k] * h_t[k]; // gate_z blends the candidate with the previous state
+                output[i * (num_direction * batch_size * hidden_size) + d * (batch_size * hidden_size) + k] = h_t[k];
+            }
+        }
+        // h_t now holds the state after the last visited step; store it as this direction's final state
+        for (int k = 0; k < batch_size * hidden_size; k++)
+        {
+            output_h[d * (batch_size * hidden_size) + k] = h_t[k];
+        }
+    }
+
+    return ok();
+}
diff --git a/src/kernels/cpu/reference/instancenorm.cpp b/src/kernels/cpu/reference/instancenorm.cpp
new file mode 100644
index 0000000000..d688aa77a1
--- /dev/null
+++ b/src/kernels/cpu/reference/instancenorm.cpp
@@ -0,0 +1,68 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cmath>
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+template result<void> reference::instancenorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept;
+
+// Normalizes every (batch, channel) plane of `input` to zero mean and unit
+// variance (biased, `epsilon` added before the square root), then applies
+// scale[c] and bias[c]. in_shape[0] is the batch count, in_shape[1] the channel
+// count; all further dims form one plane. Statistics accumulate in double.
+template <typename T>
+result<void> reference::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept
+{
+    const auto batches = static_cast<int>(in_shape[0]);
+    auto plane_size = 1;
+    for (size_t d = 2; d < in_shape.size(); d++)
+        plane_size *= static_cast<int>(in_shape[d]);
+
+    // One scratch buffer for all planes; every entry is rewritten per plane.
+    std::vector<double> centered(plane_size, 0.f);
+    for (auto n = 0; n < batches; n++)
+    {
+        for (size_t c = 0; c < in_shape[1]; c++)
+        {
+            const auto plane_offset = n * plane_size * in_shape[1] + c * plane_size;
+            const T *src = input + plane_offset;
+            T *dest = output + plane_offset;
+            // Each term is divided before it is added to the double accumulator.
+            double mean = 0.f;
+            for (auto i = 0; i < plane_size; i++)
+                mean += src[i] / plane_size;
+
+            double variance = 0.f;
+            for (auto i = 0; i < plane_size; i++)
+            {
+                centered[i] = (src[i] - mean);
+                variance += centered[i] * centered[i] / plane_size;
+            }
+            const double denom = std::sqrt(variance + epsilon);
+            for (auto i = 0; i < plane_size; i++)
+                dest[i] = centered[i] * scale[c] / denom + bias[c];
+        }
+    }
+
+    return ok();
+}
\ No newline at end of file
diff --git a/src/kernels/cpu/reference/layernorm.cpp b/src/kernels/cpu/reference/layernorm.cpp
new file mode 100644
index 0000000000..f8c57cbf2c
--- /dev/null
+++ b/src/kernels/cpu/reference/layernorm.cpp
@@ -0,0 +1,72 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cmath>
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+template result<void> reference::layernorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept;
+
+// Layer normalization over the trailing dims [axis, rank): every group of
+// inner_size consecutive elements is shifted to zero mean, divided by
+// sqrt(biased variance + epsilon), then scaled and shifted element-wise with
+// scale[i] / bias[i] (indexed within the group). A negative axis counts from the end.
+template <typename T>
+result<void> reference::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+    // Normalize a negative axis first: used as-is it would index in_shape
+    // out of bounds in the size loops below.
+    const auto rank = static_cast<int32_t>(in_shape.size());
+    if (axis < 0)
+        axis += rank;
+    if (axis < 0 || axis > rank)
+        return err(std::errc::invalid_argument);
+    auto outer_size = 1;
+    auto inner_size = 1;
+    for (auto i = 0; i < axis; i++)
+        outer_size *= in_shape[i];
+    for (auto i = axis; i < rank; i++)
+        inner_size *= in_shape[i];
+
+    // One scratch buffer for all groups; every entry is rewritten per group.
+    std::vector<float> centered(inner_size, 0.f);
+    for (int32_t batch = 0; batch < outer_size; batch++)
+    {
+        auto src = input + batch * inner_size;
+        auto dest = output + batch * inner_size;
+        float mean = 0.f;
+        for (auto i = 0; i < inner_size; i++)
+            mean += src[i] / inner_size;
+
+        float variance = 0.f;
+        for (auto i = 0; i < inner_size; i++)
+        {
+            centered[i] = src[i] - mean;
+            variance += centered[i] * centered[i] / inner_size;
+        }
+        const float denom = std::sqrt(variance + epsilon);
+        for (auto i = 0; i < inner_size; i++)
+            dest[i] = centered[i] / denom * scale[i] + bias[i];
+    }
+
+    return ok();
+}
\ No newline at end of file
diff --git a/src/kernels/cpu/reference/nnil.cpp b/src/kernels/cpu/reference/nnil.cpp
index ec66a0ab89..a417a6a704 100644
--- a/src/kernels/cpu/reference/nnil.cpp
+++ b/src/kernels/cpu/reference/nnil.cpp
@@ -78,6 +78,9 @@ result<void> reference::nnil_unary_method(const float *input, float *output, siz
             case nnil_floor:
                 stack.push(floorf(stack.pop()));
                 break;
+            case nnil_erf:
+                stack.push(erff(stack.pop()));
+                break;
             case nnil_log:
                 stack.push(logf(stack.pop()));
                 break;
diff --git a/src/kernels/cpu/reference/reduce.cpp b/src/kernels/cpu/reference/reduce.cpp
index ced8f804f5..da5a3f69dd 100644
--- a/src/kernels/cpu/reference/reduce.cpp
+++ b/src/kernels/cpu/reference/reduce.cpp
@@ -71,6 +71,9 @@ template result<void> reference::reduce<float>(reduce_op_t op, float init_value,
 template result<void> reference::reduce<int32_t>(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
 
+template result<void> reference::reduce<int64_t>(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+
 template <typename T>
 result<void> reference::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
diff --git a/src/kernels/cpu/reference/reduce_arg.cpp b/src/kernels/cpu/reference/reduce_arg.cpp
index 400b7289ba..e24ee758f5 100644
--- a/src/kernels/cpu/reference/reduce_arg.cpp
+++ b/src/kernels/cpu/reference/reduce_arg.cpp
@@ -92,7 +92,7 @@ result<void> reference::reduce_arg(reduce_arg_op_t op, const float *input, T *ou
         return reduce_arg_impl([](float a, float b) { return a < b; }, std::numeric_limits<float>::max(),
             input, output, in_shape, out_shape, in_strides, out_strides, axes, keep_dims, select_last_idx, context);
     case reduce_arg_max:
-        return reduce_arg_impl([](float a, float b) { return a > b; }, std::numeric_limits<float>::min(),
+        return reduce_arg_impl([](float a, float b) { return a > b; }, std::numeric_limits<float>::lowest(),
             input, output, in_shape, out_shape, in_strides, out_strides, axes, keep_dims, select_last_idx, context);
     default:
         return err(std::errc::not_supported);
diff --git a/src/kernels/cpu/reference/space_to_batch.cpp b/src/kernels/cpu/reference/space_to_batch.cpp
new file mode 100644
index 0000000000..fc732ef578
--- /dev/null
+++ b/src/kernels/cpu/reference/space_to_batch.cpp
@@ -0,0 +1,112 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+namespace
+{
+// Padded extent of every dim: in_shape[i] + paddings[i].sum() plus `interior`
+// extra elements for each of the in_shape[i] - 1 gaps between input elements.
+runtime_shape_t get_padded_shape(const runtime_shape_t &in_shape, const runtime_paddings_t &paddings)
+{
+    runtime_shape_t out_shape(in_shape.size());
+    for (size_t i = 0; i < in_shape.size(); i++)
+        out_shape[i] = (size_t)((int32_t)in_shape[i] + paddings[i].sum() + (in_shape[i] - 1) * paddings[i].interior);
+    return out_shape;
+}
+
+// Returns `input_shape` permuted by `perm`: result[i] = input_shape[perm[i]].
+inline runtime_shape_t get_transposed_shape(const runtime_shape_t &input_shape, const runtime_shape_t &perm)
+{
+    runtime_shape_t new_shape(input_shape.size());
+    for (size_t i = 0; i < new_shape.size(); i++)
+        new_shape[i] = input_shape[perm[i]];
+    return new_shape;
+}
+
+// Zero-pads the trailing block_shape.size() dims of `input`, views each padded dim
+// as (dim / block, block) and transposes so the block factors come first. The
+// result is written with default strides; out_strides is not consulted.
+template <class T>
+result<void> space_to_batch_impl(datatype_t dt, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &block_shape,
+    const runtime_paddings_t &paddings, const runtime_shape_t &in_strides, [[maybe_unused]] const runtime_shape_t &out_strides, kernel_context &context) noexcept
+{
+    auto in_shape_size = in_shape.size();
+    auto spatial_size = block_shape.size();
+    auto new_paddings = runtime_paddings_t(in_shape_size, { 0, 0, 0 });
+    for (size_t i = 0; i < spatial_size; ++i)
+        new_paddings[in_shape_size - spatial_size + i] = paddings[i];
+    auto pad_out_shape = get_padded_shape(in_shape, new_paddings);
+    // The scratch buffer must use the element type T: a float buffer is too
+    // small for 8-byte elements and the padded data would overrun it.
+    auto pad_output = std::make_unique<T[]>(compute_size(pad_out_shape));
+    auto pad_out_strides = get_default_strides(pad_out_shape);
+    scalar pad_value(0);
+    try_(reference::pad(dt, reinterpret_cast<const gsl::byte *>(input),
+        reinterpret_cast<gsl::byte *>(pad_output.get()), in_shape, in_strides,
+        pad_out_strides, new_paddings, pad_mode_t::pad_constant, pad_value, context));
+
+    runtime_shape_t new_shape;
+    new_shape.reserve(in_shape_size + spatial_size);
+    new_shape.assign(pad_out_shape.begin(), pad_out_shape.begin() + in_shape_size - spatial_size);
+
+    runtime_shape_t perms(in_shape_size - spatial_size);
+    perms.reserve(in_shape_size + spatial_size);
+    std::iota(perms.begin(), perms.begin() + in_shape_size - spatial_size, 0);
+    runtime_shape_t spatial_perms;
+    spatial_perms.reserve(spatial_size);
+    for (size_t i = 0; i < spatial_size; i++)
+    {
+        size_t idx = in_shape_size - spatial_size + i;
+        perms.push_back(new_shape.size());
+        new_shape.push_back(pad_out_shape[idx] / block_shape[i]);
+
+        spatial_perms.push_back(new_shape.size());
+        new_shape.push_back(block_shape[i]);
+    }
+
+    perms.insert(perms.begin(), spatial_perms.begin(), spatial_perms.end());
+    auto tp_shape = get_transposed_shape(new_shape, perms);
+    auto tp_stride = get_default_strides(tp_shape);
+    try_(reference::transpose(dt, reinterpret_cast<const gsl::byte *>(pad_output.get()), reinterpret_cast<gsl::byte *>(output), new_shape, perms, get_default_strides(new_shape), tp_stride, context));
+    return ok();
+}
+}
+
+// Emits one `case size:` that forwards to space_to_batch_impl with both buffers viewed as `type`.
+#define SPACE_TO_BATCH_IMPL(size, type) \
+    case size:                          \
+        return space_to_batch_impl(dt, reinterpret_cast<const type *>(input), reinterpret_cast<type *>(output), in_shape, block_shape, crops, in_strides, out_strides, context)
+result<void> reference::space_to_batch(datatype_t dt, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept
+{
+    switch (runtime::get_bytes(dt)) // only the element width in bytes picks the instantiation
+    {
+        SPACE_TO_BATCH_IMPL(1, uint8_t);
+        SPACE_TO_BATCH_IMPL(2, uint16_t);
+        SPACE_TO_BATCH_IMPL(4, uint32_t);
+        SPACE_TO_BATCH_IMPL(8, uint64_t);
+    default: // any other element width is rejected
+        return err(std::errc::not_supported);
+    }
+}
diff --git a/src/kernels/cpu/reference/ternary.cpp b/src/kernels/cpu/reference/ternary.cpp
index c8966a4135..e2f341396c 100644
--- a/src/kernels/cpu/reference/ternary.cpp
+++ b/src/kernels/cpu/reference/ternary.cpp
@@ -27,6 +27,16 @@ template result<void> reference::ternary<float>(const float *input_a, const floa
     const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
     const runtime_shape_t &out_strides) noexcept;
 
+template result<void> reference::ternary<int64_t>(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+template result<void> reference::ternary<int32_t>(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
 template <typename T>
 result<void> reference::ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
     const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
diff --git a/src/kernels/cpu/reference/tflite_detection_postprocess.cpp b/src/kernels/cpu/reference/tflite_detection_postprocess.cpp
new file mode 100644
index 0000000000..05251d1d6a
--- /dev/null
+++ b/src/kernels/cpu/reference/tflite_detection_postprocess.cpp
@@ -0,0 +1,376 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <chrono>
+#include <iostream>
+#include <nncase/kernels/cpu/reference/tensor_compute.h>
+#include <nncase/kernels/kernel_utils.h>
+#include <nncase/runtime/runtime_op_utility.h>
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::reference;
+
+template result<void> reference::tflite_detection_postprocess<float>(const float *boxes, const float *scores, const float *anchors, float *output_locations, float *output_classes, float *output_scores, float *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept;
+
+// Reference implementation of the TFLite detection post-process step: decodes
+// the centre-size box encodings against the anchors into corner boxes, then
+// runs either per-class ("regular") or class-agnostic ("fast") non-max
+// suppression and writes the kept boxes, class ids, scores and detection count.
+// Scores are read as rows of scores_shape[2] values per anchor; the first
+// (scores_shape[2] - num_classes) entries of each row are skipped.
+// Returns ok() on every path; no argument validation is performed.
+template <typename T>
+result<void> reference::tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept
+{
+    struct CenterSizeEncoding
+    {
+        float y;
+        float x;
+        float h;
+        float w;
+    };
+    struct BoxCornerEncoding
+    {
+        float ymin;
+        float xmin;
+        float ymax;
+        float xmax;
+    };
+    struct BoxInfo
+    {
+        int index;
+        float score;
+    };
+
+    // IoU of box[i] and box[j]; returns 0 when either box has non-positive area.
+    auto compute_iou = [&](const std::vector<BoxCornerEncoding> &box, const int &i, const int &j) {
+        auto &box_i = box[i];
+        auto &box_j = box[j];
+        const float area_i = (box_i.ymax - box_i.ymin) * (box_i.xmax - box_i.xmin);
+        const float area_j = (box_j.ymax - box_j.ymin) * (box_j.xmax - box_j.xmin);
+        if (area_i <= 0 || area_j <= 0)
+            return 0.f;
+        const float intersection_y_min = std::max<float>(box_i.ymin, box_j.ymin);
+        const float intersection_x_min = std::max<float>(box_i.xmin, box_j.xmin);
+        const float intersection_y_max = std::min<float>(box_i.ymax, box_j.ymax);
+        const float intersection_x_max = std::min<float>(box_i.xmax, box_j.xmax);
+        const float intersection_area = std::max<float>(intersection_y_max - intersection_y_min, 0.0) * std::max<float>(intersection_x_max - intersection_x_min, 0.0);
+        return intersection_area / (area_i + area_j - intersection_area);
+    };
+
+    const auto num_boxes = (int)anchors_shape[0];
+    const auto num_classes_with_background = (int)scores_shape[2]; // num_classes + background
+    const auto num_detections_per_class = std::min(detections_per_class, max_detections);
+    int label_offset = num_classes_with_background - num_classes;
+    // DecodeCenterSizeBoxes: fill decoded_boxes with corner-encoded boxes
+    std::vector<BoxCornerEncoding> decoded_boxes(boxes_shape[1]);
+    {
+        CenterSizeEncoding box_center_size;
+        CenterSizeEncoding scale_values { y_scale, x_scale, h_scale, w_scale };
+        CenterSizeEncoding anchor;
+
+        for (int index = 0; index < num_boxes; index++)
+        {
+            const auto box_encoding_index = index * boxes_shape[2];
+            box_center_size = *reinterpret_cast<const CenterSizeEncoding *>(boxes + box_encoding_index);
+            anchor = *reinterpret_cast<const CenterSizeEncoding *>(anchors + box_encoding_index);
+
+            auto y_center = static_cast<float>(static_cast<double>(box_center_size.y) / static_cast<double>(scale_values.y) * static_cast<double>(anchor.h) + static_cast<double>(anchor.y));
+            auto x_center = static_cast<float>(static_cast<double>(box_center_size.x) / static_cast<double>(scale_values.x) * static_cast<double>(anchor.w) + static_cast<double>(anchor.x));
+            auto half_h = static_cast<float>(0.5 * (std::exp(static_cast<double>(box_center_size.h) / static_cast<double>(scale_values.h))) * static_cast<double>(anchor.h));
+            auto half_w = static_cast<float>(0.5 * (std::exp(static_cast<double>(box_center_size.w) / static_cast<double>(scale_values.w))) * static_cast<double>(anchor.w));
+            decoded_boxes[index].ymin = y_center - half_h;
+            decoded_boxes[index].xmin = x_center - half_w;
+            decoded_boxes[index].ymax = y_center + half_h;
+            decoded_boxes[index].xmax = x_center + half_w;
+        }
+    }
+    // NMS MultiClass
+    {
+        if (use_regular_non_max_suppression)
+        {
+            // NMS Regular
+            int sorted_indices_size = 0;
+            // Room for max_detections merged results plus one class's new picks.
+            std::vector<BoxInfo> box_info_after_regular_nms(max_detections + num_detections_per_class);
+
+            // compute nms
+            std::vector<float> class_scores(num_boxes);
+            std::vector<int> selected;
+            selected.reserve(num_detections_per_class);
+            // NOTE(review): col stops at num_classes - 2, so the last class is never scored; confirm intended.
+            for (auto col = 0; col < num_classes - 1; col++)
+            {
+                const float *scores_base = scores + col + label_offset;
+                for (int row = 0; row < num_boxes; row++)
+                {
+                    // Get scores of boxes corresponding to all anchors for single class
+                    class_scores[row] = *scores_base;
+                    scores_base += num_classes_with_background;
+                }
+                // Perform non-maximal suppression on single class
+                selected.clear();
+
+                // NMS SingleClass
+                {
+                    std::vector<int> keep_indices;
+                    std::vector<float> keep_scores;
+                    // select detection box score above score threshold
+                    {
+                        for (size_t i = 0; i < class_scores.size(); i++)
+                        {
+                            if (class_scores[i] >= nms_score_threshold)
+                            {
+                                keep_scores.emplace_back(class_scores[i]);
+                                keep_indices.emplace_back(i);
+                            }
+                        }
+                    }
+
+                    int num_scores_kept = (int)keep_scores.size();
+                    std::vector<int> sorted_indices;
+                    sorted_indices.resize(num_scores_kept);
+                    // DecreasingArgSort
+                    {
+                        std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0);
+                        std::stable_sort(
+                            sorted_indices.begin(), sorted_indices.begin() + num_scores_kept,
+                            [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; });
+                    }
+
+                    // Cap picks per class at num_detections_per_class so the writes into box_info_after_regular_nms stay in bounds.
+                    const int output_size = std::min(num_scores_kept, num_detections_per_class);
+                    selected.clear();
+                    int num_active_candidate = num_scores_kept;
+                    std::vector<uint8_t> active_box_candidate(num_scores_kept, 1);
+                    for (int i = 0; i < num_scores_kept; ++i)
+                    {
+                        if (num_active_candidate == 0 || (int)selected.size() >= output_size)
+                            break;
+                        if (active_box_candidate[i] != 1)
+                            continue;
+                        selected.push_back(keep_indices[sorted_indices[i]]);
+                        active_box_candidate[i] = 0;
+                        num_active_candidate--;
+                        for (int j = i + 1; j < num_scores_kept; ++j)
+                        {
+                            if (active_box_candidate[j] == 1)
+                            {
+                                // Suppress lower-scored candidates that overlap the box just selected.
+                                float iou = compute_iou(
+                                    decoded_boxes, keep_indices[sorted_indices[i]],
+                                    keep_indices[sorted_indices[j]]);
+
+                                if (iou > nms_iou_threshold)
+                                {
+                                    active_box_candidate[j] = 0;
+                                    num_active_candidate--;
+                                }
+                            }
+                        }
+                    }
+                }
+                // end NMS SingleClass
+
+                if (selected.empty())
+                {
+                    continue;
+                }
+                for (size_t i = 0; i < selected.size(); ++i)
+                {
+                    box_info_after_regular_nms[sorted_indices_size + i].score = class_scores[selected[i]];
+                    box_info_after_regular_nms[sorted_indices_size + i].index = (selected[i] * num_classes_with_background + col + label_offset);
+                }
+
+                // In-place merge the original boxes and new selected boxes which are both
+                // sorted by scores.
+                std::inplace_merge(box_info_after_regular_nms.begin(), box_info_after_regular_nms.begin() + sorted_indices_size,
+                    box_info_after_regular_nms.begin() + sorted_indices_size + selected.size(),
+                    [](const BoxInfo &a, const BoxInfo &b) { return a.score >= b.score; });
+
+                sorted_indices_size = std::min(sorted_indices_size + static_cast<int>(selected.size()), max_detections);
+            }
+            // end compute nms result
+
+            // Allocate output tensors
+            for (int output_box_index = 0; output_box_index < max_detections; output_box_index++)
+            {
+                if (output_box_index < sorted_indices_size)
+                {
+                    const int anchor_index = floor(
+                        box_info_after_regular_nms[output_box_index].index / num_classes_with_background);
+                    const int class_index = box_info_after_regular_nms[output_box_index].index - anchor_index * num_classes_with_background - label_offset;
+                    const float selected_score = box_info_after_regular_nms[output_box_index].score;
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[output_box_index] = decoded_boxes[anchor_index];
+                    // detection_classes
+                    output_classes[output_box_index] = class_index;
+                    // detection_scores
+                    output_scores[output_box_index] = selected_score;
+                }
+                else
+                {
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[output_box_index] = { 0.0f, 0.0f, 0.0f, 0.0f };
+                    // detection_classes
+                    output_classes[output_box_index] = 0.0f;
+                    // detection_scores
+                    output_scores[output_box_index] = 0.0f;
+                }
+            }
+            output_num_detections[0] = sorted_indices_size;
+            box_info_after_regular_nms.clear();
+        }
+        else
+        {
+            // Fast NMS
+
+            const int max_categories_per_anchor = max_classes_per_detection;
+            const int num_categories_per_anchor = std::min(max_categories_per_anchor, num_classes);
+
+            std::vector<float> max_scores;
+            max_scores.resize(num_boxes);
+            std::vector<int> sorted_class_indices;
+            // One num_classes-wide slot per box: iota/partial_sort below touch all num_classes entries of a slot.
+            sorted_class_indices.resize(num_boxes * num_classes);
+
+            for (int row = 0; row < num_boxes; row++)
+            {
+                const float *box_scores = scores + row * num_classes_with_background + label_offset;
+                int *class_indices = sorted_class_indices.data() + row * num_classes;
+
+                // DecreasingPartialArgSort
+                if (num_categories_per_anchor == 1)
+                {
+                    auto arg_max_vector = [&](const T *input_data, int size) {
+                        T max_value = input_data[0];
+                        int max_index = 0;
+                        for (int i = 1; i < size; ++i)
+                        {
+                            // strict '>' keeps the lowest index when scores tie
+                            if (input_data[i] > max_value)
+                            {
+                                max_value = input_data[i];
+                                max_index = i;
+                            }
+                        }
+                        return max_index;
+                    };
+                    class_indices[0] = arg_max_vector(box_scores, num_classes);
+                }
+                else
+                {
+                    std::iota(class_indices, class_indices + num_classes, 0);
+                    std::partial_sort(
+                        class_indices, class_indices + num_categories_per_anchor, class_indices + num_classes,
+                        [&box_scores](const int i, const int j) { return box_scores[i] > box_scores[j]; });
+                }
+                // end DecreasingPartialArgSort
+
+                max_scores[row] = box_scores[class_indices[0]];
+            }
+            std::vector<int> selected;
+            // NMS SingleClass
+            {
+                std::vector<int> keep_indices;
+                std::vector<float> keep_scores;
+                // select detection box score above score threshold
+                {
+                    for (size_t i = 0; i < max_scores.size(); i++)
+                    {
+                        if (max_scores[i] >= nms_score_threshold)
+                        {
+                            keep_scores.emplace_back(max_scores[i]);
+                            keep_indices.emplace_back(i);
+                        }
+                    }
+                }
+
+                int num_scores_kept = (int)keep_scores.size();
+                std::vector<int> sorted_indices;
+                sorted_indices.resize(num_scores_kept);
+                // DecreasingArgSort
+                {
+                    std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0);
+                    std::stable_sort(
+                        sorted_indices.begin(), sorted_indices.begin() + num_scores_kept,
+                        [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; });
+                }
+                const int output_size = std::min(num_scores_kept, max_detections);
+                selected.clear();
+                int num_active_candidate = num_scores_kept;
+                std::vector<uint8_t> active_box_candidate(num_scores_kept, 1);
+                for (int i = 0; i < num_scores_kept; ++i)
+                {
+                    if (num_active_candidate == 0 || (int)selected.size() >= output_size)
+                        break;
+                    if (active_box_candidate[i] != 1)
+                        continue;
+                    selected.push_back(keep_indices[sorted_indices[i]]);
+                    active_box_candidate[i] = 0;
+                    num_active_candidate--;
+                    for (int j = i + 1; j < num_scores_kept; ++j)
+                    {
+                        if (active_box_candidate[j] == 1)
+                        {
+                            // Suppress lower-scored candidates that overlap the box just selected.
+                            float iou = compute_iou(
+                                decoded_boxes, keep_indices[sorted_indices[i]],
+                                keep_indices[sorted_indices[j]]);
+                            if (iou > nms_iou_threshold)
+                            {
+                                active_box_candidate[j] = 0;
+                                num_active_candidate--;
+                            }
+                        }
+                    }
+                }
+            }
+            // end NMS SingleClass
+
+            // Allocate output tensors
+            int output_box_index = 0;
+            for (const auto &selected_index : selected)
+            {
+                const float *box_scores = scores + selected_index * num_classes_with_background + label_offset;
+                const int *class_indices = sorted_class_indices.data() + selected_index * num_classes;
+
+                for (int col = 0; col < num_categories_per_anchor; ++col)
+                {
+                    int box_offset = max_categories_per_anchor * output_box_index + col;
+                    // detection_boxes
+                    reinterpret_cast<BoxCornerEncoding *>(output_locations)[box_offset] = decoded_boxes[selected_index];
+                    // detection_classes
+                    output_classes[box_offset] = class_indices[col];
+                    // detection_scores
+                    output_scores[box_offset] = box_scores[class_indices[col]];
+                }
+                output_box_index++;
+            }
+            output_num_detections[0] = output_box_index;
+        }
+    }
+
+    return ok();
+}
diff --git a/src/kernels/cpu/reference/unary.cpp b/src/kernels/cpu/reference/unary.cpp
index a9c242d483..53c07b0aad 100644
--- a/src/kernels/cpu/reference/unary.cpp
+++ b/src/kernels/cpu/reference/unary.cpp
@@ -62,6 +62,7 @@ result<void> reference::unary(unary_op_t op, const float *input, float *output,
         UNARY_IMPL(unary_sqrt, sqrtf);
         UNARY_IMPL(unary_square, [](float v) { return v * v; });
         UNARY_IMPL(unary_tanh, tanhf);
+        UNARY_IMPL(unary_erf, erff);
     default:
         return err(std::errc::not_supported);
     }
diff --git a/src/kernels/tensor_compute.cpp b/src/kernels/tensor_compute.cpp
index e60decc722..9e29a234e7 100644
--- a/src/kernels/tensor_compute.cpp
+++ b/src/kernels/tensor_compute.cpp
@@ -226,7 +226,7 @@ result<void> kernels::binary(binary_op_t op, const T *input_a, const T *input_b,
 result<void> kernels::unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept
 {
-    if (is_contiguous(shape, in_strides) && is_contiguous(shape, out_strides) && is_optimized_unary_op(op))
+    if (is_contiguous(shape, in_strides) && is_contiguous(shape, out_strides))
     {
         // optimization
         return cpu::optimized::unary(op, input, output, shape, in_strides, out_strides, context);
@@ -240,11 +240,15 @@ template result<void> kernels::reduce<float>(reduce_op_t op, float init_value, c
 template result<void> kernels::reduce<int32_t>(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
 
+template result<void> kernels::reduce<int64_t>(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+
 template <typename T>
 result<void> kernels::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
     const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
 {
-    return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+    // Forwards every argument unchanged to the optimized CPU kernel; the reference kernel is no longer called here.
+    return cpu::optimized::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
 }
 
 template result<void> kernels::reduce_arg<int32_t>(reduce_arg_op_t op, const float *input, int32_t *output, const runtime_shape_t &in_shape,
@@ -433,13 +437,18 @@ template result<void> kernels::ternary<float>(const float *input_a, const float
     const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
     const runtime_shape_t &out_strides) noexcept;
 
+template result<void> kernels::ternary<int64_t>(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
 template <typename T>
 result<void> kernels::ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
     const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
     const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
     const runtime_shape_t &out_strides) noexcept
 {
-    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides,
+    return cpu::optimized::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides,
         in_c_shape, in_c_strides, out_strides);
 }
 
@@ -467,3 +476,72 @@ result<void> kernels::trilu(const T *input, T *output, const runtime_shape_t &in
 {
     return cpu::reference::trilu(input, output, in_shape, upper, k);
 }
+
+template result<void> kernels::gru<float>(const float *input, const float *w, const float *r, const float *b, float *initial_h, float *output, float *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept;
+
+template <typename T>
+result<void> kernels::gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept
+{
+    return cpu::reference::gru(input, w, r, b, initial_h, output, output_h, input_shape, w_shape, mode, linear_before_reset);
+}
+
+template result<void> kernels::tflite_detection_postprocess<float>(const float *boxes, const float *scores, const float *anchors, float *output_locations, float *output_classes, float *output_scores, float *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept;
+
+template <typename T>
+result<void> kernels::tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections,
+    const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape,
+    const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class,
+    const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold,
+    const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept
+{
+    return cpu::reference::tflite_detection_postprocess(boxes, scores, anchors, output_locations, output_classes, output_scores, output_num_detections,
+        boxes_shape, scores_shape, anchors_shape,
+        max_detections, max_classes_per_detection, detections_per_class,
+        use_regular_non_max_suppression, nms_score_threshold, nms_iou_threshold,
+        num_classes, y_scale, x_scale, h_scale, w_scale);
+}
+
+result<void> kernels::space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept
+{
+    return cpu::reference::space_to_batch(type, input, output, in_shape, block_shape, crops, in_strides, out_strides, context);
+}
+
+template result<void> kernels::gather_elements(const float *input, const int64_t *indices, float *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept;
+
+template <typename TI, typename TK>
+result<void> kernels::gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape,
+    const runtime_shape_t &indices_shape, const int axis) noexcept
+{
+    return cpu::reference::gather_elements(input, indices, output, in_shape, indices_shape, axis);
+}
+
+template result<void> kernels::instancenorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept;
+
+template <typename T>
+result<void> kernels::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept
+{
+    return cpu::optimized::instancenorm(input, output, scale, bias, in_shape, epsilon);
+}
+
+template result<void> kernels::layernorm<float>(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept;
+
+template <typename T>
+result<void> kernels::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+    // Forwards every argument unchanged to the optimized CPU kernel; the reference kernel is not called here.
+    return cpu::optimized::layernorm(input, output, scale, bias, in_shape, axis, epsilon);
+}
+
+template result<void> kernels::compress<float>(const float *input, const uint8_t *condition, float *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept;
+
+template <typename T>
+result<void> kernels::compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept
+{
+    return cpu::reference::compress(input, condition, output, input_shape, condition_shape, axis);
+}
diff --git a/src/nncase/compiler.cpp b/src/nncase/compiler.cpp
index c6c6e95466..d75a2c74ff 100644
--- a/src/nncase/compiler.cpp
+++ b/src/nncase/compiler.cpp
@@ -488,13 +488,16 @@ class compiler_impl : public compiler
             pmgr.add_pass<add_copy_to_concat_pass>();
             pmgr.add_pass<add_copy_to_slice_pass>();
             pmgr.add_pass<add_copy_to_output_pass>();
+            pmgr.add_pass<add_copy_to_bitcast_pass>();
 
             transform_pass pass("optimize_copy");
             pass.emplace<remove_exclusive_copy_to_output_transform>();
             pass.emplace<remove_simple_copy_from_slice_transform>();
             pass.emplace<remove_non_simple_copy_from_slice_transform>();
             pass.emplace<remove_exclusive_copy_to_concat_transform>();
-            pmgr.add_pass(std::move(pass)); });
+            pass.emplace<remove_exclusive_copy_to_bitcast_transform>();
+            pmgr.add_pass(std::move(pass));
+        });
     }
 
     void optimize_target_dependent_after_buffer_fusion(ir::graph &graph)
diff --git a/src/runtime/op_profile.cpp b/src/runtime/op_profile.cpp
index 0cd8633951..6f717f6d51 100644
--- a/src/runtime/op_profile.cpp
+++ b/src/runtime/op_profile.cpp
@@ -32,13 +32,34 @@ void op_profile::print()
         [=](std::pair<std::string, double> &a, std::pair<std::string, double> &b) { return a.second > b.second; });
 
     std::cout << "stackvm OPs profile" << std::endl;
-    std::cout << std::setw(24) << std::left << "stackvm tensor op" << std::setw(12) << std::left << "timing(ms)"
-              << std::setw(12) << std::left << "percent(%)" << std::endl;
+    std::cout << "|" << std::setw(30) << std::left << "stackvm tensor op"
+              << "|" << std::setw(12) << std::left << "timing(ms)"
+              << "|" << std::setw(12) << std::left << "percent(%)"
+              << "|" << std::endl;
+
+    std::cout << "|" << std::setw(30) << std::left << "---"
+              << "|" << std::setw(12) << std::left << "---"
+              << "|" << std::setw(12) << std::left << "---"
+              << "|" << std::endl;
+#if !defined(__riscv)
+    double convert_number = 1.0f;
+#else
+    double convert_number = RISCVFREQUENCY / 1000.0f;
+#endif
+
     for (auto e : v)
     {
-        std::cout << std::setw(24) << std::left << e.first << std::setw(12) << std::left << e.second
-                  << std::setw(12) << std::left << e.second / total * 100 << std::endl;
+        std::cout << "|" << std::setw(30) << std::left << e.first
+                  << "|" << std::setw(12) << std::left << e.second / convert_number
+                  << "|" << std::setw(12) << std::left << e.second / total * 100
+                  << "|" << std::endl;
     }
-    std::cout << std::setw(24) << std::left << "total" << std::setw(12) << std::left << total
-              << std::setw(12) << std::left << total / total * 100 << std::endl;
+
+    std::cout << "|" << std::setw(30) << std::left << "total"
+              << "|" << std::setw(12) << std::left << total / convert_number
+              << "|" << std::setw(12) << std::left << total / total * 100
+              << "|" << std::endl
+              << std::endl;
+
+    op_timing_.clear();
 }
\ No newline at end of file
diff --git a/src/runtime/stackvm/CMakeLists.txt b/src/runtime/stackvm/CMakeLists.txt
index a26e7675c8..7aaeaf3755 100644
--- a/src/runtime/stackvm/CMakeLists.txt
+++ b/src/runtime/stackvm/CMakeLists.txt
@@ -1,48 +1,55 @@
-﻿cmake_minimum_required (VERSION 3.13)
+﻿cmake_minimum_required(VERSION 3.13)
 
 set(SRCS runtime_module.cpp
-         runtime_function.cpp
-         op_reader.cpp
-         evaluate_stack.cpp
-         ops/control.cpp
-         ops/loadstore.cpp
-         ops/stack.cpp
-         ops/scalar.cpp
-         ops/conversion.cpp
-         ops/tensor.batch_to_space.cpp
-         ops/tensor.binary.cpp
-         ops/tensor.broadcast.cpp
-         ops/tensor.call.cpp
-         ops/tensor.compare.cpp
-         ops/tensor.conv2d.cpp
-         ops/tensor.convert.cpp
-         ops/tensor.copy.cpp
-         ops/tensor.cumsum.cpp
-         ops/tensor.dequantize.cpp
-         ops/tensor.gather.cpp
-         ops/tensor.gather_nd.cpp
-         ops/tensor.hardmax.cpp
-         ops/tensor.lut1d.cpp
-         ops/tensor.matmul.cpp
-         ops/tensor.onehot.cpp
-         ops/tensor.pad.cpp
-         ops/tensor.quantize.cpp
-         ops/tensor.random_normal.cpp
-         ops/tensor.random_uniform.cpp
-         ops/tensor.reduce.cpp
-         ops/tensor.reduce_arg.cpp
-         ops/tensor.reduce_prod.cpp
-         ops/tensor.reduce_window2d.cpp
-         ops/tensor.resize_image.cpp
-         ops/tensor.roi_align.cpp
-         ops/tensor.sigmoid.cpp
-         ops/tensor.slice.cpp
-         ops/tensor.softmax.cpp
-         ops/tersor.ternary.cpp
-         ops/tensor.topk.cpp
-         ops/tensor.transpose.cpp
-         ops/tensor.trilu.cpp
-         ops/tensor.unary.cpp)
+        runtime_function.cpp
+        op_reader.cpp
+        evaluate_stack.cpp
+        ops/control.cpp
+        ops/loadstore.cpp
+        ops/stack.cpp
+        ops/scalar.cpp
+        ops/conversion.cpp
+        ops/tensor.batch_to_space.cpp
+        ops/tensor.binary.cpp
+        ops/tensor.broadcast.cpp
+        ops/tensor.call.cpp
+        ops/tensor.compare.cpp
+        ops/tensor.compress.cpp
+        ops/tensor.conv2d.cpp
+        ops/tensor.convert.cpp
+        ops/tensor.copy.cpp
+        ops/tensor.cumsum.cpp
+        ops/tensor.dequantize.cpp
+        ops/tensor.gather.cpp
+        ops/tensor.gather_elements.cpp
+        ops/tensor.gather_nd.cpp
+        ops/tensor.gru.cpp
+        ops/tensor.hardmax.cpp
+        ops/tensor.lut1d.cpp
+        ops/tensor.matmul.cpp
+        ops/tensor.onehot.cpp
+        ops/tensor.pad.cpp
+        ops/tensor.quantize.cpp
+        ops/tensor.random_normal.cpp
+        ops/tensor.random_uniform.cpp
+        ops/tensor.reduce.cpp
+        ops/tensor.reduce_arg.cpp
+        ops/tensor.reduce_prod.cpp
+        ops/tensor.reduce_window2d.cpp
+        ops/tensor.resize_image.cpp
+        ops/tensor.roi_align.cpp
+        ops/tensor.sigmoid.cpp
+        ops/tensor.slice.cpp
+        ops/tensor.softmax.cpp
+        ops/tensor.space_to_batch.cpp
+        ops/tersor.ternary.cpp # sic: same spelling as the entry this replaces; presumably the on-disk file name - verify before "fixing"
+        ops/tensor.topk.cpp
+        ops/tensor.transpose.cpp
+        ops/tensor.trilu.cpp
+        ops/tensor.tflite_detection_postprocess.cpp
+        ops/tensor.unary.cpp
+        ops/tensor.layernorm.cpp # the two normalization sources are appended after unary.cpp rather than sorted in
+        ops/tensor.instancenorm.cpp)
 
 if (BUILDING_RUNTIME)
     add_library(runtime_stackvm OBJECT ${SRCS})
@@ -50,9 +57,9 @@ if (BUILDING_RUNTIME)
     target_link_libraries(runtime_stackvm PRIVATE kernels)
     set_property(TARGET runtime_stackvm PROPERTY POSITION_INDEPENDENT_CODE ON)
     install(TARGETS runtime_stackvm EXPORT nncaseruntimeTargets)
-else()
+else ()
     add_library(simulator_stackvm OBJECT ${SRCS})
     target_link_libraries(simulator_stackvm PUBLIC simulator)
     target_link_libraries(simulator_stackvm PRIVATE kernels)
     set_property(TARGET simulator_stackvm PROPERTY POSITION_INDEPENDENT_CODE ON)
-endif()
+endif ()
diff --git a/src/runtime/stackvm/evaluate_stack.h b/src/runtime/stackvm/evaluate_stack.h
index e408c2bc2a..c9a7f738c9 100644
--- a/src/runtime/stackvm/evaluate_stack.h
+++ b/src/runtime/stackvm/evaluate_stack.h
@@ -85,6 +85,7 @@ class stack_entry
     int8_t as_i1() const noexcept { return (int8_t)i_; }
     int16_t as_i2() const noexcept { return (int16_t)i_; }
     int32_t as_i4() const noexcept { return (int32_t)i_; }
+    int64_t as_i8() const noexcept { return (int64_t)i_; } // NOTE(review): as_i() returns i_ as intptr_t, so i_ looks pointer-sized; on 32-bit targets it could not carry a full 64-bit value - confirm
     uintptr_t as_u() const noexcept { return (uintptr_t)i_; }
     intptr_t as_i() const noexcept { return i_; }
 
diff --git a/src/runtime/stackvm/op_reader.cpp b/src/runtime/stackvm/op_reader.cpp
index c7509e4480..e36fcabf11 100644
--- a/src/runtime/stackvm/op_reader.cpp
+++ b/src/runtime/stackvm/op_reader.cpp
@@ -1,4 +1,4 @@
-/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00.
+/* This file is generated by tools/stackvm_gen/IsaGen at 5/9/2023 5:18:43 PM +08:00.
  *
  * Copyright 2019-2021 Canaan Inc.
  *
@@ -232,6 +232,13 @@ result<void> op_visitor::next() noexcept
 #endif
             return visit(op_reader<tensor_softmax_op_t>()(reader_));
         }
+        case tensor_function_t::SPACE_TO_BATCH:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_space_to_batch");
+#endif
+            return visit(op_reader<tensor_space_to_batch_op_t>()(reader_));
+        }
         case tensor_function_t::TERNARY:
         {
 #if defined ENABLE_OP_PROFILE
@@ -267,6 +274,48 @@ result<void> op_visitor::next() noexcept
 #endif
             return visit(op_reader<tensor_transpose_op_t>()(reader_));
         }
+        case tensor_function_t::GRU:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_gru");
+#endif
+            return visit(op_reader<tensor_gru_op_t>()(reader_));
+        }
+        case tensor_function_t::TFLITE_DETECTION_POSTPROCESS:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_tflite_detection_postprocess");
+#endif
+            return visit(op_reader<tensor_tflite_detection_postprocess_op_t>()(reader_));
+        }
+        case tensor_function_t::LAYER_NORMALIZATION:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_layer_normalization");
+#endif
+            return visit(op_reader<tensor_layer_normalization_op_t>()(reader_));
+        }
+        case tensor_function_t::COMPRESS:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_compress");
+#endif
+            return visit(op_reader<tensor_compress_op_t>()(reader_));
+        }
+        case tensor_function_t::GATHER_ELEMENTS:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_gather_elements");
+#endif
+            return visit(op_reader<tensor_gather_elements_op_t>()(reader_));
+        }
+        case tensor_function_t::INSTANCE_NORMALIZATION:
+        {
+#if defined ENABLE_OP_PROFILE
+            op_profile st("tensor_instance_normalization");
+#endif
+            return visit(op_reader<tensor_instance_normalization_op_t>()(reader_));
+        }
         default:
             break;
         }
@@ -480,8 +529,7 @@ result<void> op_visitor::visit(gsl::span<const gsl::byte> text) noexcept
         try_(next());
 
 #ifdef ENABLE_OP_PROFILE
-    op_profile profile_time;
-    profile_time.print();
+    op_profile::print();
 #endif
 
     return ok();
diff --git a/src/runtime/stackvm/ops/tensor.compress.cpp b/src/runtime/stackvm/ops/tensor.compress.cpp
new file mode 100644
index 0000000000..0e1a9688ed
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.compress.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <iostream>
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/debug.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+// Pops the output, condition and input addresses and forwards them, with both shapes and op.axis, to kernels::compress.
+result<void> stackvm_runtime_function::visit(const tensor_compress_op_t &op) noexcept
+{
+    try_var(dst_addr, pop_addr());
+    try_var(cond_addr, pop_addr());
+    try_var(src_addr, pop_addr());
+    try_var(src_dims, module().shape_reg(op.input_shape_src));
+    try_var(cond_dims, module().shape_reg(op.condition_shape_src));
+    return kernels::compress(reinterpret_cast<const float *>(src_addr), reinterpret_cast<const uint8_t *>(cond_addr),
+        reinterpret_cast<float *>(dst_addr), src_dims, cond_dims, op.axis);
+}
diff --git a/src/runtime/stackvm/ops/tensor.gather_elements.cpp b/src/runtime/stackvm/ops/tensor.gather_elements.cpp
new file mode 100644
index 0000000000..b668a35911
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.gather_elements.cpp
@@ -0,0 +1,35 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+// Forwards the float input, int64 indices and float output buffers to kernels::gather_elements together with op.axis.
+result<void> stackvm_runtime_function::visit(const tensor_gather_elements_op_t &op) noexcept
+{
+    // The evaluation stack yields the operands in the order: output, indices, input.
+    try_var(dst_addr, pop_addr());
+    try_var(idx_addr, pop_addr());
+    try_var(src_addr, pop_addr());
+    try_var(src_dims, module().shape_reg(op.input_shape_src));
+    try_var(idx_dims, module().shape_reg(op.indices_shape_src));
+    return kernels::gather_elements(reinterpret_cast<const float *>(src_addr), reinterpret_cast<const int64_t *>(idx_addr),
+        reinterpret_cast<float *>(dst_addr), src_dims, idx_dims, op.axis);
+}
\ No newline at end of file
diff --git a/src/runtime/stackvm/ops/tensor.gru.cpp b/src/runtime/stackvm/ops/tensor.gru.cpp
new file mode 100644
index 0000000000..e80c5e44ae
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.gru.cpp
@@ -0,0 +1,41 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+// Pops the seven GRU operand addresses and hands them, viewed as float buffers, to kernels::gru.
+result<void> stackvm_runtime_function::visit(const tensor_gru_op_t &op) noexcept
+{
+    // Pop order: output_h, output, initial_h, b, r, w, input.
+    try_var(yh_addr, pop_addr());
+    try_var(y_addr, pop_addr());
+    try_var(h0_addr, pop_addr());
+    try_var(b_addr, pop_addr());
+    try_var(r_addr, pop_addr());
+    try_var(w_addr, pop_addr());
+    try_var(x_addr, pop_addr());
+    try_var(x_dims, module().shape_reg(op.input_shape_src));
+    try_var(w_dims, module().shape_reg(op.w_shape_src));
+    return kernels::gru(reinterpret_cast<const float *>(x_addr), reinterpret_cast<const float *>(w_addr),
+        reinterpret_cast<const float *>(r_addr), reinterpret_cast<const float *>(b_addr),
+        reinterpret_cast<float *>(h0_addr), reinterpret_cast<float *>(y_addr),
+        reinterpret_cast<float *>(yh_addr), x_dims, w_dims, op.direction, op.linear_before_reset);
+}
\ No newline at end of file
diff --git a/src/runtime/stackvm/ops/tensor.instancenorm.cpp b/src/runtime/stackvm/ops/tensor.instancenorm.cpp
new file mode 100644
index 0000000000..ebcaa1ced7
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.instancenorm.cpp
@@ -0,0 +1,42 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <iostream>
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/debug.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+result<void> stackvm_runtime_function::visit(const tensor_instance_normalization_op_t &op) noexcept
+{
+    try_var(output, pop_addr());
+    try_var(bias, pop_addr());
+    try_var(scale, pop_addr());
+    try_var(input, pop_addr());
+    try_var(in_shape, module().shape_reg(op.input_shape));
+
+    switch (op.datatype)
+    {
+    case dt_float32:
+        return kernels::instancenorm(reinterpret_cast<const float *>(input), reinterpret_cast<float *>(output),
+            reinterpret_cast<float *>(scale), reinterpret_cast<float *>(bias), in_shape, op.epsilon);
+        break;
+    default:
+        std::cerr << "unsupported dtype for instancenorm: " + std::string(datatype_names(op.datatype));
+        return err(std::errc::invalid_argument);
+    }
+}
diff --git a/src/runtime/stackvm/ops/tensor.layernorm.cpp b/src/runtime/stackvm/ops/tensor.layernorm.cpp
new file mode 100644
index 0000000000..547edd658c
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.layernorm.cpp
@@ -0,0 +1,42 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <iostream>
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/debug.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+result<void> stackvm_runtime_function::visit(const tensor_layer_normalization_op_t &op) noexcept
+{
+    try_var(output, pop_addr());
+    try_var(bias, pop_addr());
+    try_var(scale, pop_addr());
+    try_var(input, pop_addr());
+    try_var(in_shape, module().shape_reg(op.input_shape));
+
+    switch (op.datatype)
+    {
+    case dt_float32:
+        return kernels::layernorm(reinterpret_cast<const float *>(input), reinterpret_cast<float *>(output),
+            reinterpret_cast<float *>(scale), reinterpret_cast<float *>(bias), in_shape, op.axis, op.epsilon);
+        break;
+    default:
+        std::cerr << "unsupported dtype for layernorm: " + std::string(datatype_names(op.datatype));
+        return err(std::errc::invalid_argument);
+    }
+}
diff --git a/src/runtime/stackvm/ops/tensor.reduce.cpp b/src/runtime/stackvm/ops/tensor.reduce.cpp
index 19d1f71a17..10b9b18a68 100644
--- a/src/runtime/stackvm/ops/tensor.reduce.cpp
+++ b/src/runtime/stackvm/ops/tensor.reduce.cpp
@@ -41,6 +41,10 @@ result<void> stackvm_runtime_function::visit(const tensor_reduce_op_t &op) noexc
         return kernels::reduce(op.reduce_op, init_value.as_i4(), reinterpret_cast<const int32_t *>(input),
             reinterpret_cast<int32_t *>(output), in_shape, axis, in_strides, out_strides, op.keep_dims, module().kernel_context());
         break;
+    case dt_int64:
+        return kernels::reduce(op.reduce_op, init_value.as_i8(), reinterpret_cast<const int64_t *>(input),
+            reinterpret_cast<int64_t *>(output), in_shape, axis, in_strides, out_strides, op.keep_dims, module().kernel_context());
+        break;
     default:
         std::cerr << "unsupported dtype for reduce: " + std::string(datatype_names(op.datatype)) << std::endl;
         return err(std::errc::invalid_argument);
diff --git a/src/runtime/stackvm/ops/tensor.space_to_batch.cpp b/src/runtime/stackvm/ops/tensor.space_to_batch.cpp
new file mode 100644
index 0000000000..b8041042a2
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.space_to_batch.cpp
@@ -0,0 +1,34 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <nncase/kernels/tensor_compute.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+// Passes the popped buffers as raw bytes, plus op.datatype and the shape/padding/stride registers, to kernels::space_to_batch.
+result<void> stackvm_runtime_function::visit(const tensor_space_to_batch_op_t &op) noexcept
+{
+    try_var(dst_addr, pop_addr());
+    try_var(src_addr, pop_addr());
+    try_var(src_dims, module().shape_reg(op.rshape_src));
+    try_var(block_dims, module().shape_reg(op.rshape_block));
+    try_var(crop_pads, module().paddings_reg(op.rpad_crops));
+    try_var(src_strides, module().shape_reg(op.rstride_src));
+    try_var(dst_strides, module().shape_reg(op.rstride_dest));
+    return kernels::space_to_batch(op.datatype, reinterpret_cast<const gsl::byte *>(src_addr), reinterpret_cast<gsl::byte *>(dst_addr),
+        src_dims, block_dims, crop_pads, src_strides, dst_strides, module().kernel_context());
+}
diff --git a/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp b/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp
new file mode 100644
index 0000000000..633b6305fa
--- /dev/null
+++ b/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp
@@ -0,0 +1,44 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "../runtime_function.h"
+#include <nncase/kernels/tensor_compute.h>
+#include <nncase/runtime/interpreter.h>
+#include <nncase/runtime/runtime_op_utility.h>
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::runtime::stackvm;
+
+// Pops four output and three input addresses and forwards them, as float buffers with the op's parameters, to the kernel.
+result<void> stackvm_runtime_function::visit(const tensor_tflite_detection_postprocess_op_t &op) noexcept
+{
+    // Pop order: output_num_detections, output_scores, output_classes, output_locations, anchor, score, box.
+    try_var(count_addr, pop_addr());
+    try_var(conf_addr, pop_addr());
+    try_var(cls_addr, pop_addr());
+    try_var(loc_addr, pop_addr());
+    try_var(anchor_addr, pop_addr());
+    try_var(score_addr, pop_addr());
+    try_var(box_addr, pop_addr());
+    try_var(box_dims, module().shape_reg(op.box_shape_src));
+    try_var(score_dims, module().shape_reg(op.score_shape_src));
+    try_var(anchor_dims, module().shape_reg(op.anchor_shape_src));
+    return kernels::tflite_detection_postprocess(
+        reinterpret_cast<const float *>(box_addr), reinterpret_cast<const float *>(score_addr), reinterpret_cast<const float *>(anchor_addr),
+        reinterpret_cast<float *>(loc_addr), reinterpret_cast<float *>(cls_addr), reinterpret_cast<float *>(conf_addr), reinterpret_cast<float *>(count_addr),
+        box_dims, score_dims, anchor_dims, op.max_detections, op.max_classes_per_detection, op.detections_per_class,
+        op.use_regular_non_max_suppression, op.nms_score_threshold, op.nms_iou_threshold,
+        op.num_classes, op.y_scale, op.x_scale, op.h_scale, op.w_scale);
+}
diff --git a/src/runtime/stackvm/runtime_function.cpp b/src/runtime/stackvm/runtime_function.cpp
index 7942eec925..bd58bf0702 100644
--- a/src/runtime/stackvm/runtime_function.cpp
+++ b/src/runtime/stackvm/runtime_function.cpp
@@ -143,13 +143,19 @@ result<runtime_tensor> stackvm_runtime_function::create_tensor(uintptr_t addr, d
 {
     hrt::memory_pool_t pool;
     uintptr_t physical_address = 0;
-    if (addr >= reinterpret_cast<uintptr_t>(module().data().begin())
-        && addr < reinterpret_cast<uintptr_t>(module().data().end()))
+    auto data_span = module().data();
+    auto rdata_span = module().rdata();
+
+    if (addr >= reinterpret_cast<uintptr_t>(data_span.begin())
+        && addr < reinterpret_cast<uintptr_t>(data_span.end()))
     {
-        pool = hrt::pool_cpu_only;
+        auto &tensor = module().data_tensor();
+        auto &block = static_cast<const detail::host_runtime_tensor_impl *>(tensor.impl())->memory_block();
+        pool = block.pool;
+        physical_address = block.physical_block.physical_address + (addr - block.virtual_address);
     }
-    else if (addr >= reinterpret_cast<uintptr_t>(module().rdata().begin())
-        && addr < reinterpret_cast<uintptr_t>(module().rdata().end()))
+    else if (addr >= reinterpret_cast<uintptr_t>(rdata_span.begin())
+        && addr < reinterpret_cast<uintptr_t>(rdata_span.end()))
     {
         pool = hrt::pool_cpu_only;
     }
diff --git a/src/runtime/stackvm/runtime_function.h b/src/runtime/stackvm/runtime_function.h
index 6b4ca2cce0..0e10304d41 100644
--- a/src/runtime/stackvm/runtime_function.h
+++ b/src/runtime/stackvm/runtime_function.h
@@ -142,14 +142,17 @@ class stackvm_runtime_function : public runtime_function, private op_visitor
     result<void> visit(const tensor_broadcast_op_t &op) noexcept override;
     result<void> visit(const tensor_call_op_t &op) noexcept override;
     result<void> visit(const tensor_compare_op_t &op) noexcept override;
+    result<void> visit(const tensor_compress_op_t &op) noexcept override;
     result<void> visit(const tensor_conv2d_op_t &op) noexcept override;
     result<void> visit(const tensor_convert_op_t &op) noexcept override;
     result<void> visit(const tensor_copy_op_t &op) noexcept override;
     result<void> visit(const tensor_cumsum_op_t &op) noexcept override;
     result<void> visit(const tensor_dequantize_op_t &op) noexcept override;
-    result<void> visit(const tensor_gather_op_t &op) noexcept override;
     result<void> visit(const tensor_hardmax_op_t &op) noexcept override;
+    result<void> visit(const tensor_gather_op_t &op) noexcept override;
+    result<void> visit(const tensor_gather_elements_op_t &op) noexcept override;
     result<void> visit(const tensor_gather_nd_op_t &op) noexcept override;
+    result<void> visit(const tensor_gru_op_t &op) noexcept override;
     result<void> visit(const tensor_lut1d_op_t &op) noexcept override;
     result<void> visit(const tensor_matmul_op_t &op) noexcept override;
     result<void> visit(const tensor_onehot_op_t &op) noexcept override;
@@ -166,11 +169,15 @@ class stackvm_runtime_function : public runtime_function, private op_visitor
     result<void> visit(const tensor_sigmoid_op_t &op) noexcept override;
     result<void> visit(const tensor_slice_op_t &op) noexcept override;
     result<void> visit(const tensor_softmax_op_t &op) noexcept override;
+    result<void> visit(const tensor_space_to_batch_op_t &op) noexcept override;
     result<void> visit(const tensor_ternary_op_t &op) noexcept override;
     result<void> visit(const tensor_topk_op_t &op) noexcept override;
     result<void> visit(const tensor_transpose_op_t &op) noexcept override;
     result<void> visit(const tensor_trilu_op_t &op) noexcept override;
+    result<void> visit(const tensor_tflite_detection_postprocess_op_t &op) noexcept override;
     result<void> visit(const tensor_unary_op_t &op) noexcept override;
+    result<void> visit(const tensor_layer_normalization_op_t &op) noexcept override;
+    result<void> visit(const tensor_instance_normalization_op_t &op) noexcept override;
 
 private:
     uintptr_t pc() const noexcept;
diff --git a/src/runtime/stackvm/runtime_module.cpp b/src/runtime/stackvm/runtime_module.cpp
index ca5a75b2f6..4805b93f76 100644
--- a/src/runtime/stackvm/runtime_module.cpp
+++ b/src/runtime/stackvm/runtime_module.cpp
@@ -24,7 +24,13 @@ using namespace nncase::runtime::stackvm;
 
 gsl::span<gsl::byte> stackvm_runtime_module::data() const noexcept
 {
-    return { data_.get(), mempool(mem_data).size };
+    // data_ is only created when the data pool has a nonzero size; otherwise expose an empty span.
+    if (data_.empty())
+        return {};
+
+    // The tensor's impl is treated as a host tensor (unchecked static_cast) to reach its memory block.
+    auto *host_impl = static_cast<const detail::host_runtime_tensor_impl *>(data_tensor().impl());
+    return host_impl->memory_block().virtual_buffer();
 }
 
 gsl::span<const gsl::byte> stackvm_runtime_module::rdata() const noexcept
@@ -32,15 +38,18 @@ gsl::span<const gsl::byte> stackvm_runtime_module::rdata() const noexcept
     return rdata_;
 }
 
+const runtime_tensor &stackvm_runtime_module::data_tensor() const noexcept
+{
+    return data_; // tensor backing the data pool; stays empty unless initialize_before_functions saw a nonzero pool size
+}
+
 result<void> stackvm_runtime_module::initialize_before_functions(runtime_module_init_context &context) noexcept
 {
     assert(context.is_section_pinned());
     auto data_pool = mempool(mem_data);
     if (data_pool.size)
     {
-        data_.reset(new (std::nothrow) gsl::byte[data_pool.size]);
-        if (!data_)
-            return err(std::errc::not_enough_memory);
+        try_set(data_, hrt::create(dt_uint8, { data_pool.size }, hrt::pool_shared));
     }
 
     rdata_ = context.section(".rdata");
diff --git a/src/runtime/stackvm/runtime_module.h b/src/runtime/stackvm/runtime_module.h
index 33bca0d454..26e2d9be3b 100644
--- a/src/runtime/stackvm/runtime_module.h
+++ b/src/runtime/stackvm/runtime_module.h
@@ -29,6 +29,8 @@ class stackvm_runtime_module : public runtime_module
     gsl::span<gsl::byte> data() const noexcept;
     gsl::span<const gsl::byte> rdata() const noexcept;
 
+    const runtime_tensor &data_tensor() const noexcept;
+
     result<uintptr_t> reg(size_t id) const noexcept;
     result<void> reg(size_t id, uintptr_t value) noexcept;
 
@@ -43,7 +45,7 @@ class stackvm_runtime_module : public runtime_module
     result<std::unique_ptr<runtime_function>> create_function() noexcept override;
 
 private:
-    std::unique_ptr<gsl::byte[]> data_;
+    runtime_tensor data_;
     gsl::span<const gsl::byte> rdata_;
     std::array<uintptr_t, MAX_GENERAL_REGS> regs_;
     std::vector<runtime_shape_t> shape_regs_;
diff --git a/src/targets/neutral_target.cpp b/src/targets/neutral_target.cpp
index fecb71452b..e24a058528 100644
--- a/src/targets/neutral_target.cpp
+++ b/src/targets/neutral_target.cpp
@@ -21,12 +21,15 @@
 #include <nncase/transforms/neutral/binary_motion.h>
 #include <nncase/transforms/neutral/bitcast_motion.h>
 #include <nncase/transforms/neutral/dequantize_motion.h>
+#include <nncase/transforms/neutral/fix_output_shape.h>
 #include <nncase/transforms/neutral/fix_tflite_error_shape.h>
 #include <nncase/transforms/neutral/fold_bitcast.h>
 #include <nncase/transforms/neutral/fold_constant.h>
 #include <nncase/transforms/neutral/fold_conv2d_binary.h>
 #include <nncase/transforms/neutral/fold_convert.h>
 #include <nncase/transforms/neutral/fold_dilated_conv2d.h>
+#include <nncase/transforms/neutral/fold_instancenorm.h>
+#include <nncase/transforms/neutral/fold_layernorm.h>
 #include <nncase/transforms/neutral/fold_matmul_add.h>
 #include <nncase/transforms/neutral/fold_pad.h>
 #include <nncase/transforms/neutral/fold_quantize.h>
@@ -95,7 +98,7 @@ void neutral_target::add_default_transforms(ir::transforms::transform_pass &pass
     // pass.emplace<dequantize_transpose_motion_transform>();
     pass.emplace<dequantize_bitcast_motion_transform>();
     pass.emplace<dequantize_reshape_motion_transform>();
-    pass.emplace<dequantize_slice_motion_transform>();
+    // pass.emplace<dequantize_slice_motion_transform>();
     // pass.emplace<dequantize_pad_motion_transform>();
     pass.emplace<quantize_pad_motion_transform>();
     //    pass.emplace<quantize_transbin_motion_transform>();
@@ -110,6 +113,10 @@ void neutral_target::add_default_transforms(ir::transforms::transform_pass &pass
     pass.emplace<fold_pad_pad_transform>();
     pass.emplace<fold_pad_strided_slice_transform>();
 
+    pass.emplace<fold_layernorm_pattern1_transform>();
+    pass.emplace<fold_layernorm_pattern2_transform>();
+    pass.emplace<fold_layernorm_pattern3_transform>();
+
     pass.emplace<fold_bitcast_transform>();
 
     pass.emplace<fold_convert_transform>();
@@ -185,6 +192,31 @@ void neutral_target::register_target_independent_passes(const module_type_t &typ
     using namespace nncase::ir;
     using namespace nncase::ir::transforms;
 
+    {
+        transform_pass p("fold_instancenorm");
+        p.emplace<fold_instancenorm_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
+    // fix tflite_detection_postprocess shape error in tflite
+    {
+        transform_pass p("fix_shape_tdp");
+        p.emplace<tflite_detection_postprocess_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
+
+    // fold quant node in source model
+    {
+        transform_pass p("fold_quantize_in_source_model");
+        p.emplace<fold_quantize_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
+    // split to slice
+    {
+        transform_pass p("split_to_slice");
+        p.emplace<split_to_slice_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
+
     if (type == runtime::stackvm::stackvm_module_type)
     {
         // fold_pad_conv
@@ -223,16 +255,6 @@ void neutral_target::register_target_dependent_passes([[maybe_unused]] const mod
 
 void neutral_target::register_quantize_annotation_passes([[maybe_unused]] const module_type_t &type, ir::transforms::pass_manager &pass_mgr)
 {
-    {
-        transform_pass p("fuse_unary");
-        p.emplace<fuse_one_unary_transform>();
-        p.emplace<fuse_one_binary_transform>();
-        p.emplace<fuse_two_fused_unary_transform>();
-        p.emplace<fuse_one_fused_unary_with_binary_transform>();
-        p.emplace<fuse_two_fused_unary_with_binary_transform>();
-        pass_mgr.add_pass(std::move(p));
-    }
-
     {
         transform_pass p("annotate_neutral_quantize");
         p.emplace<add_quant_checkpoints_transform>(std::in_place, ir::op_fused_unary, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_output_node);
diff --git a/src/transforms/neutral/CMakeLists.txt b/src/transforms/neutral/CMakeLists.txt
index 50bcb13d51..a6950b78c3 100644
--- a/src/transforms/neutral/CMakeLists.txt
+++ b/src/transforms/neutral/CMakeLists.txt
@@ -1,4 +1,4 @@
-﻿cmake_minimum_required (VERSION 3.13)
+﻿cmake_minimum_required(VERSION 3.13)
 
 target_sources(transforms PRIVATE
     add_quant_checkpoints.cpp
@@ -47,4 +47,8 @@ target_sources(transforms PRIVATE
     pad_conv.cpp
     merge_binary_before_conv.cpp
     fold_matmul_add.cpp
-    )
+    squeeze_dims.cpp
+    fix_output_shape.cpp
+    fold_layernorm.cpp
+    fold_instancenorm.cpp
+)
diff --git a/src/transforms/neutral/binary_motion.cpp b/src/transforms/neutral/binary_motion.cpp
index 6fe90106f3..1c429cd90b 100644
--- a/src/transforms/neutral/binary_motion.cpp
+++ b/src/transforms/neutral/binary_motion.cpp
@@ -81,6 +81,7 @@ void binary_reduce_window2d_motion_up_transform::process(transform_context &cont
     auto &old_b = static_cast<binary &>(*context.matched_nodes[3]);
 
     auto b = context.graph.emplace<binary>(old_b.binary_op(), conv.output().type(), conv.output().shape(), c.output().shape(), old_b.fused_activation());
+    b->attributes(old_b.attributes());
     b->name(old_b.name());
     b->input_a().connect(conv.output());
     b->input_b().connect(c.output());
diff --git a/src/transforms/neutral/fix_output_shape.cpp b/src/transforms/neutral/fix_output_shape.cpp
new file mode 100644
index 0000000000..6df7642621
--- /dev/null
+++ b/src/transforms/neutral/fix_output_shape.cpp
@@ -0,0 +1,93 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/tflite_detection_postprocess.h>
+#include <nncase/ir/visitor.h>
+#include <nncase/transforms/neutral/fix_output_shape.h>
+
+using namespace nncase;
+using namespace nncase::ir;
+using namespace nncase::ir::transforms;
+
+// Matches a tflite_detection_postprocess node whose locations output is not yet shaped
+// { 1, max_detections, 4 } and records its connectors and the node itself in `context`.
+bool tflite_detection_postprocess_transform::on_try_match(node &node, transform_context &context)
+{
+    auto tdp = node_cast<tflite_detection_postprocess>(node);
+    if (!tdp || tdp->output_locations().shape() == shape_t { 1, (size_t)tdp->max_detections(), 4 })
+        return false;
+
+    // process() indexes context.inputs / context.outputs positionally, so the push order is significant.
+    context.inputs.emplace_back(&tdp->boxes());
+    context.inputs.emplace_back(&tdp->scores());
+    context.inputs.emplace_back(&tdp->anchors());
+
+    context.outputs.emplace_back(&tdp->output_locations());
+    context.outputs.emplace_back(&tdp->output_classes());
+    context.outputs.emplace_back(&tdp->output_scores());
+    context.outputs.emplace_back(&tdp->output_num_detections());
+
+    context.matched_nodes.emplace_back(tdp);
+    return true;
+}
+
+// Rebuilds the matched tflite_detection_postprocess node with output shapes derived from
+// max_detections ({1, n, 4}, {1, n}, {1, n}, {1}) and attaches the rebuilt node to four newly
+// created, explicitly named output nodes.
+void tflite_detection_postprocess_transform::process(transform_context &context)
+{
+    auto &box = *context.inputs[0]->connection();
+    auto &score = *context.inputs[1]->connection();
+    auto &anchor = *context.inputs[2]->connection();
+
+    auto &old_tdp = static_cast<tflite_detection_postprocess &>(*context.matched_nodes[0]);
+    shape_t new_output_shape_0 { 1, (size_t)old_tdp.max_detections(), 4 };
+    shape_t new_output_shape_1 { 1, (size_t)old_tdp.max_detections() };
+    shape_t new_output_shape_2 { 1, (size_t)old_tdp.max_detections() };
+    shape_t new_output_shape_3 { 1 };
+
+    // Element types are read from the old node's own output connectors, so no consumer needs to exist.
+    auto new_output_node_0 = context.graph.emplace<output_node>(context.outputs[0]->type(), new_output_shape_0);
+    auto new_output_node_1 = context.graph.emplace<output_node>(context.outputs[1]->type(), new_output_shape_1);
+    auto new_output_node_2 = context.graph.emplace<output_node>(context.outputs[2]->type(), new_output_shape_2);
+    auto new_output_node_3 = context.graph.emplace<output_node>(context.outputs[3]->type(), new_output_shape_3);
+    new_output_node_0->name("output_locations");
+    new_output_node_1->name("output_classes");
+    new_output_node_2->name("output_scores");
+    new_output_node_3->name("output_num_detections");
+
+    auto new_tdp = context.graph.emplace<tflite_detection_postprocess>(old_tdp.boxes().shape(), old_tdp.scores().shape(), old_tdp.anchors().shape(),
+        new_output_shape_0, new_output_shape_1, new_output_shape_2, new_output_shape_3, old_tdp.max_detections(), old_tdp.max_classes_per_detection(),
+        old_tdp.detections_per_class(), old_tdp.use_regular_non_max_suppression(), old_tdp.nms_score_threshold(), old_tdp.nms_iou_threshold(),
+        old_tdp.num_classes(), old_tdp.y_scale(), old_tdp.x_scale(), old_tdp.h_scale(), old_tdp.w_scale());
+    new_tdp->name(old_tdp.name());
+
+    // NOTE(review): this detaches every output node of the graph, not only the consumers of old_tdp — confirm intended.
+    for (auto &i : context.graph.outputs())
+    {
+        i->input().clear_connection();
+    }
+
+    new_tdp->boxes().connect(box);
+    new_tdp->scores().connect(score);
+    new_tdp->anchors().connect(anchor);
+
+    new_output_node_0->input().connect(new_tdp->output_locations());
+    new_output_node_1->input().connect(new_tdp->output_classes());
+    new_output_node_2->input().connect(new_tdp->output_scores());
+    new_output_node_3->input().connect(new_tdp->output_num_detections());
+
+    context.graph.dce();
+}
\ No newline at end of file
diff --git a/src/transforms/neutral/fold_instancenorm.cpp b/src/transforms/neutral/fold_instancenorm.cpp
new file mode 100644
index 0000000000..f7e7e5556a
--- /dev/null
+++ b/src/transforms/neutral/fold_instancenorm.cpp
@@ -0,0 +1,73 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/ir_types.h>
+#include <nncase/ir/ops/binary.h>
+#include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/instancenorm.h>
+#include <nncase/ir/ops/reduce.h>
+#include <nncase/ir/ops/unary.h>
+#include <nncase/ir/visitor.h>
+#include <nncase/transforms/neutral/fold_instancenorm.h>
+
+using namespace nncase;
+using namespace nncase::ir;
+using namespace nncase::ir::transforms;
+
+// Matches, rooted at the final add: (x - mean(x)) * scale / sqrt(mean(square(x - mean(x))) + eps) + bias.
+bool fold_instancenorm_transform::on_try_match(node &node, transform_context &context)
+{
+    binary *add_bias = nullptr, *mul_scale = nullptr, *div = nullptr, *add_e = nullptr, *sub_mean = nullptr;
+    unary *u_sqrt = nullptr, *u_square = nullptr;
+    reduce *reduce_mean0 = nullptr, *reduce_mean1 = nullptr;
+    constant *scale = nullptr, *bias = nullptr, *eps = nullptr;
+    if (((add_bias = node_cast<binary>(node)) && (bias = try_get_direct_parent<constant>(*add_bias))) && add_bias->binary_op() == binary_add
+        && (div = try_get_direct_parent<binary>(*add_bias)) && div->binary_op() == binary_div
+        && (mul_scale = try_get_direct_parent<binary>(*div)) && (scale = try_get_direct_parent<constant>(*mul_scale)) && mul_scale->binary_op() == binary_mul
+        && (u_sqrt = try_get_direct_parent<unary>(*div)) && u_sqrt->unary_op() == unary_sqrt
+        && (add_e = try_get_direct_parent<binary>(*u_sqrt)) && (eps = try_get_direct_parent<constant>(*add_e)) && add_e->binary_op() == binary_add
+        && (reduce_mean0 = try_get_direct_parent<reduce>(*add_e)) && reduce_mean0->reduce_op() == reduce_mean
+        && (u_square = try_get_direct_parent<unary>(*reduce_mean0)) && u_square->unary_op() == unary_square
+        && (sub_mean = try_get_direct_parent<binary>(*u_square)) && sub_mean == try_get_direct_parent<binary>(*mul_scale) && sub_mean->binary_op() == binary_sub
+        && (reduce_mean1 = try_get_direct_parent<reduce>(*sub_mean)) && reduce_mean1->reduce_op() == reduce_mean
+        && sub_mean->input_a().connection() == reduce_mean1->input().connection()) // the mean must be taken over the minuend itself
+    {
+        context.inputs.emplace_back(&reduce_mean1->input());
+        context.outputs.emplace_back(&add_bias->output());
+        context.matched_nodes.emplace_back(scale);
+        context.matched_nodes.emplace_back(bias);
+        context.matched_nodes.emplace_back(eps);
+        return true;
+    }
+    return false;
+}
+
+// Replaces the matched chain with one instancenorm node that reuses the recorded scale and bias constants.
+void fold_instancenorm_transform::process(transform_context &context)
+{
+    auto &src = *context.inputs[0]->connection();
+    auto consumers = context.outputs[0]->connections();
+    auto scale = node_cast<constant>(*context.matched_nodes[0]);
+    auto bias = node_cast<constant>(*context.matched_nodes[1]);
+    auto eps = node_cast<constant>(*context.matched_nodes[2]);
+    // The eps constant's payload is reinterpreted as a single float (assumes float32 data — TODO confirm).
+    const float epsilon = *reinterpret_cast<const float *>(eps->data().data());
+    auto fused = context.graph.emplace<instancenorm>(src.type(), src.shape(), epsilon);
+    fused->name(scale->name());
+    fused->input().connect(src);
+    fused->scale().connect(scale->output());
+    fused->bias().connect(bias->output());
+    for (auto &consumer : dup(consumers))
+        consumer->connect(fused->output());
+}
\ No newline at end of file
diff --git a/src/transforms/neutral/fold_layernorm.cpp b/src/transforms/neutral/fold_layernorm.cpp
new file mode 100644
index 0000000000..521512e8db
--- /dev/null
+++ b/src/transforms/neutral/fold_layernorm.cpp
@@ -0,0 +1,212 @@
+/* Copyright 2020 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/ops/binary.h>
+#include <nncase/ir/ops/bitcast.h>
+#include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/layernorm.h>
+#include <nncase/ir/ops/reduce.h>
+#include <nncase/ir/ops/unary.h>
+#include <nncase/ir/visitor.h>
+#include <nncase/runtime/datatypes.h>
+#include <nncase/transforms/neutral/fold_layernorm.h>
+
+using namespace nncase;
+using namespace nncase::ir;
+using namespace nncase::ir::transforms;
+
+// Matches, rooted at the final add: bitcast(d / sqrt(mean(pow(d, p)) + eps)) * gamma + beta, with d = t - mean(t), t = bitcast(x).
+bool fold_layernorm_pattern1_transform::on_try_match(node &node, transform_context &context)
+{
+    reduce *rd1 = nullptr, *rd2 = nullptr;
+    binary *sub = nullptr, *pow = nullptr, *add_eps = nullptr, *div = nullptr, *mul = nullptr, *add_beta = nullptr;
+    unary *sqrt = nullptr;
+    bitcast *rshape1 = nullptr, *rshape2 = nullptr;
+
+    if ((add_beta = node_cast<binary>(node)) and add_beta->binary_op() == binary_op_t::binary_add
+        and (mul = try_get_direct_parent<binary>(*add_beta)) and mul->binary_op() == binary_op_t::binary_mul
+        and (rshape2 = try_get_direct_parent<bitcast>(*mul))
+        and (div = try_get_direct_parent<binary>(*rshape2)) and div->binary_op() == binary_op_t::binary_div
+        and (sqrt = try_get_direct_parent<unary>(*div)) and sqrt->unary_op() == unary_op_t::unary_sqrt
+        and (add_eps = try_get_direct_parent<binary>(*sqrt)) and add_eps->binary_op() == binary_op_t::binary_add
+        and (rd2 = try_get_direct_parent<reduce>(*add_eps)) and rd2->reduce_op() == reduce_op_t::reduce_mean
+        and (pow = try_get_direct_parent<binary>(*rd2)) and pow->binary_op() == binary_op_t::binary_pow
+        and (sub = try_get_direct_parent<binary>(*pow)) and sub->binary_op() == binary_op_t::binary_sub
+        and (rd1 = try_get_direct_parent<reduce>(*sub)) and rd1->reduce_op() == reduce_op_t::reduce_mean
+        and (rshape1 = try_get_direct_parent<bitcast>(*rd1))
+        and (sub->input_a().connection() == rd1->input().connection() or sub->input_b().connection() == rd1->input().connection())
+        and try_get_direct_parent<binary>(*div) == sub
+        // process() dereferences input 1 of add_eps, mul and add_beta as constants (eps, gamma, beta): reject anything else.
+        and try_get_direct_parent<constant>(*add_eps, 1) and try_get_direct_parent<constant>(*mul, 1) and try_get_direct_parent<constant>(*add_beta, 1))
+    {
+        context.inputs.emplace_back(&rshape1->input());
+        context.outputs.emplace_back(&add_beta->output());
+        context.matched_nodes.emplace_back(rshape1);
+        context.matched_nodes.emplace_back(rd1);
+        context.matched_nodes.emplace_back(sub);
+        context.matched_nodes.emplace_back(pow);
+        context.matched_nodes.emplace_back(rd2);
+        context.matched_nodes.emplace_back(add_eps);
+        context.matched_nodes.emplace_back(sqrt);
+        context.matched_nodes.emplace_back(div);
+        context.matched_nodes.emplace_back(rshape2);
+        context.matched_nodes.emplace_back(mul);
+        context.matched_nodes.emplace_back(add_beta);
+        return true;
+    }
+    return false;
+}
+
+// Replaces the matched sub-graph with a single layernorm node; eps, gamma and beta are the producers of
+// input 1 of add_eps (matched_nodes[5]), mul (matched_nodes[9]) and add_beta (matched_nodes[10]).
+void fold_layernorm_pattern1_transform::process(transform_context &context)
+{
+    auto &src = *context.inputs[0]->connection();
+    auto consumers = context.outputs[0]->connections();
+    auto eps = node_cast<constant>(context.matched_nodes[5]->input_at(1).connection()->owner());
+    auto gamma = node_cast<constant>(context.matched_nodes[9]->input_at(1).connection()->owner());
+    auto beta = node_cast<constant>(context.matched_nodes[10]->input_at(1).connection()->owner());
+
+    auto axis = src.shape().size() - gamma->output().shape().size();
+    const float epsilon = *reinterpret_cast<const float *>(eps->data().data());
+    auto fused = context.graph.emplace<layernorm>(src.type(), src.shape(), axis, epsilon);
+    fused->name(src.name() + "/layernorm");
+    fused->input().connect(src);
+    fused->scale().connect(gamma->output());
+    fused->bias().connect(beta->output());
+    for (auto &consumer : dup(consumers))
+        consumer->connect(fused->output());
+}
+
+// Matches, rooted at the final add: d / sqrt(mean(pow(d, p)) + eps) * gamma + beta, with d = x - mean(x) (no bitcasts).
+bool fold_layernorm_pattern2_transform::on_try_match(node &node, transform_context &context)
+{
+    reduce *rd1 = nullptr, *rd2 = nullptr;
+    binary *sub = nullptr, *pow = nullptr, *add_eps = nullptr, *div = nullptr, *mul = nullptr, *add_beta = nullptr;
+    unary *sqrt = nullptr;
+
+    if ((add_beta = node_cast<binary>(node)) and add_beta->binary_op() == binary_op_t::binary_add
+        and (mul = try_get_direct_parent<binary>(*add_beta)) and mul->binary_op() == binary_op_t::binary_mul
+        and (div = try_get_direct_parent<binary>(*mul)) and div->binary_op() == binary_op_t::binary_div
+        and (sqrt = try_get_direct_parent<unary>(*div)) and sqrt->unary_op() == unary_op_t::unary_sqrt
+        and (add_eps = try_get_direct_parent<binary>(*sqrt)) and add_eps->binary_op() == binary_op_t::binary_add
+        and (rd2 = try_get_direct_parent<reduce>(*add_eps)) and rd2->reduce_op() == reduce_op_t::reduce_mean
+        and (pow = try_get_direct_parent<binary>(*rd2)) and pow->binary_op() == binary_op_t::binary_pow
+        and ((sub = try_get_direct_parent<binary>(*pow, 0)) or (sub = try_get_direct_parent<binary>(*pow, 1))) and sub->binary_op() == binary_op_t::binary_sub
+        and (rd1 = try_get_direct_parent<reduce>(*sub)) and rd1->reduce_op() == reduce_op_t::reduce_mean
+        and (sub->input_a().connection() == rd1->input().connection() or sub->input_b().connection() == rd1->input().connection())
+        and try_get_direct_parent<binary>(*div) == sub
+        // process() dereferences input 1 of add_eps, mul and add_beta as constants (eps, gamma, beta): reject anything else.
+        and try_get_direct_parent<constant>(*add_eps, 1) and try_get_direct_parent<constant>(*mul, 1) and try_get_direct_parent<constant>(*add_beta, 1))
+    {
+        context.inputs.emplace_back(&rd1->input());
+        context.outputs.emplace_back(&add_beta->output());
+        context.matched_nodes.emplace_back(rd1);
+        context.matched_nodes.emplace_back(sub);
+        context.matched_nodes.emplace_back(pow);
+        context.matched_nodes.emplace_back(rd2);
+        context.matched_nodes.emplace_back(add_eps);
+        context.matched_nodes.emplace_back(sqrt);
+        context.matched_nodes.emplace_back(div);
+        context.matched_nodes.emplace_back(mul);
+        context.matched_nodes.emplace_back(add_beta);
+        return true;
+    }
+    return false;
+}
+
+// Replaces the matched sub-graph with a single layernorm node; eps, gamma and beta are the producers of
+// input 1 of add_eps (matched_nodes[4]), mul (matched_nodes[7]) and add_beta (matched_nodes[8]).
+void fold_layernorm_pattern2_transform::process(transform_context &context)
+{
+    auto &src = *context.inputs[0]->connection();
+    auto consumers = context.outputs[0]->connections();
+    auto eps = node_cast<constant>(context.matched_nodes[4]->input_at(1).connection()->owner());
+    auto gamma = node_cast<constant>(context.matched_nodes[7]->input_at(1).connection()->owner());
+    auto beta = node_cast<constant>(context.matched_nodes[8]->input_at(1).connection()->owner());
+
+    auto axis = src.shape().size() - gamma->output().shape().size();
+    const float epsilon = *reinterpret_cast<const float *>(eps->data().data());
+    auto fused = context.graph.emplace<layernorm>(src.type(), src.shape(), axis, epsilon);
+    fused->name(src.name() + "/layernorm");
+    fused->input().connect(src);
+    fused->scale().connect(gamma->output());
+    fused->bias().connect(beta->output());
+    for (auto &consumer : dup(consumers))
+        consumer->connect(fused->output());
+}
+
+// Matches, rooted at the final add: x * g + (beta - mean(x) * g), with g = rsqrt(mean(square(x - mean(x))) + eps) * gamma.
+bool fold_layernorm_pattern3_transform::on_try_match(node &node, transform_context &context)
+{
+    reduce *rd_mu = nullptr, *rd_var = nullptr;
+    binary *sub_mu = nullptr, *add_eps = nullptr, *mul_gamma = nullptr, *mul_x = nullptr, *mul_mu = nullptr, *sub_beta = nullptr, *add_all = nullptr;
+    unary *rsqrt = nullptr, *square = nullptr;
+
+    if ((add_all = node_cast<binary>(node)) and add_all->binary_op() == binary_op_t::binary_add
+        and (mul_x = try_get_direct_parent<binary>(*add_all, 0)) and mul_x->binary_op() == binary_op_t::binary_mul
+        and (sub_beta = try_get_direct_parent<binary>(*add_all, 1)) and sub_beta->binary_op() == binary_op_t::binary_sub
+        and (mul_gamma = try_get_direct_parent<binary>(*mul_x, 1)) and mul_gamma->binary_op() == binary_op_t::binary_mul
+        and (rsqrt = try_get_direct_parent<unary>(*mul_gamma, 0)) and rsqrt->unary_op() == unary_op_t::unary_rsqrt
+        and (add_eps = try_get_direct_parent<binary>(*rsqrt)) and add_eps->binary_op() == binary_op_t::binary_add
+        and (rd_var = try_get_direct_parent<reduce>(*add_eps, 0)) and rd_var->reduce_op() == reduce_op_t::reduce_mean
+        and (square = try_get_direct_parent<unary>(*rd_var)) and square->unary_op() == unary_op_t::unary_square
+        and (sub_mu = try_get_direct_parent<binary>(*square)) and sub_mu->binary_op() == binary_op_t::binary_sub
+        and (rd_mu = try_get_direct_parent<reduce>(*sub_mu, 1)) and rd_mu->reduce_op() == reduce_op_t::reduce_mean
+        and (mul_mu = try_get_direct_parent<binary>(*sub_beta, 1)) and mul_mu->binary_op() == binary_op_t::binary_mul
+        and (mul_mu->input_a().connection() == sub_mu->input_b().connection())
+        and (mul_mu->input_b().connection() == mul_x->input_b().connection())
+        and (mul_x->input_a().connection() == sub_mu->input_a().connection())
+        and (mul_x->input_a().connection() == rd_mu->input().connection())
+        // process() dereferences add_eps input 1, mul_gamma input 1 and sub_beta input 0 as constants: reject anything else.
+        and try_get_direct_parent<constant>(*add_eps, 1) and try_get_direct_parent<constant>(*mul_gamma, 1) and try_get_direct_parent<constant>(*sub_beta, 0))
+    {
+        context.inputs.emplace_back(&rd_mu->input());
+        context.outputs.emplace_back(&add_all->output());
+        context.matched_nodes.emplace_back(rd_mu);
+        context.matched_nodes.emplace_back(sub_mu);
+        context.matched_nodes.emplace_back(square);
+        context.matched_nodes.emplace_back(rd_var);
+        context.matched_nodes.emplace_back(add_eps);
+        context.matched_nodes.emplace_back(rsqrt);
+        context.matched_nodes.emplace_back(mul_gamma);
+        context.matched_nodes.emplace_back(mul_x);
+        context.matched_nodes.emplace_back(mul_mu);
+        context.matched_nodes.emplace_back(sub_beta);
+        context.matched_nodes.emplace_back(add_all);
+        return true;
+    }
+    return false;
+}
+
+// Replaces the matched sub-graph with a single layernorm node; eps and gamma are the producers of input 1 of
+// add_eps (matched_nodes[4]) and mul_gamma (matched_nodes[6]), beta is the producer of input 0 of sub_beta (matched_nodes[9]).
+void fold_layernorm_pattern3_transform::process(transform_context &context)
+{
+    auto &src = *context.inputs[0]->connection();
+    auto consumers = context.outputs[0]->connections();
+    auto eps = node_cast<constant>(context.matched_nodes[4]->input_at(1).connection()->owner());
+    auto gamma = node_cast<constant>(context.matched_nodes[6]->input_at(1).connection()->owner());
+    auto beta = node_cast<constant>(context.matched_nodes[9]->input_at(0).connection()->owner());
+
+    auto axis = src.shape().size() - gamma->output().shape().size();
+    const float epsilon = *reinterpret_cast<const float *>(eps->data().data());
+    auto fused = context.graph.emplace<layernorm>(src.type(), src.shape(), axis, epsilon);
+    fused->name(src.name() + "/layernorm");
+    fused->input().connect(src);
+    fused->scale().connect(gamma->output());
+    fused->bias().connect(beta->output());
+    for (auto &consumer : dup(consumers))
+        consumer->connect(fused->output());
+}
\ No newline at end of file
diff --git a/src/transforms/neutral/fold_quantize.cpp b/src/transforms/neutral/fold_quantize.cpp
index ead23f7870..bba88f5173 100644
--- a/src/transforms/neutral/fold_quantize.cpp
+++ b/src/transforms/neutral/fold_quantize.cpp
@@ -44,11 +44,11 @@ bool fold_quantize_transform::on_try_match(node &node, transform_context &contex
 
                     context.matched_nodes.emplace_back(&q);
                     context.matched_nodes.emplace_back(&deq);
-                    if ((try_get_direct_parent<space_to_batch>(q) && try_get_direct_child<conv2d>(deq))
-                        || (try_get_direct_parent<conv2d>(q) && try_get_direct_child<batch_to_space>(deq)))
-                    {
-                        return true;
-                    }
+                    // if ((try_get_direct_parent<space_to_batch>(q) && try_get_direct_child<conv2d>(deq))
+                    //     || (try_get_direct_parent<conv2d>(q) && try_get_direct_child<batch_to_space>(deq)))
+                    // {
+                    return true;
+                    // }
                 }
             }
         }
diff --git a/src/transforms/neutral/optimize_allocation.cpp b/src/transforms/neutral/optimize_allocation.cpp
index cd25b67c41..4de646be69 100644
--- a/src/transforms/neutral/optimize_allocation.cpp
+++ b/src/transforms/neutral/optimize_allocation.cpp
@@ -140,6 +140,25 @@ void add_copy_to_output_pass::run_core(graph &graph, [[maybe_unused]] nncase::ta
     alias_visitor.visit(graph);
 }
 
+// Inserts a copy node between every bitcast and its producer unless the producer's runtime opcode is already op_copy.
+void add_copy_to_bitcast_pass::run_core(graph &graph, [[maybe_unused]] nncase::target &target, [[maybe_unused]] const run_pass_options &options)
+{
+    auto bitcast_visitor = make_relay_ir_visitor([&](node &node) {
+        auto reshape = node_cast<bitcast>(node);
+        if (!reshape)
+            return;
+        auto &producer = *reshape->input().connection();
+        if (producer.owner().runtime_opcode() == op_copy)
+            return; // the bitcast already reads from a copy
+        auto staged = graph.emplace<copy>(producer.type(), producer.shape());
+        staged->module_type(graph.module_type());
+        staged->name(producer.owner().name() + "/copy");
+        staged->input().connect(producer);
+        reshape->input().connect(staged->output());
+    });
+    bitcast_visitor.visit(graph);
+}
+
 //   x@data       x@output
 //     |             |
 //   copy            |
@@ -173,8 +192,10 @@ void remove_exclusive_copy_to_output_transform::process(transform_context &conte
 {
     auto &output = *context.inputs[0]->connection();
     auto &old_out = static_cast<output_node &>(*context.matched_nodes[1]);
-
-    output.memory_location(mem_output);
+    if (output.connections().size() == 1)
+        output.memory_location(mem_output);
+    else
+        output.memory_location(mem_shared_data);
     output.attributes(output.attributes() | cnctr_attr_no_layout_strides);
     old_out.input().connect(output);
 }
@@ -188,7 +209,7 @@ void remove_exclusive_copy_to_output_transform::process(transform_context &conte
 bool remove_exclusive_copy_to_concat_transform::on_try_match(node &node, transform_context &context)
 {
     copy *cp;
-    concat *c;
+    concat *c, *pre_c;
 
     if ((cp = node_cast<copy>(node))
         && (c = try_get_direct_child<concat>(*cp)))
@@ -201,6 +222,8 @@ bool remove_exclusive_copy_to_concat_transform::on_try_match(node &node, transfo
             && ((input->attributes() & (cnctr_attr_no_buffer_fusion | cnctr_attr_buffer_slice)) == 0)
             && (is_simple_concat || (input->attributes() & (cnctr_attr_no_layout_strides)) == 0))
         {
+            if ((pre_c = try_get_direct_parent<concat>(*cp)) && pre_c->axis() != c->axis())
+                return false;
             context.inputs.emplace_back(&cp->input());
             context.outputs.emplace_back(&cp->output());
 
@@ -222,6 +245,39 @@ void remove_exclusive_copy_to_concat_transform::process(transform_context &conte
         in->connect(output);
 }
 
+// Matches a copy node that has a bitcast as direct child and whose source connector permits bypassing the copy.
+bool remove_exclusive_copy_to_bitcast_transform::on_try_match(node &node, transform_context &context)
+{
+    auto cp = node_cast<copy>(node);
+    if (!cp)
+        return false;
+    auto b = try_get_direct_child<bitcast>(*cp);
+    if (!b)
+        return false;
+
+    auto src = cp->input().connection();
+    // mem_data sources qualify; mem_input sources only when no output_node is found as direct child of the bitcast.
+    bool location_ok = src->memory_location() == mem_data
+        || (src->memory_location() == mem_input && !try_get_direct_child<output_node>(*b));
+    if (!location_ok || (src->attributes() & cnctr_attr_no_buffer_fusion) != 0)
+        return false;
+
+    context.inputs.emplace_back(&cp->input());
+    context.outputs.emplace_back(&cp->output());
+    context.matched_nodes.emplace_back(cp);
+    return true;
+}
+
+// Bypasses the matched copy: flags its source with cnctr_attr_no_buffer_fusion and reconnects the copy's consumers to that source.
+void remove_exclusive_copy_to_bitcast_transform::process(transform_context &context)
+{
+    auto &src = *context.inputs[0]->connection();
+    auto consumers = context.outputs[0]->connections();
+    src.attributes(src.attributes() | cnctr_attr_no_buffer_fusion);
+    for (auto &consumer : dup(consumers))
+        consumer->connect(src);
+}
+
 //     x             x
 //     |             |
 //   slice           |
diff --git a/src/transforms/neutral/pre_process_setting.cpp b/src/transforms/neutral/pre_process_setting.cpp
index 9fd46cfbce..66ade2128d 100644
--- a/src/transforms/neutral/pre_process_setting.cpp
+++ b/src/transforms/neutral/pre_process_setting.cpp
@@ -54,23 +54,6 @@ void pre_process_transform::run_core(graph &graph, [[maybe_unused]] nncase::targ
 
             mid_ptr = &new_input->output();
 
-            //dequantize: input_range_
-            if (mid_ptr->type() != dt_float32)
-            {
-                std::cout << " |Dequantize:" << std::endl;
-                value_range<float> range = { input_range_[0], input_range_[1] };
-
-                auto Q_max = 255;
-                auto Q_min = 0;
-                auto scale = (range.max - range.min) / (Q_max - Q_min);
-                auto bias = std::round((range.max * Q_min - range.min * Q_max) / (range.max - range.min));
-                quant_param_t deq_params { static_cast<int32_t>(bias), scale };
-                auto deq_input = graph.emplace<dequantize>(mid_ptr->type(), mid_ptr->shape(), dt_float32, deq_params);
-                deq_input->name("dequantize_input");
-                deq_input->input().connect(*mid_ptr);
-                mid_ptr = &deq_input->output();
-            }
-
             if (input_layout_ == "NHWC")
             {
                 auto transpose_pre = graph.emplace<transpose>(mid_ptr->type(), mid_ptr->shape(), axis_t { 0, 3, 1, 2 });
@@ -99,6 +82,23 @@ void pre_process_transform::run_core(graph &graph, [[maybe_unused]] nncase::targ
                 mid_ptr = &concat_slice->output();
             }
 
+            //dequantize: input_range_
+            if (mid_ptr->type() != dt_float32)
+            {
+                std::cout << " |Dequantize:" << std::endl;
+                value_range<float> range = { input_range_[0], input_range_[1] };
+
+                auto Q_max = 255;
+                auto Q_min = 0;
+                auto scale = (range.max - range.min) / (Q_max - Q_min);
+                auto bias = std::round((range.max * Q_min - range.min * Q_max) / (range.max - range.min));
+                quant_param_t deq_params { static_cast<int32_t>(bias), scale };
+                auto deq_input = graph.emplace<dequantize>(mid_ptr->type(), mid_ptr->shape(), dt_float32, deq_params);
+                deq_input->name("dequantize_input");
+                deq_input->input().connect(*mid_ptr);
+                mid_ptr = &deq_input->output();
+            }
+
             // letterbox :
             /**
              * input_layout:  HW have different axis
diff --git a/src/transforms/neutral/split_softmax.cpp b/src/transforms/neutral/split_softmax.cpp
index 1cdf9019c3..b9008b520b 100644
--- a/src/transforms/neutral/split_softmax.cpp
+++ b/src/transforms/neutral/split_softmax.cpp
@@ -48,24 +48,30 @@ void split_softmax_transform::process(transform_context &context)
     auto input_shape = output.shape();
     axis_t axes { sm.axis() };
     auto rmax = context.graph.emplace<reduce>(reduce_max, input_type, input_shape, axes, std::numeric_limits<float>::lowest(), true);
+    rmax->attributes(rmax->attributes() | node_attributes::node_attr_skip_quantize);
     rmax->name(sm.name() + ".rmax");
 
     auto sub = context.graph.emplace<binary>(binary_sub, input_type, input_shape, rmax->output().shape(), value_range<float>::full());
+    sub->attributes(sub->attributes() | node_attributes::node_attr_skip_quantize);
     sub->name(sm.name() + ".sub");
 
     auto beta = context.graph.emplace<constant>(sm.beta());
     beta->name(sm.name() + ".beta");
 
     auto mul = context.graph.emplace<binary>(binary_mul, input_type, sub->output().shape(), beta->output().shape(), value_range<float>::full());
+    mul->attributes(mul->attributes() | node_attributes::node_attr_skip_quantize);
     mul->name(sm.name() + ".mul");
 
     auto exp = context.graph.emplace<unary>(unary_exp, sub->output().shape());
+    exp->attributes(exp->attributes() | node_attributes::node_attr_skip_quantize);
     exp->name(sm.name() + ".exp");
 
     auto rsum = context.graph.emplace<reduce>(reduce_sum, input_type, exp->output().shape(), axes, 0.f, true);
+    rsum->attributes(rsum->attributes() | node_attributes::node_attr_skip_quantize);
     rsum->name(sm.name() + ".rsum");
 
     auto div = context.graph.emplace<binary>(binary_div, input_type, exp->output().shape(), rsum->output().shape(), value_range<float>::full());
+    div->attributes(div->attributes() | node_attributes::node_attr_skip_quantize);
     div->name(sm.name() + ".div");
 
     rmax->input().connect(output);
diff --git a/src/transforms/neutral/squeeze_dims.cpp b/src/transforms/neutral/squeeze_dims.cpp
new file mode 100644
index 0000000000..309b335415
--- /dev/null
+++ b/src/transforms/neutral/squeeze_dims.cpp
@@ -0,0 +1,400 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <nncase/ir/ops/binary.h>
+#include <nncase/ir/ops/bitcast.h>
+#include <nncase/ir/ops/concat.h>
+#include <nncase/ir/ops/constant.h>
+#include <nncase/ir/ops/sigmoid.h>
+#include <nncase/ir/ops/transpose.h>
+#include <nncase/ir/visitor.h>
+#include <nncase/transforms/neutral/squeeze_dims.h>
+
+using namespace nncase;
+using namespace nncase::ir;
+using namespace nncase::ir::transforms;
+
+shape_t squeeze_shape(shape_t old_shape)
+{
+    // Collapse a shape of any rank into exactly four dims: dims are consumed right-to-left
+    // into slots 3, 2, 1, 0 and, once slot 0 is reached, every remaining leading dim is
+    // multiplied into slot 0. Slots that receive nothing stay 1.
+    shape_t squeezed { 1, 1, 1, 1 };
+
+    for (size_t remaining = old_shape.size(), slot = 3; remaining > 0; remaining--, slot -= (slot > 0 ? 1 : 0))
+        squeezed[slot] *= old_shape[remaining - 1];
+    return squeezed;
+}
+
+// Plans the reshape of both operands of a binary op so that neither exceeds rank 4.
+// Returns (ok, new_a_shape, new_b_shape). ok is false when neither operand has rank > 4
+// (shapes are returned untouched) or when two equal-rank, differently-shaped operands
+// could not be folded down to rank 4.
+auto squeeze_binary_shape(shape_t old_a_shape, shape_t old_b_shape)
+{
+    auto a_size = old_a_shape.size();
+    auto b_size = old_b_shape.size();
+    auto squeeze_times = std::max(a_size > 4 ? a_size - 4 : 0, b_size > 4 ? b_size - 4 : 0);
+    if (squeeze_times <= 0)
+        return std::tuple(false, old_a_shape, old_b_shape);
+    shape_t new_a_shape, new_b_shape;
+
+    if (a_size == b_size)
+    {
+        // 1.   a.shape == b.shape: both operands are squeezed the same way
+        if (old_a_shape == old_b_shape)
+        {
+            new_a_shape = squeeze_shape(old_a_shape);
+            new_b_shape = squeeze_shape(old_b_shape);
+        }
+        // 2.   a.shape : [s1, s2, s3, s4, s5]
+        //      b.shape :  [1, 1, s3, 1, 1] ||[1, 1, 1, s4, 1]  ||...
+        else
+        {
+            new_a_shape = old_a_shape;
+            new_b_shape = old_b_shape;
+
+            // find which dims can be folded: only dims where a and b differ are candidates
+            std::vector<bool> can_fold_index_list(a_size, true);
+            std::vector<std::tuple<size_t, size_t>> fold_index_couple;
+            for (size_t i = 0; i < a_size; i++)
+            {
+                if (old_a_shape[i] == old_b_shape[i])
+                    can_fold_index_list[i] = false;
+            }
+            for (size_t i = a_size - 1; i > 0; i--) // collected back-to-front, so back() is the front-most couple
+            {
+                if (can_fold_index_list[i] && can_fold_index_list[i - 1])
+                    fold_index_couple.emplace_back(std::make_tuple(i - 1, i));
+            }
+
+            // Couples index the ORIGINAL shape; every earlier fold erased one dim in front of them.
+            while (squeeze_times && !fold_index_couple.empty())
+            {
+                auto shift = a_size - new_a_shape.size(); // dims erased so far: rebase the stored indices
+                auto front = std::get<0>(fold_index_couple.back()) - shift;
+                auto back = std::get<1>(fold_index_couple.back()) - shift;
+                new_a_shape[front] *= new_a_shape[back];
+                new_b_shape[front] *= new_b_shape[back];
+                new_a_shape.erase(std::begin(new_a_shape) + back);
+                new_b_shape.erase(std::begin(new_b_shape) + back);
+                fold_index_couple.pop_back();
+                squeeze_times--;
+            }
+
+            if (new_a_shape.size() > 4)
+            {
+                // still too long: drop one leading or trailing dim that is 1 in both operands
+                if (new_a_shape.front() == 1 && new_b_shape.front() == 1)
+                {
+                    new_a_shape.erase(std::begin(new_a_shape));
+                    new_b_shape.erase(std::begin(new_b_shape));
+                }
+                else if (new_a_shape.back() == 1 && new_b_shape.back() == 1)
+                {
+                    new_a_shape.erase(std::end(new_a_shape) - 1);
+                    new_b_shape.erase(std::end(new_b_shape) - 1);
+                }
+            }
+
+            new_a_shape.shrink_to_fit();
+            new_b_shape.shrink_to_fit();
+            if (new_a_shape.size() > 4)
+                return std::make_tuple(false, new_a_shape, new_b_shape);
+        }
+    }
+    else // different ranks: squeeze each operand on its own; a rank-1 operand is left as is
+    {
+        if (a_size != 1)
+            new_a_shape = squeeze_shape(old_a_shape);
+        else
+            new_a_shape = old_a_shape;
+        if (b_size != 1)
+            new_b_shape = squeeze_shape(old_b_shape);
+        else
+            new_b_shape = old_b_shape;
+    }
+    return std::make_tuple(true, new_a_shape, new_b_shape);
+}
+
+// Plans a rank-4 equivalent of a transpose whose input has rank > 4.
+// Two input dims d, d + 1 that sit next to each other and in the same order in the
+// permutation (perm[p] == d, perm[p + 1] == d + 1) can be merged into one dim.
+// Example: shape [n, a, c, h, w] with perm [0, 1, 3, 4, 2] becomes
+//          shape [n * a, c, h, w] with perm [0, 2, 3, 1].
+// Returns (ok, new_perm, new_input_shape). ok is false, and the arguments are returned
+// unchanged, when the input is already rank <= 4 or too few dims can be merged.
+// old_shape is the shape of the transpose input; old_axis is the permutation.
+auto squeeze_transpose_shape(shape_t old_shape, axis_t old_axis)
+{
+    if (old_shape.size() <= 4)
+        return std::make_tuple(false, old_axis, old_shape);
+
+    axis_t new_axis = old_axis;
+    shape_t new_shape = old_shape;
+
+    for (size_t squeeze_times = old_shape.size() - 4; squeeze_times > 0; squeeze_times--)
+    {
+        // front-most perm position whose entry is directly followed by its successor
+        size_t pos = 0;
+        while (pos + 1 < new_axis.size() && new_axis[pos] + 1 != new_axis[pos + 1])
+            pos++;
+        if (pos + 1 >= new_axis.size())
+            return std::make_tuple(false, old_axis, old_shape);
+
+        // new_shape is the transpose INPUT shape, so the dims to merge are the ones the
+        // perm entries name (dim, dim + 1), not the perm positions (pos, pos + 1).
+        auto dim = static_cast<size_t>(new_axis[pos]);
+        new_shape[dim] *= new_shape[dim + 1];
+        new_shape.erase(std::begin(new_shape) + dim + 1);
+        new_axis.erase(std::begin(new_axis) + pos + 1);
+
+        // input dim `dim + 1` is gone: shift every perm entry that pointed behind it.
+        // Renumbering after each merge keeps later merges working on up-to-date indices.
+        for (auto &axis : new_axis)
+        {
+            if (static_cast<size_t>(axis) > dim)
+                axis--;
+        }
+    }
+
+    return std::make_tuple(true, new_axis, new_shape);
+}
+
+auto squeeze_concat_shape(std::vector<shape_t> &old_shape, int concat_axis) // rewrites every entry of old_shape in place to a 4-element shape; returns the adjusted concat axis
+{
+    int new_axis = 0;
+    for (int index = 0; index < old_shape.size(); index++)
+    {
+        auto tmp_axis = concat_axis; // recomputed from the original axis for every input
+        auto squeeze_times = old_shape[index].size() - 4; // NOTE(review): unsigned arithmetic, wraps for a shape of rank < 4 - confirm callers never pass one
+        shape_t new_shape { 1, 1, 1, 1 };
+        for (int i = 0, j = 0; i < 4; i++, j++) // i: output slot, j: input dim
+        {
+            if (concat_axis > old_shape[index].size() - 4 - 1 && squeeze_times != 0) // NOTE(review): int compared against size_t; when this is false for a rank > 4 shape only its first 4 dims are copied - confirm intended
+            {
+                new_shape[i] = old_shape[index][j] * old_shape[index][j + 1]; // merge input dims j and j + 1 into slot i
+                squeeze_times--;
+                j++; // the merged pair consumed two input dims
+                tmp_axis--; // axis moves left by one per merged pair; NOTE(review): also when the merged pair contains or follows the axis - confirm
+            }
+            else
+            {
+                new_shape[i] = old_shape[index][j];
+            }
+        }
+        old_shape[index] = new_shape;
+        new_axis = tmp_axis; // the value computed for the last input is the one returned
+    }
+
+    return new_axis;
+}
+
+bool check_op(node_opcode op)
+{
+    // opcodes that squeeze_dims_transform::process() has a branch for
+    const bool is_squeezable = op == op_binary || op == op_sigmoid || op == op_transpose || op == op_concat;
+    return is_squeezable;
+}
+
+// Matches a binary / sigmoid / transpose / concat node with at least one input or output of
+// rank > 4. On a match every input and output connector is recorded in the node's own order
+// (process() reads inputs[0] / inputs[1] as operands a / b), followed by the node itself.
+// Returns false for other opcodes, when nothing exceeds rank 4, or when the binary /
+// transpose shape planner reports that the node cannot be squeezed.
+bool squeeze_dims_transform::on_try_match(node &node, transform_context &context)
+{
+    if (!check_op(node.runtime_opcode()))
+        return false;
+
+    bool need_squeeze = false;
+    for (auto &it : node.inputs())
+    {
+        if (it->shape().size() > 4)
+            need_squeeze = true;
+    }
+    for (auto &it : node.outputs())
+    {
+        if (it->shape().size() > 4)
+            need_squeeze = true;
+    }
+    if (!need_squeeze)
+        return false;
+
+    // Record connectors in declaration order. Collecting the rank > 4 inputs first and
+    // appending the remaining ones afterwards would swap the operands of a non-commutative
+    // binary such as `a[rank 4] - b[rank 5]`.
+    for (auto &it : node.inputs())
+    {
+        context.inputs.emplace_back(it);
+    }
+    for (auto &it : node.outputs())
+    {
+        context.outputs.emplace_back(it);
+    }
+    context.matched_nodes.emplace_back(&node);
+
+    // binary and transpose can still be rejected by their shape planners;
+    // sigmoid and concat are accepted unconditionally
+    bool can_squeeze = true;
+    NNCASE_UNUSED shape_t a_shape, b_shape;
+    NNCASE_UNUSED axis_t new_axis;
+    if (node.runtime_opcode() == op_binary)
+        std::tie(can_squeeze, a_shape, b_shape) = squeeze_binary_shape(context.inputs[0]->shape(), context.inputs[1]->shape());
+    else if (node.runtime_opcode() == op_transpose)
+        std::tie(can_squeeze, new_axis, b_shape) = squeeze_transpose_shape(node_cast<transpose>(node)->input().shape(), node_cast<transpose>(node)->perm());
+
+    return can_squeeze;
+}
+
+void squeeze_dims_transform::process(transform_context &context) // rebuilds the matched node as: input bitcast(s) -> same op on squeezed shapes -> output bitcast
+{
+    if (context.matched_nodes[0]->runtime_opcode() == op_binary)
+    {
+        auto &output_a = *context.inputs[0]->connection(); // producer feeding operand a
+        auto &output_b = *context.inputs[1]->connection(); // producer feeding operand b
+        auto inputs = context.outputs[0]->connections(); // consumers of the old node's output
+        auto &old_binary = static_cast<binary &>(*context.matched_nodes[0]);
+
+        bitcast *in_a_bitc, *in_b_bitc, *out_bitc;
+        auto [_, new_a_shape, new_b_shape] = squeeze_binary_shape(output_a.shape(), output_b.shape()); // same planner call as in on_try_match; the ok flag is ignored here
+        if (output_a.shape().size() > 4)
+            in_a_bitc = context.graph.emplace<bitcast>(output_a.type(), output_a.shape(), new_a_shape);
+        else
+            in_a_bitc = context.graph.emplace<bitcast>(output_a.type(), output_a.shape(), output_a.shape()); // rank <= 4 operand: bitcast to its own shape
+
+        if (output_b.shape().size() > 4)
+            in_b_bitc = context.graph.emplace<bitcast>(output_b.type(), output_b.shape(), new_b_shape);
+        else
+            in_b_bitc = context.graph.emplace<bitcast>(output_b.type(), output_b.shape(), output_b.shape());
+
+        auto new_binary = context.graph.emplace<binary>(old_binary.binary_op(), in_a_bitc->output().type(), in_a_bitc->output().shape(), in_b_bitc->output().shape(),
+            old_binary.fused_activation()); // NOTE(review): old_binary.attributes() is not copied to the replacement - confirm intended
+        if (old_binary.output_at(0).shape().size() > 4)
+            out_bitc = context.graph.emplace<bitcast>(new_binary->output().type(), new_binary->output().shape(), old_binary.output_at(0).shape()); // back to the original rank > 4 shape
+        else
+            out_bitc = context.graph.emplace<bitcast>(new_binary->output().type(), new_binary->output().shape(), new_binary->output().shape());
+
+        in_a_bitc->name(old_binary.name() + "_in_a_bitc");
+        in_b_bitc->name(old_binary.name() + "_in_b_bitc");
+        new_binary->name(old_binary.name()); // the replacement keeps the old node's name
+        out_bitc->name(old_binary.name() + "_out_bitc");
+
+        new_binary->input_a().connect(in_a_bitc->output());
+        new_binary->input_b().connect(in_b_bitc->output());
+        out_bitc->input().connect(new_binary->output());
+
+        in_a_bitc->input().connect(output_a);
+        in_b_bitc->input().connect(output_b);
+        for (auto &in : dup(inputs)) // re-point every old consumer at the output bitcast; dup() presumably copies the list - confirm
+            in->connect(out_bitc->output());
+    }
+    else if (context.matched_nodes[0]->runtime_opcode() == op_sigmoid)
+    {
+        auto &output = *context.inputs[0]->connection(); // producer feeding the sigmoid
+        auto inputs = context.outputs[0]->connections(); // consumers of the old node's output
+        auto &old_sigmoid = static_cast<sigmoid &>(*context.matched_nodes[0]);
+
+        bitcast *in_bitc, *out_bitc;
+        if (output.shape().size() > 4)
+            in_bitc = context.graph.emplace<bitcast>(output.type(), output.shape(), squeeze_shape(output.shape())); // sigmoid has no planner of its own: plain squeeze_shape
+        else
+            in_bitc = context.graph.emplace<bitcast>(output.type(), output.shape(), output.shape());
+
+        auto new_sigmoid = context.graph.emplace<sigmoid>(in_bitc->output().type(), in_bitc->output().shape());
+        if (old_sigmoid.output_at(0).shape().size() > 4)
+            out_bitc = context.graph.emplace<bitcast>(new_sigmoid->output().type(), new_sigmoid->output().shape(), old_sigmoid.output_at(0).shape());
+        else
+            out_bitc = context.graph.emplace<bitcast>(new_sigmoid->output().type(), new_sigmoid->output().shape(), new_sigmoid->output().shape());
+
+        in_bitc->name(old_sigmoid.name() + "_in_bitc");
+        new_sigmoid->name(old_sigmoid.name());
+        out_bitc->name(old_sigmoid.name() + "_out_bitc");
+
+        new_sigmoid->input().connect(in_bitc->output());
+        out_bitc->input().connect(new_sigmoid->output());
+
+        in_bitc->input().connect(output);
+        for (auto &in : dup(inputs))
+            in->connect(out_bitc->output());
+    }
+    else if (context.matched_nodes[0]->runtime_opcode() == op_transpose)
+    {
+        auto &output = *context.inputs[0]->connection(); // producer feeding the transpose
+        auto inputs = context.outputs[0]->connections(); // consumers of the old node's output
+        auto &old_transpose = static_cast<transpose &>(*context.matched_nodes[0]);
+
+        auto [_, new_axis, new_shape] = squeeze_transpose_shape(output.shape(), old_transpose.perm()); // new_shape is used as the squeezed INPUT shape below
+
+        bitcast *in_bitc, *out_bitc;
+        if (output.shape().size() > 4)
+            in_bitc = context.graph.emplace<bitcast>(output.type(), output.shape(), new_shape);
+        else
+            in_bitc = context.graph.emplace<bitcast>(output.type(), output.shape(), output.shape());
+
+        auto new_transpose = context.graph.emplace<transpose>(in_bitc->output().type(), in_bitc->output().shape(), new_axis);
+        if (old_transpose.output_at(0).shape().size() > 4)
+            out_bitc = context.graph.emplace<bitcast>(new_transpose->output().type(), new_transpose->output().shape(), old_transpose.output_at(0).shape());
+        else
+            out_bitc = context.graph.emplace<bitcast>(new_transpose->output().type(), new_transpose->output().shape(), new_transpose->output().shape());
+
+        in_bitc->name(old_transpose.name() + "_in_bitc");
+        new_transpose->name(old_transpose.name());
+        out_bitc->name(old_transpose.name() + "_out_bitc");
+
+        new_transpose->input().connect(in_bitc->output());
+        out_bitc->input().connect(new_transpose->output());
+
+        in_bitc->input().connect(output);
+        for (auto &in : dup(inputs))
+            in->connect(out_bitc->output());
+    }
+    else if (context.matched_nodes[0]->runtime_opcode() == op_concat)
+    {
+        auto inputs = context.outputs[0]->connections(); // consumers of the old node's output
+        auto &old_concat = static_cast<concat &>(*context.matched_nodes[0]);
+
+        std::vector<shape_t> concat_shape;
+        std::vector<output_connector *> concat_inputs; // never used below
+
+        for (auto &it : context.inputs)
+        {
+            concat_shape.emplace_back(it->shape());
+        }
+        auto new_axis = squeeze_concat_shape(concat_shape, old_concat.axis()); // overwrites concat_shape with the squeezed shapes
+        auto new_concat = context.graph.emplace<concat>(old_concat.output().type(), concat_shape, new_axis);
+        new_concat->name(old_concat.name());
+
+        for (size_t i = 0; i < context.inputs.size(); i++) // one input bitcast per concat operand, in context.inputs order
+        {
+            auto in_bitc = context.graph.emplace<bitcast>(context.inputs[i]->connection()->type(), context.inputs[i]->connection()->shape(), concat_shape[i]);
+
+            in_bitc->input().connect(*context.inputs[i]->connection());
+            in_bitc->name(old_concat.name() + "_in_bitc_" + std::to_string(i));
+            new_concat->input_at(i).connect(in_bitc->output());
+        }
+        bitcast *out_bitc;
+        if (old_concat.output_at(0).shape().size() > 4)
+            out_bitc = context.graph.emplace<bitcast>(new_concat->output().type(), new_concat->output().shape(), old_concat.output_at(0).shape());
+        else
+            out_bitc = context.graph.emplace<bitcast>(new_concat->output().type(), new_concat->output().shape(), new_concat->output().shape());
+
+        out_bitc->name(old_concat.name() + "_out_bitc");
+
+        out_bitc->input().connect(new_concat->output());
+
+        for (auto &in : dup(inputs))
+            in->connect(out_bitc->output());
+    }
+}
diff --git a/src/transforms/neutral/transpose_motion.cpp b/src/transforms/neutral/transpose_motion.cpp
index 06d3f3118d..b92da4e461 100644
--- a/src/transforms/neutral/transpose_motion.cpp
+++ b/src/transforms/neutral/transpose_motion.cpp
@@ -66,6 +66,7 @@ void transpose_binary_motion_transform::process(transform_context &context)
     auto &old_bin = static_cast<binary &>(*context.matched_nodes[2]);
 
     auto bin = context.graph.emplace<binary>(old_bin.binary_op(), output_a.type(), output_a.shape(), output_b.shape(), old_bin.fused_activation());
+    bin->attributes(old_bin.attributes());
     bin->name(old_bin.name());
     auto tp = context.graph.emplace<transpose>(bin->output().type(), bin->output().shape(), old_tp.perm());
     tp->name(old_tp.name());
@@ -138,6 +139,7 @@ void transpose_constant_binary_motion_transform::process(transform_context &cont
     if (old_bin.input_a().connection()->owner().runtime_opcode() == op_constant)
     {
         bin = context.graph.emplace<binary>(old_bin.binary_op(), output.type(), con->output().shape(), output.shape(), old_bin.fused_activation());
+        bin->attributes(old_bin.attributes());
         bin->name(old_bin.name());
         bin->input_a().connect(con->output());
         bin->input_b().connect(output);
@@ -145,6 +147,7 @@ void transpose_constant_binary_motion_transform::process(transform_context &cont
     else
     {
         bin = context.graph.emplace<binary>(old_bin.binary_op(), output.type(), output.shape(), con->output().shape(), old_bin.fused_activation());
+        bin->attributes(old_bin.attributes());
         bin->name(old_bin.name());
         bin->input_a().connect(output);
         bin->input_b().connect(con->output());
@@ -317,6 +320,7 @@ void transpose_reduce_motion_transform::process(transform_context &context)
     }
 
     auto r = context.graph.emplace<reduce>(old_r.reduce_op(), output.type(), output.shape(), axes, old_r.init_value(), old_r.keep_dims());
+    r->attributes(old_r.attributes());
     r->name(old_r.name());
     auto tp = context.graph.emplace<transpose>(r->output().type(), r->output().shape(), perm);
     tp->name(old_tp.name());
@@ -355,6 +359,7 @@ void transpose_unary_motion_transform::process(transform_context &context)
     auto &old_u = static_cast<unary &>(*context.matched_nodes[1]);
 
     auto u = context.graph.emplace<unary>(old_u.unary_op(), output.shape());
+    u->attributes(old_u.attributes());
     u->name(old_u.name());
     auto tp = context.graph.emplace<transpose>(u->output().type(), u->output().shape(), old_tp.perm());
     tp->name(old_tp.name());
@@ -501,6 +506,7 @@ void transpose_sigmoid_motion_transform::process(transform_context &context)
 
         auto new_sigmd = context.graph.emplace<sigmoid>(old_tp.input().type(), old_tp.input().shape());
         auto new_b = context.graph.emplace<binary>(old_b.binary_op(), old_tp.input().type(), old_tp.input().shape(), new_sigmd->output().shape(), old_b.fused_activation());
+        new_b->attributes(old_b.attributes());
         auto new_tp = context.graph.emplace<transpose>(new_b->output().type(), new_b->output().shape(), old_tp.perm());
         new_sigmd->name(old_sigmd.name());
         new_b->name(old_b.name());
diff --git a/targets/cpu/cpu_target.cpp b/targets/cpu/cpu_target.cpp
index 19a8d18afe..e01288fbea 100644
--- a/targets/cpu/cpu_target.cpp
+++ b/targets/cpu/cpu_target.cpp
@@ -14,7 +14,10 @@
  */
 #include "cpu_target.h"
 #include <nncase/plugin_loader.h>
+#include <nncase/transforms/neutral/add_quant_checkpoints.h>
 #include <nncase/transforms/neutral/fold_constant.h>
+#include <nncase/transforms/neutral/fuse_unary.h>
+#include <nncase/transforms/neutral/fused_unary_to_lookup1d.h>
 #include <nncase/transforms/neutral/lstm_transform.h>
 #include <nncase/transforms/pass.h>
 
@@ -46,4 +49,23 @@ void cpu_target::register_target_dependent_passes([[maybe_unused]] const module_
         p.emplace<lstm_transform>();
         pass_mgr.add_pass(std::move(p));
     }
+}
+
+void cpu_target::register_quantize_annotation_passes([[maybe_unused]] const module_type_t &type, ir::transforms::pass_manager &pass_mgr) // adds two passes to pass_mgr, in this order
+{
+    { // pass 1: "fuse_unary", five fusion transforms registered on one transform_pass
+        transform_pass p("fuse_unary");
+        p.emplace<fuse_one_unary_transform>();
+        p.emplace<fuse_one_binary_transform>();
+        p.emplace<fuse_two_fused_unary_transform>();
+        p.emplace<fuse_one_fused_unary_with_binary_transform>();
+        p.emplace<fuse_two_fused_unary_with_binary_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
+
+    { // pass 2: "annotate_neutral_quantize", added after fuse_unary
+        transform_pass p("annotate_neutral_quantize");
+        p.emplace<add_quant_checkpoints_transform>(std::in_place, ir::op_fused_unary, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_output_node); // presumably the opcodes that receive quantization checkpoints - confirm against add_quant_checkpoints.h
+        pass_mgr.add_pass(std::move(p));
+    }
 }
\ No newline at end of file
diff --git a/targets/cpu/cpu_target.h b/targets/cpu/cpu_target.h
index 5e68a0976a..14fdc1ef79 100644
--- a/targets/cpu/cpu_target.h
+++ b/targets/cpu/cpu_target.h
@@ -23,5 +23,6 @@ class cpu_target : public neutral_target
     using neutral_target::neutral_target;
 
     void register_target_dependent_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr, bool use_ptq, bool split_w_to_act) override;
+    void register_quantize_annotation_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr) override;
 };
 }
diff --git a/targets/k210/k210_target.cpp b/targets/k210/k210_target.cpp
index 660a64a950..16270efb31 100644
--- a/targets/k210/k210_target.cpp
+++ b/targets/k210/k210_target.cpp
@@ -37,6 +37,8 @@
 #include <nncase/transforms/neutral/fold_quantize.h>
 #include <nncase/transforms/neutral/fold_transpose.h>
 #include <nncase/transforms/neutral/fuse_pad.h>
+#include <nncase/transforms/neutral/fuse_unary.h>
+#include <nncase/transforms/neutral/fused_unary_to_lookup1d.h>
 #include <nncase/transforms/neutral/lstm_transform.h>
 #include <nncase/transforms/neutral/matmul_to_conv2d.h>
 #include <nncase/transforms/neutral/pad_conv.h>
@@ -138,7 +140,7 @@ void k210_target::register_target_dependent_passes([[maybe_unused]] const module
     }
 }
 
-void k210_target::register_quantize_annotation_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr)
+void k210_target::register_quantize_annotation_passes(NNCASE_UNUSED const module_type_t &type, ir::transforms::pass_manager &pass_mgr)
 {
     {
         transform_pass p("annotate_kpu1");
@@ -150,7 +152,15 @@ void k210_target::register_quantize_annotation_passes(const module_type_t &type,
         pass_mgr.add_pass(std::move(p));
     }
 
-    neutral_target::register_quantize_annotation_passes(type, pass_mgr);
+    {
+        transform_pass p("fuse_unary");
+        p.emplace<fuse_one_unary_transform>();
+        p.emplace<fuse_one_binary_transform>();
+        p.emplace<fuse_two_fused_unary_transform>();
+        p.emplace<fuse_one_fused_unary_with_binary_transform>();
+        p.emplace<fuse_two_fused_unary_with_binary_transform>();
+        pass_mgr.add_pass(std::move(p));
+    }
 
     {
         transform_pass p("annotate_kpu2");
@@ -172,7 +182,7 @@ void k210_target::register_quantize_annotation_passes(const module_type_t &type,
 
     {
         transform_pass p("annotate_kpu_quantize");
-        p.emplace<add_quant_checkpoints_transform>(std::in_place, ir::op_fused_unary, ir::k210::op_k210_fake_kpu_conv2d, ir::op_bitcast, ir::op_dequantize, ir::op_binary);
+        p.emplace<add_quant_checkpoints_transform>(std::in_place, ir::op_fused_unary, ir::k210::op_k210_fake_kpu_conv2d, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_slice);
         pass_mgr.add_pass(std::move(p));
     }
 }
diff --git a/tests/ci_proxy.py b/tests/ci_proxy.py
new file mode 100644
index 0000000000..ab876cddf1
--- /dev/null
+++ b/tests/ci_proxy.py
@@ -0,0 +1,178 @@
+import os
+import argparse
+import stat
+import socket
+import json
+import threading
+import queue
+import logging
+import logging.handlers
+import telnetlib
+import time
+
+class TelnetClient():
+    def __init__(self, mylogger):
+        self.tn = telnetlib.Telnet()
+        self.logger = mylogger
+        self.ip = '10.99.105.216'
+        self.timeout = 60
+
+    def login(self, ip, username, password):
+        try:
+            self.tn.open(ip, port=23)
+        except:
+            self.logger.error('telnet {0} failed'.format(ip))
+            return False
+
+        self.ip = ip
+        self.tn.read_until(b'login: ', timeout=self.timeout)
+        self.tn.write(username.encode() + b'\r\n')
+
+        cmd_result = self.tn.read_very_eager().decode()
+        if 'Login incorrect' not in cmd_result:
+            self.logger.info('{0} login succeed'.format(ip))
+            return True
+        else:
+            self.logger.error('{0} login failed'.format(ip))
+            return False
+
+    def logout(self):
+        self.tn.close()
+        self.logger.info('{0} logout succeed'.format(self.ip))
+
+    def execute(self, cmd, flag):
+        self.logger.debug('execute: cmd = {0}, flag = {1}'.format(cmd, flag))
+        self.tn.write(cmd.encode() + b'\r\n')
+        cmd_result = self.tn.read_until(flag.encode(), timeout=self.timeout).decode()
+        if flag not in cmd_result:
+            # time out
+            self.tn.write(telnetlib.IP)
+            cmd_result = f'timeout for {self.timeout} seconds'
+            self.logger.error('execute {0} failed: {1}'.format(cmd, cmd_result))
+            return cmd_result, False
+        else:
+            self.tn.write('echo $?'.encode() + b'\r\n')
+            cmd_status = self.tn.read_until(flag.encode(), self.timeout).decode()
+            if cmd_status.find('\r\n0\r\n') == -1:
+                self.logger.error('execute {0} failed: {1}'.format(cmd, cmd_result))
+                return cmd_result, False
+            else:
+                return cmd_result, True
+
+def recv_file(conn, target_root, mylogger):
+    header = conn.recv(1024)
+    file_dict = json.loads(header.decode())
+    file_name = file_dict['file_name']
+    file_size = file_dict['file_size']
+    mylogger.debug('recv: file = {0}, size = {1}'.format(file_name, file_size))
+    conn.sendall(f"pls send {file_name}".encode())
+
+    full_file = os.path.join(target_root, file_name)
+    with open(full_file, 'wb') as f:
+        recv_size = 0
+        while recv_size < file_size:
+            slice = conn.recv(4096)
+            f.write(slice)
+            recv_size += len(slice)
+
+    conn.sendall(f"recv {file_name} succeed".encode())
+    os.chmod(full_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
+    return file_name
+
+def Consumer(kpu_target, kpu_ip, kpu_username, kpu_password, nfsroot, q, mylogger):  # worker loop: serves the connections taken from q one at a time and never returns
+    # create the per-target directory <nfsroot>/<kpu_target> that received files are written to
+    target_root = os.path.join(nfsroot, kpu_target)
+    if not os.path.exists(target_root):
+        os.makedirs(target_root)
+
+    telnet_client = TelnetClient(mylogger)
+    while True:
+        cmd = './'  # command line is built as ./<first file> <second file> ...
+        conn = q.get()  # blocks until main() has queued an accepted connection
+
+        # recv header
+        header = conn.recv(1024)
+        header_dict = json.loads(header.decode())
+        mylogger.info("test case = {0}".format(header_dict['case']))
+        file_num = header_dict['app'] + header_dict['kmodel'] + header_dict['inputs']  # the header carries a COUNT per kind of file
+        conn.sendall(f"pls send {file_num} files".encode())
+
+        # recv all kinds of files(app + kmodel + inputs)
+        for i in range(file_num):
+            file = recv_file(conn, target_root, mylogger)
+            if i == 0:
+                cmd = cmd + file
+            else:
+                cmd = cmd + ' ' + file
+
+        # telnet to the target device to run inference
+        telnet_client.login(kpu_ip, kpu_username, kpu_password)  # NOTE(review): the return value is ignored - confirm execute() is safe after a failed login
+        flag = f'/mnt/{kpu_target} ]$'  # text execute() waits for after the cd; presumably the board's shell prompt - confirm
+        cmd_result, cmd_status = telnet_client.execute(f'cd /mnt/{kpu_target} && {cmd}', flag)
+        if cmd_status:
+            conn.sendall(f'infer succeed'.encode())
+            dummy = conn.recv(1024)  # reply is read and discarded; presumably the client's ack
+
+            # send outputs
+            for i in range(header_dict['outputs']):
+                file = os.path.join(target_root, f'nncase_result_{i}.bin')
+                file_size = os.path.getsize(file)
+                conn.sendall(str(file_size).encode())  # size first, then the payload, each followed by a reply from the client
+                dummy = conn.recv(1024)
+
+                with open(file, 'rb') as f:
+                    conn.sendall(f.read())
+                dummy = conn.recv(1024)
+                mylogger.debug('send: file = {0}, size = {1}'.format(file, file_size))
+        else:
+            conn.sendall(f'infer failed on {kpu_target} board: {cmd_result}'.encode())
+        conn.close()
+
+        if 'timeout' not in cmd_result:  # NOTE(review): substring test also matches command output that merely contains "timeout" - confirm acceptable
+            telnet_client.logout()
+        else:
+            # reboot kpu_target when timeout
+            telnet_client.logout()
+            mylogger.error('reboot {0}({1}) for timeout'.format(kpu_target, kpu_ip))
+            telnet_client.login(kpu_ip, kpu_username, kpu_password)
+            flag = f'[{kpu_username}@canaan ~ ]$'
+            telnet_client.execute('reboot', flag)
+            telnet_client.logout()
+            time.sleep(60)  # wait before serving the next connection; presumably long enough for the board to reboot - confirm
+
+def main():
+    # args
+    parser = argparse.ArgumentParser(prog="ci_proxy")
+    parser.add_argument("--kpu_target", help='kpu device target', type=str, default='k510')
+    parser.add_argument("--kpu_ip", help='kpu deivce ip address', type=str, default='10.99.105.216')
+    parser.add_argument("--kpu_username", help='kpu device usernmae', type=str, default='root')
+    parser.add_argument("--kpu_password", help='kpu device password', type=str, default='')
+    parser.add_argument("--nfsroot", help='nfsroot on pc', type=str, default='/data/nfs')
+    parser.add_argument("--port", help='listenning port of ci_proxy', type=int, default=51000)
+    args = parser.parse_args()
+
+    # logging
+    mylogger = logging.getLogger()
+    mylogger.setLevel(logging.DEBUG)
+    rf_handler = logging.handlers.RotatingFileHandler(f'ci_proxy_{args.kpu_target}.log', mode='a', maxBytes=32 * 1024 * 1024, backupCount=10)
+    rf_handler.setLevel(logging.INFO)
+    rf_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
+    mylogger.addHandler(rf_handler)
+
+    # producer
+    size = 256
+    q = queue.Queue(maxsize=size)
+
+    # comsumer
+    t_consumer = threading.Thread(target=Consumer, args=(args.kpu_target, args.kpu_ip, args.kpu_username, args.kpu_password, args.nfsroot, q, mylogger))
+    t_consumer.start()
+
+    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    server_socket.bind(('localhost', args.port))
+    server_socket.listen(size)
+    while True:
+        conn, addr = server_socket.accept()
+        q.put(conn)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/tests/config.yml b/tests/config.yml
index ec895d53b6..75eba9c5d0 100644
--- a/tests/config.yml
+++ b/tests/config.yml
@@ -108,9 +108,13 @@ judge:
   specifics:
     - matchs:
         #target: [cpu, vulkan, k210, k510]
-        target: [cpu, k210, k510]
+        target: [cpu, k510]
         ptq: true
       threshold: 0.98
+    - matchs:
+        target: [k210]
+        ptq: true
+      threshold: 0.97
     - matchs:
         target: [k510]
         ptq: false
diff --git a/tests/dataset_utils.py b/tests/dataset_utils.py
index ac36956487..385992607c 100644
--- a/tests/dataset_utils.py
+++ b/tests/dataset_utils.py
@@ -2,7 +2,8 @@
 import numpy as np
 from compare_util import *
 import copy
-
+import socket
+import json
 
 def get_topK(info, k, result):
     tmp = copy.deepcopy(result)
@@ -27,3 +28,85 @@ def sim_run(kmodel, data, paths, target, model_type, model_shape):
         for i in range(len(tmp)):
             f.write(tmp[i][0].split("/")[-1] + " " + str(tmp[i][1][0]) + '\n')
     return tmp
+
+def on_board_run(kmodel, data, paths, target, port, case, nncase_test_ci, input_num, output_num, model_type, model_shape):
+    """Run one sample on the board via the server at localhost:`port` and log its top-1.
+
+    Uploads the test app, the kmodel and the input tensor, waits for the server's
+    status text, then downloads the outputs. For every output,
+    `(data[1], get_topK(target, 1, result))` is formatted and appended to the file
+    `paths[-1][1]`.
+
+    Args:
+        kmodel: kmodel content as bytes; sent as 'test.kmodel'.
+        data: `data[0]` is the input array; `data[1]` is a string whose last
+            '/'-separated part is written to the result file.
+        nncase_test_ci: path of the app file to upload.
+        model_type, model_shape: unless `model_type` is "tflite" or `model_shape[-1]`
+            is 3, the input is transposed with [0, 3, 1, 2] before being sent.
+
+    Returns the one-element list built for the last output ([] if `output_num` is 0).
+    Raises Exception when the server's status text lacks 'succeed', and ConnectionError
+    when the peer closes the socket in the middle of an output.
+    """
+    def send_blob(sock, name, payload):
+        # each transfer is: JSON descriptor -> ack -> raw bytes -> ack
+        sock.sendall(json.dumps({'file_name': name, 'file_size': len(payload)}).encode())
+        sock.recv(1024)
+        sock.sendall(payload)
+        sock.recv(1024)
+
+    # float32 payload; identical for every input slot, so build it once
+    if model_type != "tflite" and model_shape[-1] != 3:
+        new_data = np.transpose(data[0], [0, 3, 1, 2]).astype(np.float32)
+    else:
+        new_data = data[0].astype(np.float32)
+    data_in_bytes = new_data.tobytes()
+
+    # the context manager closes the socket on every exit path, including errors
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:
+        client_socket.connect(('localhost', int(port)))
+
+        # send header
+        header_dict = {'case': case, 'app': 1, 'kmodel': 1, 'inputs': 1, 'outputs': 1}
+        client_socket.sendall(json.dumps(header_dict).encode())
+        client_socket.recv(1024)
+
+        # send app, kmodel and inputs
+        with open(nncase_test_ci, 'rb') as f:
+            send_blob(client_socket, os.path.basename(nncase_test_ci), f.read())
+        send_blob(client_socket, 'test.kmodel', kmodel)
+        for i in range(input_num):
+            send_blob(client_socket, f'input_0_{i}.bin', data_in_bytes)
+
+        # infer result
+        cmd_result = client_socket.recv(1024).decode()
+        if cmd_result.find('succeed') == -1:
+            raise Exception(f'{cmd_result}')
+        client_socket.sendall(f"pls send outputs".encode())
+
+        # recv outputs
+        tmp = []
+        for i in range(output_num):
+            file_size = int(client_socket.recv(1024).decode())
+            client_socket.sendall(f"pls send nncase_result_{i}.bin".encode())
+
+            # read exactly file_size bytes; recv() may return short chunks
+            buffer = bytearray()
+            while len(buffer) < file_size:
+                chunk = client_socket.recv(min(4096, file_size - len(buffer)))
+                if not chunk:
+                    raise ConnectionError(f'connection closed while receiving nncase_result_{i}.bin')
+                buffer += chunk
+
+            # result
+            result = np.frombuffer(buffer, dtype=np.float32)
+            tmp = [(data[1], get_topK(target, 1, result))]
+            with open(paths[-1][1], 'a') as f:
+                # unpack instead of re-using `i`, which the ack below still needs
+                for name, top in tmp:
+                    f.write(name.split("/")[-1] + " " + str(top[0]) + '\n')
+
+            client_socket.sendall(f"recv nncase_result_{i}.bin succeed".encode())
+
+        return tmp
\ No newline at end of file
diff --git a/tests/importer/onnx_/basic/test_compress.py b/tests/importer/onnx_/basic/test_compress.py
new file mode 100644
index 0000000000..0b4f21f630
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_compress.py
@@ -0,0 +1,124 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+import random
+
+
+def _make_module(in_shape_0, condition_shape, axis=None):
+    """Build an ONNX model: a constant random bool mask feeding one Compress node.
+
+    `axis` is passed as the Compress attribute unless it is None.
+    """
+    attributes_dict = {}
+    nodes = []
+
+    # input
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape_0)
+
+    # output
+    x = np.random.rand(*in_shape_0).astype(np.float32)
+    condition = np.array(np.random.rand(*condition_shape) > .5).astype(np.bool_)
+    # never leave the mask all-False: force its last element to True
+    if not condition.any():
+        condition[-1] = True
+
+    output_shape = np.compress(condition, x, axis=axis).shape
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, output_shape)
+
+    condi_data = helper.make_tensor(
+        'condi_Constant',
+        TensorProto.BOOL,
+        dims=condition_shape,
+        vals=condition.flatten()  # already np.bool_; the np.bool alias is gone since NumPy 1.24
+    )
+    weights_constant = helper.make_node(
+        "Constant",
+        inputs=[],
+        outputs=["condi"],
+        value=condi_data,
+        name="condition")
+
+    nodes.append(weights_constant)
+    if axis is not None:
+        attributes_dict['axis'] = axis
+    node = helper.make_node(
+        'Compress',
+        inputs=['input', 'condi'],
+        outputs=['output'],
+        **attributes_dict
+    )
+    nodes.append(node)
+
+    graph_def = helper.make_graph(
+        nodes,
+        'test-model',
+        [input],
+        [output],
+    )
+    model_def = helper.make_model(graph_def, producer_name='kendryte')
+
+    return model_def
+
+
+in_shapes_0 = [  # shapes of the float input, rank 1 up to rank 4
+    [1],
+    [16],
+    [1, 16],
+    [16, 16],
+    [1, 15, 16],
+    [1, 3, 3, 3]
+]
+
+condition = [  # 1-D shapes (i.e. lengths) of the random boolean mask built by _make_module
+    [1],
+    [3],
+    [6],
+
+]
+
+axes = [  # Compress axis; None builds the node without an axis attribute
+    None,
+    -1,
+    0,
+    1,
+    2,
+    3  # axes that are not < len(in_shape_0) are filtered out inside test_compress
+]
+
+
+@pytest.mark.parametrize('in_shape_0', in_shapes_0)
+@pytest.mark.parametrize('condition', condition)
+@pytest.mark.parametrize('axes', axes)
+def test_compress(in_shape_0, condition, axes, request):
+    """Run Compress when the mask length fits the chosen axis (or the element count if no axis)."""
+    # no axis: the mask is compared against the total number of input elements
+    limit = int(np.prod(in_shape_0)) if axes is None else None
+    if axes is not None and axes < len(in_shape_0):
+        limit = in_shape_0[axes]
+    if limit is not None and condition[0] <= limit:
+        model_def = _make_module(in_shape_0, condition, axes)
+        runner = OnnxTestRunner(request.node.name)
+        model_file = runner.from_onnx_helper(model_def)
+        runner.run(model_file)
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_compress.py'])
diff --git a/tests/importer/onnx_/basic/test_conv.py b/tests/importer/onnx_/basic/test_conv.py
index b58ccc34e1..2cb9717ccf 100644
--- a/tests/importer/onnx_/basic/test_conv.py
+++ b/tests/importer/onnx_/basic/test_conv.py
@@ -195,7 +195,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil
 @pytest.mark.parametrize('pad', pads)
 @pytest.mark.parametrize('stride', strides)
 def test_conv(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, pad, stride, request):
-    if (bias_shape is None or (bias_shape is not None and bias_shape[0] == kernel_output_channel)) and ((auto_pad_mode is not None and pad is None) or (auto_pad_mode is None and pad is not None)) and (dilation is None or (auto_pad_modes is None or auto_pad_modes == 'NOTSET')):
+    if (bias_shape is None or (bias_shape is not None and bias_shape[0] == kernel_output_channel)) and ((auto_pad_mode is not None and pad is None) or (auto_pad_mode is None and pad is not None)) and (dilation is None or auto_pad_mode is None or auto_pad_mode == 'NOTSET'):
         model_def = _make_module(in_shape, kernel_output_channel, bias_shape,
                                  auto_pad_mode, dilation, group, kernel_shape, pad, stride)
 
diff --git a/tests/importer/onnx_/basic/test_conv_transpose.py b/tests/importer/onnx_/basic/test_conv_transpose.py
index ec780ddee3..632a67aa53 100644
--- a/tests/importer/onnx_/basic/test_conv_transpose.py
+++ b/tests/importer/onnx_/basic/test_conv_transpose.py
@@ -30,10 +30,12 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil
     input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
     inputs.append('input')
 
+    group = 1 if group is None else group
+
     # weight
     w_shape = []
     w_shape.append(in_shape[1])
-    w_shape.append(kernel_output_channel)
+    w_shape.append(kernel_output_channel // group)
     w_shape.extend(kernel_shape)
     weight = helper.make_tensor(
         'weight',
@@ -67,7 +69,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil
     # output
     out_shape = []
     out_shape.append(in_shape[0])
-    out_shape.append(w_shape[1])
+    out_shape.append(w_shape[1] * group)
 
     # pad
     padding = [0, 0, 0, 0]
@@ -136,7 +138,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil
 ]
 
 kernel_output_channels = [
-    2
+    3
 ]
 
 bias_shapes = [
@@ -158,6 +160,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil
 
 groups = [
     None,
+    3
 ]
 
 kernel_shapes = [
diff --git a/tests/importer/onnx_/basic/test_conv_transpose1d.py b/tests/importer/onnx_/basic/test_conv_transpose1d.py
new file mode 100644
index 0000000000..ced69786a1
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_conv_transpose1d.py
@@ -0,0 +1,203 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import math
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+
+def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, output_padding, pad, stride):  # builds a one-node 1-D ConvTranspose ONNX model
+    inputs = []
+    initializers = []
+
+    # input
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
+    inputs.append('input')
+
+    group = 1 if group is None else group  # treat a missing group as 1
+
+    # weight, shaped [in_shape[1], kernel_output_channel // group, *kernel_shape], random values
+    w_shape = []
+    w_shape.append(in_shape[1])
+    w_shape.append(kernel_output_channel // group)
+    w_shape.extend(kernel_shape)
+    weight = helper.make_tensor(
+        'weight',
+        TensorProto.FLOAT,
+        dims=w_shape,
+        vals=np.random.rand(*w_shape).astype(np.float32).flatten().tolist()
+    )
+    inputs.append('weight')
+    initializers.append(weight)
+
+    # bias (optional third node input, random values)
+    if bias_shape is not None:
+        bias = helper.make_tensor(
+            'bias',
+            TensorProto.FLOAT,
+            dims=bias_shape,
+            vals=np.random.rand(*bias_shape).astype(np.float32).flatten().tolist()
+        )
+        inputs.append('bias')
+        initializers.append(bias)
+
+    # dilation: fallback value used only for the output-shape arithmetic below
+    d = [1] if dilation is None else dilation
+
+    # output_padding: fallback value used only for the output-shape arithmetic below
+    out_padding = [0] if output_padding is None else output_padding
+
+    # stride: fallback value used only for the output-shape arithmetic below
+    s = [1] if stride is None else stride
+
+    # output: [in_shape[0], w_shape[1] * group, <length computed per padding mode>]
+    out_shape = []
+    out_shape.append(in_shape[0])
+    out_shape.append(w_shape[1] * group)
+
+    # pad: choose the output length according to the padding mode
+    padding = [0, 0]
+    if auto_pad_mode in [None, 'NOTSET'] and pad is not None:
+        padding = pad
+        out_shape.append(s[0] * (in_shape[2] - 1) + out_padding[0] +
+                         (w_shape[2] - 1) * d[0] + 1 - padding[0] - padding[1])
+    elif auto_pad_mode in ['SAME_UPPER', 'SAME_LOWER']:
+        out_shape.append(in_shape[2] * s[0])
+    else:  # NOTE(review): reached for 'VALID' and for None/NOTSET without pads; differs from the explicit-pads formula with zero padding - confirm
+        out_shape.append(in_shape[2] + (in_shape[2] - 1) * (s[0] - 1) - w_shape[2] + 1)
+
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape)
+
+    attributes_dict = {}  # node attributes; each one is attached only when its guard below holds
+
+    if auto_pad_mode is not None:
+        attributes_dict['auto_pad'] = auto_pad_mode
+
+    if dilation is not None:
+        attributes_dict['dilations'] = dilation
+
+    if group is not None:  # always true here: group was replaced by an int above
+        attributes_dict['group'] = group
+
+    if kernel_shape is not None:
+        attributes_dict['kernel_shape'] = kernel_shape
+
+    if output_padding is not None:
+        attributes_dict['output_padding'] = output_padding
+
+    if pad is not None:
+        attributes_dict['pads'] = padding  # equals `pad` only in the None/NOTSET branch above, otherwise still [0, 0]
+
+    if stride is not None:
+        attributes_dict['strides'] = stride
+
+    node = onnx.helper.make_node(
+        'ConvTranspose',
+        inputs=inputs,
+        outputs=['output'],
+        **attributes_dict
+    )
+
+    nodes = []
+    nodes.append(node)
+
+    graph_def = helper.make_graph(
+        nodes,
+        'test-model',
+        [input],
+        [output],
+        initializer=initializers)
+
+    model_def = helper.make_model(graph_def, producer_name='kendryte')
+
+    return model_def
+
+
+in_shapes = [  # rank-3 input; _make_module reads index 1 for the weight and index 2 for the length
+    [1, 3, 16]
+]
+
+kernel_output_channels = [  # divided by `group` to size the weight's second dimension
+    3
+]
+
+bias_shapes = [  # None means the node gets no bias input
+    None,
+]
+bias_shapes.extend(list([[x] for x in kernel_output_channels]))  # one 1-D bias per output-channel count
+
+auto_pad_modes = [  # NOTE(review): the test needs pad=None for SAME_*/VALID, which `pads` never supplies
+    None,
+    'NOTSET',
+    'SAME_UPPER',
+    'SAME_LOWER',
+    'VALID'
+]
+
+dilations = [  # only the "attribute omitted" case is exercised
+    None,
+]
+
+groups = [  # None is turned into 1 by _make_module
+    None,
+    3
+]
+
+kernel_shapes = [  # 1-D kernel length
+    [3],
+]
+
+output_paddings = [  # only the "attribute omitted" case is exercised
+    None,
+]
+
+pads = [  # two values: subtracted as padding[0] and padding[1] in the output-length formula
+    # None,
+    [1, 1],
+]
+
+strides = [  # None omits the attribute; _make_module then computes shapes with stride 1
+    None,
+    [2],
+    [3],
+]
+
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('kernel_output_channel', kernel_output_channels)
+@pytest.mark.parametrize('bias_shape', bias_shapes)
+@pytest.mark.parametrize('auto_pad_mode', auto_pad_modes)
+@pytest.mark.parametrize('dilation', dilations)
+@pytest.mark.parametrize('group', groups)
+@pytest.mark.parametrize('kernel_shape', kernel_shapes)
+@pytest.mark.parametrize('output_padding', output_paddings)
+@pytest.mark.parametrize('pad', pads)
+@pytest.mark.parametrize('stride', strides)
+def test_conv_transpose1d(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, output_padding, pad, stride, request):
+    """Build one 1-D ConvTranspose model and run it; inconsistent parameter mixes are silently not run."""
+    if (bias_shape is None or bias_shape[0] == kernel_output_channel) and ((auto_pad_mode in [None, 'NOTSET'] and pad is not None) or (auto_pad_mode in ['SAME_UPPER', 'SAME_LOWER', 'VALID'] and pad is None)) and (dilation is None or auto_pad_mode in [None, 'NOTSET']) and (output_padding is None or stride is not None):
+        model_def = _make_module(in_shape, kernel_output_channel, bias_shape,
+                                 auto_pad_mode, dilation, group, kernel_shape, output_padding, pad, stride)
+        runner = OnnxTestRunner(request.node.name, ['k510'])
+        model_file = runner.from_onnx_helper(model_def)
+        runner.run(model_file)
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_conv_transpose1d.py'])
diff --git a/tests/importer/onnx_/basic/test_gather_elements.py b/tests/importer/onnx_/basic/test_gather_elements.py
new file mode 100644
index 0000000000..e4887b6b8d
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_gather_elements.py
@@ -0,0 +1,91 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""System test: test gather"""
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+
+def _make_module(in_shape, index, axis):
+    """Build a one-node GatherElements ONNX model.
+
+    `index` (nested list) becomes an INT32 initializer named 'indices'; the graph
+    output has the same shape as `index`. A None `axis` is emitted as axis=0.
+    """
+    # convert the nested index list once and reuse its shape
+    index_array = np.array(index)
+    index_shape = index_array.shape
+
+    # graph input / output descriptions
+    data_info = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
+    result_info = helper.make_tensor_value_info('output', TensorProto.FLOAT, index_shape)
+
+    # constant indices
+    indices_tensor = helper.make_tensor(
+        'indices',
+        TensorProto.INT32,
+        index_shape,
+        index_array.astype(np.int32).flatten())
+
+    # the axis attribute is always written
+    gather_node = onnx.helper.make_node(
+        'GatherElements',
+        inputs=['input', 'indices'],
+        outputs=['output'],
+        axis=0 if axis is None else axis
+    )
+
+    # graph
+    graph_def = helper.make_graph(
+        [gather_node],
+        'test-model',
+        [data_info],
+        [result_info],
+        initializer=[indices_tensor]
+    )
+
+    return helper.make_model(graph_def, producer_name='onnx')
+
+
+# each case is a tuple: (input_shape, indices_data, axis)
+# constraint kept by every case: input_shape[i] >= indices_data.shape[i]
+in_shapes_indices_dim = [
+    ([2, 2], [[0, 0], [1, 0]], 0),
+    ([2, 3], [[0, 2], [1, 0]], 1),
+    ([2, 3], [[0, -2], [-1, 0]], 1),  # negative index values
+    ([1, 3, 3], [[[1, 2, 0], [2, 0, 0]]], 2),
+    ([1, 3, 3], [[[1, 2, 0], [2, 0, 0]]], 1),
+    ([4, 2, 3], [[[1, 2, 0], [2, 0, 0]], [[3, 2, 1], [2, 3, 1]]], 0),
+    ([1, 5, 3], [[[1, 2, 0], [2, 0, 0]]], 1),
+    ([2, 5, 6], [[[1, 2, 0], [2, 0, 0]], [[3, 2, 1], [2, 3, 1]]], 2),
+
+]
+
+
+@pytest.mark.parametrize('in_shape, indices, axis', in_shapes_indices_dim)
+def test_gather_elements(in_shape, indices, axis, request):
+    """Build the GatherElements model for this case and run it through OnnxTestRunner."""
+    gather_model = _make_module(in_shape, indices, axis)
+    test_runner = OnnxTestRunner(request.node.name)
+    test_runner.run(test_runner.from_onnx_helper(gather_model))
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_gather_elements.py'])
diff --git a/tests/importer/onnx_/basic/test_gather_nd.py b/tests/importer/onnx_/basic/test_gather_nd.py
index 26981f1929..c1f9f9d478 100644
--- a/tests/importer/onnx_/basic/test_gather_nd.py
+++ b/tests/importer/onnx_/basic/test_gather_nd.py
@@ -53,7 +53,10 @@ def _make_module(in_shape, indices, batch_dims):
         initializer=initializers
     )
 
-    return helper.make_model(graph_def, producer_name='kendryte')
+    # todo: support other opset
+    op = onnx.OperatorSetIdProto()
+    op.version = 12
+    return helper.make_model(graph_def, producer_name='kendryte', opset_imports=[op])
 
 
 in_shapes_indices_dim = [
diff --git a/tests/importer/onnx_/basic/test_gru.py b/tests/importer/onnx_/basic/test_gru.py
new file mode 100644
index 0000000000..0c04c0a658
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_gru.py
@@ -0,0 +1,222 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+
+
+def _make_module(direction, hidden_size, seq_length, batch_size, input_size, bias, sequence_lens, initial_h, Y, Y_h,
+                 LBR):  # builds a one-node GRU ONNX model; None arguments leave that input/output out
+    nodes_inputs = []
+    nodes_outputs = []
+    initializers = []
+    attributes_dict = {}
+    nodes = []
+    graph_inputs = []
+    graph_outputs = []
+
+    num_directions = 2 if direction == 'bidirectional' else 1  # leading dimension of W, R, B, initial_h and of Y_h
+    if direction is not None:
+        attributes_dict['direction'] = direction
+    attributes_dict['hidden_size'] = hidden_size
+    attributes_dict['linear_before_reset'] = LBR
+
+    # input: the only graph input, shaped [seq_length, batch_size, input_size]
+    input_shape = [seq_length, batch_size, input_size]
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, input_shape)
+    nodes_inputs.append('input')
+    graph_inputs.append(input)
+
+    w_shape = [num_directions, 3 * hidden_size, input_size]
+    w_tensor = helper.make_tensor(
+        'W',
+        TensorProto.FLOAT,
+        dims=w_shape,
+        vals=(np.random.rand(*w_shape) * 2 - 1).astype(np.float32).flatten().tolist()  # uniform in [-1, 1)
+    )
+    nodes_inputs.append('W')
+    initializers.append(w_tensor)
+
+    r_shape = [num_directions, 3 * hidden_size, hidden_size]
+    r_tensor = helper.make_tensor(
+        'R',
+        TensorProto.FLOAT,
+        dims=r_shape,
+        vals=(np.random.rand(*r_shape) * 2 - 1).astype(np.float32).flatten().tolist()  # uniform in [-1, 1)
+    )
+    nodes_inputs.append('R')
+    initializers.append(r_tensor)
+
+    # bias: any non-None value enables a random B initializer
+    if bias is None:
+        nodes_inputs.append('')  # empty name keeps the positional slot while leaving the input out
+    else:
+        bias_shape = [num_directions, 6 * hidden_size]
+        bias_tensor = helper.make_tensor(
+            'B',
+            TensorProto.FLOAT,
+            dims=bias_shape,
+            vals=(np.random.rand(*bias_shape) * 2 - 1).astype(np.float32).flatten().tolist()
+        )
+        nodes_inputs.append('B')
+        initializers.append(bias_tensor)
+
+    if sequence_lens is None:
+        nodes_inputs.append('')
+    else:
+        sequence_lens_shape = [batch_size]
+        sequence_lens_tensor = helper.make_tensor(
+            'sequence_lens',
+            TensorProto.INT32,
+            dims=sequence_lens_shape,
+            vals=np.full(sequence_lens_shape, seq_length).flatten().tolist()  # every batch entry uses the full seq_length
+        )
+        nodes_inputs.append('sequence_lens')
+        initializers.append(sequence_lens_tensor)
+
+    if initial_h is None:
+        nodes_inputs.append('')
+    else:
+        initial_h_shape = [num_directions, batch_size, hidden_size]
+        initial_h_tensor = helper.make_tensor(
+            'initial_h',
+            TensorProto.FLOAT,
+            dims=initial_h_shape,
+            vals=np.random.rand(*initial_h_shape).astype(np.float32).flatten().tolist()
+        )
+        nodes_inputs.append('initial_h')
+        initializers.append(initial_h_tensor)
+
+    # output: Y and Y_h are each requested only when the matching argument is not None
+    if Y is None:
+        nodes_outputs.append('')
+    else:
+        output_shape = [seq_length, num_directions, batch_size, hidden_size]
+        output = helper.make_tensor_value_info('Y', TensorProto.FLOAT, output_shape)
+        nodes_outputs.append('Y')
+        graph_outputs.append(output)
+
+    if Y_h is None:
+        nodes_outputs.append('')
+    else:
+        h_shape = [num_directions, batch_size, hidden_size]
+        y_h = helper.make_tensor_value_info('Y_h', TensorProto.FLOAT, h_shape)
+        nodes_outputs.append('Y_h')
+        graph_outputs.append(y_h)
+
+    # gru node
+    node = onnx.helper.make_node(
+        'GRU',
+        inputs=nodes_inputs,
+        outputs=nodes_outputs,
+        **attributes_dict
+    )
+    nodes.append(node)
+
+    # graph
+    graph_def = helper.make_graph(
+        nodes,
+        'test-model',
+        graph_inputs,
+        graph_outputs,
+        initializer=initializers
+    )
+
+    model_def = helper.make_model(graph_def, producer_name='onnx')
+
+    return model_def
+
+
+directions = [  # None leaves the `direction` attribute off the node
+    None,
+    'forward',
+    'reverse',
+    'bidirectional'
+]
+
+hidden_sizes = [
+    32,
+]
+
+seq_lengths = [
+    4,
+]
+
+batch_sizes = [
+    16,
+]
+
+input_sizes = [
+    64,
+]
+
+biases = [  # flag: None omits the B input, any other value makes _make_module generate it
+    None,
+    1
+]
+
+sequence_lenses = [  # only the "input omitted" case is exercised
+    None,
+]
+
+initial_hs = [  # flag: None omits initial_h, any other value makes _make_module generate it
+    None,
+    1
+]
+
+Ys = [  # flag: whether the Y output is requested
+    # None,  (disabled: at least one output must be requested)
+    1
+]
+
+Y_hs = [  # flag: whether the Y_h output is requested
+    None,
+    1
+]
+
+LBRs = [  # value of the linear_before_reset attribute
+    0,
+    1
+]
+
+
+@pytest.mark.parametrize('direction', directions)
+@pytest.mark.parametrize('hidden_size', hidden_sizes)
+@pytest.mark.parametrize('seq_length', seq_lengths)
+@pytest.mark.parametrize('batch_size', batch_sizes)
+@pytest.mark.parametrize('input_size', input_sizes)
+@pytest.mark.parametrize('bias', biases)
+@pytest.mark.parametrize('sequence_lens', sequence_lenses)
+@pytest.mark.parametrize('initial_h', initial_hs)
+@pytest.mark.parametrize('Y', Ys)
+@pytest.mark.parametrize('Y_h', Y_hs)
+@pytest.mark.parametrize('LBR', LBRs)
+def test_gru(direction, hidden_size, seq_length, batch_size, input_size, bias, sequence_lens, initial_h, Y, Y_h, LBR,
+             request):
+    """Build the GRU model for this parameter combination and run it through OnnxTestRunner."""
+    gru_model = _make_module(direction, hidden_size, seq_length, batch_size, input_size,
+                             bias, sequence_lens, initial_h, Y, Y_h, LBR)
+    test_runner = OnnxTestRunner(request.node.name)
+    onnx_file = test_runner.from_onnx_helper(gru_model)
+    test_runner.run(onnx_file)
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_gru.py'])
diff --git a/tests/importer/onnx_/basic/test_layer_norm.py b/tests/importer/onnx_/basic/test_layer_norm.py
new file mode 100644
index 0000000000..63f611f920
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_layer_norm.py
@@ -0,0 +1,103 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+from importlib import import_module
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+
+def _make_module(in_shape, axis, epsilon):
+    """Build a one-node LayerNormalization model (opset 17) with random scale and bias.
+
+    Args:
+        in_shape: shape of the float input; the output uses the same shape.
+        axis: value of the `axis` attribute, or None to leave it off the node.
+        epsilon: value of the `epsilon` attribute, or None to leave it off the node.
+
+    Returns:
+        The ONNX model; `scale` and `bias` are initializers shaped `in_shape[axis:]`
+        (`in_shape[-1:]` when `axis` is None).
+    """
+    # input and output share `in_shape`
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, in_shape)
+
+    # scale and bias: random tensors over the trailing dimensions starting at the axis
+    actual_axis = -1 if axis is None else axis
+    norm_shape = in_shape[actual_axis:]
+    initializers = [
+        helper.make_tensor(name,
+                           TensorProto.FLOAT,
+                           dims=norm_shape,
+                           vals=np.random.randn(*norm_shape).astype(np.float32).flatten().tolist())
+        for name in ("scale", "bias")
+    ]
+
+    # one make_node call for all four axis/epsilon combinations
+    attributes_dict = {}
+    if axis is not None:
+        attributes_dict['axis'] = axis
+    if epsilon is not None:
+        attributes_dict['epsilon'] = epsilon
+
+    node = onnx.helper.make_node('LayerNormalization',
+                                 inputs=['input', 'scale', 'bias'],
+                                 outputs=['output'],
+                                 **attributes_dict)
+
+    graph_def = helper.make_graph([node], 'test-model', [input], [output], initializer=initializers)
+    # stamp the model with opset 17
+    op = onnx.OperatorSetIdProto()
+    op.version = 17
+    model_def = helper.make_model(graph_def, producer_name='onnx', opset_imports=[op])
+
+    return model_def
+
+
+in_shapes = [  # shape shared by the graph input and output
+    [1, 24, 256]
+]
+
+axes = [  # None omits the attribute; _make_module then shapes scale/bias as in_shape[-1:]
+    None,
+    -1,
+    2,
+    1,
+    0
+]
+
+epsilons = [  # None omits the epsilon attribute
+    None,
+    1e-2
+]
+
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('axis', axes)
+@pytest.mark.parametrize('epsilon', epsilons)
+def test_layer_norm(in_shape, axis, epsilon, request):
+    """Build the LayerNormalization model for this case and run it through OnnxTestRunner."""
+    layer_norm_model = _make_module(in_shape, axis, epsilon)
+    test_runner = OnnxTestRunner(request.node.name)
+    onnx_file = test_runner.from_onnx_helper(layer_norm_model)
+    test_runner.run(onnx_file)
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_layer_norm.py'])
diff --git a/tests/importer/onnx_/basic/test_pool.py b/tests/importer/onnx_/basic/test_pool.py
index 7336279a1e..30ac7ae95c 100644
--- a/tests/importer/onnx_/basic/test_pool.py
+++ b/tests/importer/onnx_/basic/test_pool.py
@@ -70,7 +70,6 @@ def forward(self, x):
     True
 ]
 
-
 @pytest.mark.parametrize('in_shape', in_shapes)
 @pytest.mark.parametrize('kernel_size', kernel_sizes)
 @pytest.mark.parametrize('stride', strides)
diff --git a/tests/importer/onnx_/basic/test_pool2.py b/tests/importer/onnx_/basic/test_pool2.py
new file mode 100644
index 0000000000..6e57c2c93b
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_pool2.py
@@ -0,0 +1,111 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+import math
+
+
+def _make_module(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode):
+    """Build an opset-11 ONNX model containing a single MaxPool node.
+
+    in_shape is a list whose axes 2 and 3 are pooled; padding is passed as the
+    node's `pads` attribute and read as (begin_2, begin_3, end_2, end_3).
+    count_include_pad is accepted for the caller's signature but is not used.
+    """
+    def _pooled(extent, pad_begin, pad_end, kernel, step):
+        # Output extent of one pooled axis: floor division unless ceil_mode is set.
+        span = extent + pad_begin + pad_end - kernel
+        return span // step + 1 if ceil_mode == 0 else math.ceil(span / step) + 1
+
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
+
+    out_shape = in_shape.copy()
+    out_shape[2] = _pooled(in_shape[2], padding[0], padding[2], kernel_size[0], stride[0])
+    out_shape[3] = _pooled(in_shape[3], padding[1], padding[3], kernel_size[1], stride[1])
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape)
+
+    node = onnx.helper.make_node(
+        'MaxPool',
+        inputs=['input'],
+        outputs=['output'],
+        kernel_shape=kernel_size,
+        strides=stride,
+        ceil_mode=ceil_mode,
+        pads=padding)
+
+    graph_def = helper.make_graph(
+        [node],
+        'test-model',
+        [input],
+        [output],
+        initializer=[])
+
+    op = onnx.OperatorSetIdProto()
+    op.version = 11
+    return helper.make_model(graph_def, producer_name='kendryte', opset_imports=[op])
+
+
+in_shapes = [
+    [1, 3, 60, 72],
+]
+
+kernel_sizes = [
+    (3, 3),
+]
+
+strides = [
+    (1, 1),
+    (2, 2),
+    [2, 1]
+]
+
+paddings = [
+    (0, 0, 0, 0),
+    (1, 1, 1, 1),
+    (1, 1, 1, 2)
+]
+
+count_include_pads = [
+    False,
+    True
+]
+
+ceil_modes = [
+    False,
+    True
+]
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('kernel_size', kernel_sizes)
+@pytest.mark.parametrize('stride', strides)
+@pytest.mark.parametrize('padding', paddings)
+@pytest.mark.parametrize('count_include_pad', count_include_pads)
+@pytest.mark.parametrize('ceil_mode', ceil_modes)
+def test_pool2(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode, request):
+    """Run one MaxPool case; cases whose begin padding is not below half the kernel do nothing."""
+    if not (kernel_size[0] / 2 > padding[0] and kernel_size[1] / 2 > padding[1]):
+        return
+    module = _make_module(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode)
+    runner = OnnxTestRunner(request.node.name)
+    runner.run(runner.from_onnx_helper(module))
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_pool2.py'])
diff --git a/tests/importer/onnx_/basic/test_reduce.py b/tests/importer/onnx_/basic/test_reduce.py
index de48be47c2..3ef1c0906f 100644
--- a/tests/importer/onnx_/basic/test_reduce.py
+++ b/tests/importer/onnx_/basic/test_reduce.py
@@ -63,7 +63,12 @@ def _make_module(in_type, in_shape, reduce_op, axes, keepdims):
         [output],
         initializer=initializers)
 
-    model_def = helper.make_model(graph_def, producer_name='onnx')
+    if reduce_op=='ReduceSum':
+        op = onnx.OperatorSetIdProto()
+        op.version = 11
+        model_def = helper.make_model(graph_def, producer_name='onnx', opset_imports=[op])
+    else:
+        model_def = helper.make_model(graph_def, producer_name='onnx')
     return model_def
 
 
@@ -80,6 +85,7 @@ def _make_module(in_type, in_shape, reduce_op, axes, keepdims):
     'ReduceMax',
     'ReduceMean',
     'ReduceMin',
+    'ReduceSum'
 ]
 
 axes_list = [
diff --git a/tests/importer/onnx_/basic/test_roi_align.py b/tests/importer/onnx_/basic/test_roi_align.py
index c32c2e8597..61088cc074 100644
--- a/tests/importer/onnx_/basic/test_roi_align.py
+++ b/tests/importer/onnx_/basic/test_roi_align.py
@@ -21,7 +21,7 @@
 import numpy as np
 import copy
 
-def _make_module(in_shape, rois, batch_indices, mode, output_height, output_width, sampling_ratio, spatial_scale):
+def _make_module(in_shape, rois, batch_indices, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version):
     inputs = []
     outputs = []
     initializers = []
@@ -54,7 +54,7 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt
     inputs.append('batch_indices')
 
     # output
-    out_shape = [rois_array.shape[0], in_shape[1], output_height, output_width]
+    out_shape = [rois_array.shape[0], in_shape[1], output_height if output_height is not None else 1, output_width if output_width is not None else 1]
     output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape)
     outputs.append('output')
 
@@ -95,7 +95,9 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt
         initializer=initializers
     )
 
-    model_def = helper.make_model(graph_def, producer_name='onnx')
+    op = onnx.OperatorSetIdProto()
+    op.version = op_version
+    model_def = helper.make_model(graph_def, producer_name='onnx helper', opset_imports=[op])
 
     return model_def
 
@@ -137,6 +139,10 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt
     1.0
 ]
 
+op_versions = [
+    10
+]
+
 @pytest.mark.parametrize('in_shape', in_shapes)
 @pytest.mark.parametrize('roi', rois)
 @pytest.mark.parametrize('batch_index', batch_indices)
@@ -145,8 +151,9 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt
 @pytest.mark.parametrize('output_width', output_widths)
 @pytest.mark.parametrize('sampling_ratio', sampling_ratios)
 @pytest.mark.parametrize('spatial_scale', spatial_scales)
-def test_roi_align(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, request):
-    model_def = _make_module(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale)
+@pytest.mark.parametrize('op_version', op_versions)
+def test_roi_align(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version, request):
+    model_def = _make_module(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version)
 
     runner = OnnxTestRunner(request.node.name)
     model_file = runner.from_onnx_helper(model_def)
diff --git a/tests/importer/onnx_/basic/test_slice.py b/tests/importer/onnx_/basic/test_slice.py
index 3dc04e46fe..45c86684c3 100644
--- a/tests/importer/onnx_/basic/test_slice.py
+++ b/tests/importer/onnx_/basic/test_slice.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 
-def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_format):
+def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_format, attribute_dtype):
     input_names = []
     output_names = []
     inputs = []
@@ -52,7 +52,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f
         # starts
         start_tensor = helper.make_tensor(
             'starts',
-            TensorProto.INT64,
+            attribute_dtype,
             dims=[len(start)],
             vals=start
         )
@@ -73,7 +73,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f
         # ends
         end_tensor = helper.make_tensor(
             'ends',
-            TensorProto.INT64,
+            attribute_dtype,
             dims=[len(end)],
             vals=end
         )
@@ -95,7 +95,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f
         if axes is not None:
             axes_tensor = helper.make_tensor(
                 'axes',
-                TensorProto.INT64,
+                attribute_dtype,
                 dims=[len(end)],
                 vals=axes
             )
@@ -117,7 +117,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f
         if step is not None:
             step_tensor = helper.make_tensor(
                 'steps',
-                TensorProto.INT64,
+                attribute_dtype,
                 dims=[len(step)],
                 vals=step
             )
@@ -183,16 +183,22 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f
     [13, 'constant']
 ]
 
+attribute_dtypes = [
+    TensorProto.INT64,
+    TensorProto.INT32
+]
+
 
 @pytest.mark.parametrize('in_shape', in_shapes)
 @pytest.mark.parametrize('start_end_axes_step_outshape', starts_ends_axes_steps_outshapes)
 @pytest.mark.parametrize('op_versions_and_value_format', op_versions_and_value_formats)
-def test_slice(in_shape, start_end_axes_step_outshape, op_versions_and_value_format, request):
+@pytest.mark.parametrize('attribute_dtype', attribute_dtypes)
+def test_slice(in_shape, start_end_axes_step_outshape, op_versions_and_value_format, attribute_dtype, request):
     start, end, axes, step, outshape = start_end_axes_step_outshape
     op_version, value_format = op_versions_and_value_format
-    if op_version != 1 or (op_version == 1 and step is not None and all([x == 1 for x in step])):
+    if op_version != 1 or (op_version == 1 and step is not None and all([x == 1 for x in step]) and attribute_dtype == TensorProto.INT64):
         model_def = _make_module(in_shape, start, end, axes, step,
-                                 outshape, op_version, value_format)
+                                 outshape, op_version, value_format, attribute_dtype)
 
         runner = OnnxTestRunner(request.node.name)
         model_file = runner.from_onnx_helper(model_def)
diff --git a/tests/importer/onnx_/basic/test_threadholdrelu.py b/tests/importer/onnx_/basic/test_threadholdrelu.py
new file mode 100644
index 0000000000..16a7e2d8c9
--- /dev/null
+++ b/tests/importer/onnx_/basic/test_threadholdrelu.py
@@ -0,0 +1,101 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+import numpy as np
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+
+
+def _make_module(in_shape, alpha):
+    """Build an ONNX model computing ThresholdedRelu(Mul(input, input2)).
+
+    'input2' is a constant initializer of shape in_shape with values drawn from
+    np.random.rand(...) + 2. When alpha is None the attribute is omitted from
+    the node, leaving the choice of alpha to the operator's default.
+    """
+    inputs = []
+    outputs = []
+    initializers = []
+    attributes_dict = {}
+    nodes = []
+
+    # Graph input and output are declared with the same shape.
+    input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape)
+    inputs.append('input')
+
+    output = helper.make_tensor_value_info('output', TensorProto.FLOAT, in_shape)
+    outputs.append('output')
+
+    # Only pass alpha explicitly when the test case provides one.
+    if alpha is not None:
+        attributes_dict['alpha'] = alpha
+
+    # Constant multiplier; it is registered as an initializer, not a graph input.
+    tensor = helper.make_tensor(
+        'input2',
+        TensorProto.FLOAT,
+        dims=in_shape,
+        vals=(np.random.rand(*in_shape) + 2).astype(onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[TensorProto.FLOAT]).flatten().tolist()
+    )
+    initializers.append(tensor)
+
+    # Mul node: multiplies the input by the constant and writes tensor '0'.
+    node = onnx.helper.make_node(
+        'Mul',
+        inputs=[inputs[0], 'input2'],
+        outputs=['0'],
+    )
+    nodes.append(node)
+
+    # ThresholdedRelu node: consumes tensor '0' and produces the graph output.
+    node = onnx.helper.make_node(
+        'ThresholdedRelu',
+        inputs=['0'],
+        outputs=outputs,
+        **attributes_dict
+    )
+    nodes.append(node)
+
+    graph_def = helper.make_graph(nodes, 'test-model', [input], [output], initializer=initializers)
+
+    model_def = helper.make_model(graph_def, producer_name='onnx')
+
+    return model_def
+
+
+in_shapes = [
+    [1, 3, 16, 16]
+]
+
+alphas = [
+    None,
+    0.5,
+    1.5
+]
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('alpha', alphas)
+def test_threadholdrelu(in_shape, alpha, request):
+    """Run the Mul + ThresholdedRelu model for one (in_shape, alpha) case through OnnxTestRunner."""
+    relu_model = _make_module(in_shape, alpha)
+    test_runner = OnnxTestRunner(request.node.name)
+    onnx_file = test_runner.from_onnx_helper(relu_model)
+    test_runner.run(onnx_file)
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_threadholdrelu.py'])
diff --git a/tests/importer/onnx_/basic/test_unary.py b/tests/importer/onnx_/basic/test_unary.py
index 1137dd0a06..bf470172b4 100644
--- a/tests/importer/onnx_/basic/test_unary.py
+++ b/tests/importer/onnx_/basic/test_unary.py
@@ -13,12 +13,137 @@
 # limitations under the License.
 # pylint: disable=invalid-name, unused-argument, import-outside-toplevel
 
+import sys
 import pytest
-import torch
-# import test_util
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
 from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+def _make_module(op, in_type, in_shape):
+    """Build an ONNX model whose graph is a single unary `op` node.
+
+    Args:
+        op: ONNX operator type name passed to make_node.
+        in_type: TensorProto element type used for both input and output.
+        in_shape: shape used for both the input and the output tensor.
+
+    Returns:
+        The model produced by helper.make_model.
+    """
+    input_name = 'input1'
+    output_name = 'output'
+
+    # Input and output are declared with identical type and shape.
+    input1 = helper.make_tensor_value_info(input_name, in_type, in_shape)
+    output = helper.make_tensor_value_info(output_name, in_type, in_shape)
+
+    # No attributes are set on the node.
+    unary_node = onnx.helper.make_node(
+        op,
+        inputs=[input_name],
+        outputs=[output_name],
+    )
+
+    graph_def = helper.make_graph(
+        [unary_node],
+        'test-model',
+        [input1],
+        [output],
+        initializer=None)
+    return helper.make_model(graph_def, producer_name='onnx')
 
 
+in_shapes = [
+    [16],
+    [1, 3, 16, 16]
+]
+
+# calc operators
+ops = [
+    # 'Rsqrt', 'Square'  # these 2 operators are not supported yet
+    'Ceil',  
+    'Floor',
+    'Round',
+    'Sqrt',
+    'Tanh',
+    'Erf',
+    'Abs',
+    'Acos',
+    'Asin',
+    'Exp',
+    'Log',
+    'Neg',
+    'Sign',
+    'Sin',
+    'Cos',
+]
+
+# calc operators data type
+in_types = [
+    TensorProto.FLOAT,
+    # TensorProto.INT32,  // Not supported at present
+    # TensorProto.INT8,   // Not supported at present
+    # TensorProto.INT64,  // Not supported at present
+]
+
+# logical operators
+logical_ops = [
+    'Not'
+]
+
+# logical operators data type
+logical_types = [
+    TensorProto.BOOL
+]
+
+# operators and types group
+op_type_pairs = [
+    [logical_ops, logical_types], 
+    [ops, in_types]
+]
+
+def get_case_data(in_datas):
+    """Expand [ops, types] groups into a flat list of [op, type] pairs.
+
+    Args:
+        in_datas: iterable of groups where index 0 holds operator names and
+            index 1 holds the types to combine with each of those operators.
+
+    Returns:
+        A list of [op, type] lists: every op of a group paired with every type
+        of that same group, in group order, then op order, then type order.
+    """
+    return [[_op, _type]
+            for group in in_datas for _op in group[0] for _type in group[1]]
+
+class TestUnaryModule(object):
+    """Parametrized ONNX unary-operator tests.
+
+    Each case is an (op, in_type) pair from get_case_data(op_type_pairs)
+    crossed with every shape in in_shapes.
+    No class-level setup or teardown hooks are needed, so none are defined,
+    and nothing is printed while the module is being collected.
+    """
+
+    # Evaluated once at class-definition time; consumed by parametrize below.
+    case_data = get_case_data(op_type_pairs)
+
+    @pytest.mark.parametrize('in_shape', in_shapes)
+    @pytest.mark.parametrize('op, in_type', case_data)
+    def test_unary(self, op, in_type, in_shape, request):
+        """Build the single-op model for this case and run it through OnnxTestRunner."""
+        model_def = _make_module(op, in_type, in_shape)
+        runner = OnnxTestRunner(request.node.name)
+        model_file = runner.from_onnx_helper(model_def)
+        runner.run(model_file)
+    
+'''
+import pytest
+import torch
+import test_util
+from onnx_test_runner import OnnxTestRunner
 def _make_module():
     class UnaryModule(torch.nn.Module):
         def __init__(self):
@@ -39,17 +164,10 @@ def forward(self, x):
             outs.append(torch.sin(x))
             outs.append(torch.sqrt(x + 2))
             outs.append(torch.tanh(x))
+            outs.append(torch.rsqrt(x + 2))
             return outs
 
     return UnaryModule()
-
-
-in_shapes = [
-    [16],
-    [1, 3, 16, 16]
-]
-
-
 @pytest.mark.parametrize('in_shape', in_shapes)
 def test_unary(in_shape, request):
     module = _make_module()
@@ -57,7 +175,7 @@ def test_unary(in_shape, request):
     runner = OnnxTestRunner(request.node.name)
     model_file = runner.from_torch(module, in_shape)
     runner.run(model_file)
-
+'''
 
 if __name__ == "__main__":
-    pytest.main(['-vv', 'test_unary.py'])
+    pytest.main(['-v', 'test_unary.py'])
diff --git a/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py
new file mode 100644
index 0000000000..1d8d0e5d66
--- /dev/null
+++ b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py
@@ -0,0 +1,100 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import onnx
+from onnx import helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from onnx_test_runner import OnnxTestRunner
+import numpy as np
+
+def _make_module(op, in_type, in_shape_0, in_shape_1):
+    """Build an ONNX model with one binary `op` node over two operands.
+
+    'input1' (shape in_shape_0) is the only graph input; 'input2' (shape
+    in_shape_1) is a constant initializer whose values lie in [2, 3) before
+    the cast to in_type. The declared output shape is whatever NumPy yields
+    when adding arrays of the two operand shapes.
+    """
+    np_dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[in_type]
+
+    # input1
+    input1 = helper.make_tensor_value_info('input1', in_type, in_shape_0)
+
+    # set input2 to avoid SIGFPE for div op.
+    rhs_values = (np.random.rand(*in_shape_1) + 2).astype(np_dtype)
+    tensor = helper.make_tensor(
+        'input2',
+        in_type,
+        dims=in_shape_1,
+        vals=rhs_values.flatten().tolist()
+    )
+
+    # output: let NumPy derive the result shape from two throwaway arrays
+    lhs_probe = np.random.randn(*in_shape_0)
+    rhs_probe = np.random.randn(*in_shape_1)
+    output_shape = np.add(lhs_probe, rhs_probe).shape
+    output = helper.make_tensor_value_info('output', in_type, output_shape)
+
+    # The node carries no attributes.
+    node = onnx.helper.make_node(
+        op,
+        inputs=['input1', 'input2'],
+        outputs=['output'],
+    )
+
+    graph_def = helper.make_graph(
+        [node],
+        'test-model',
+        [input1],
+        [output],
+        initializer=[tensor])
+
+    model_def = helper.make_model(graph_def, producer_name='onnx')
+
+    return model_def
+
+ops = [
+    'Add',
+]
+
+in_types = [
+    TensorProto.FLOAT,
+]
+
+in_shapes = [
+    [[1, 3, 4, 5, 2], [1]],   
+    [[4, 3, 4, 5, 2], [2]],   
+    [[1, 3, 4, 5, 2], [1, 3, 4, 1, 1]],   
+    [[1, 3, 16, 16, 2], [1, 1, 1, 16, 1]],
+    [[1, 3, 16, 16, 2], [1, 3, 1, 16, 1]],
+    [[2, 3, 16, 16, 2], [2, 1, 16, 1, 2]],
+    [[1, 3, 16, 16, 2, 3], [1, 3, 1, 16, 1, 1]],
+    [[1, 3, 16, 16, 2, 3], [1, 3, 1, 16, 2, 1]],
+]
+
+@pytest.mark.parametrize('op', ops)
+@pytest.mark.parametrize('in_type', in_types)
+@pytest.mark.parametrize('in_shape', in_shapes)
+def test_squeeze_binary(op, in_type, in_shape, request):
+    """Run one binary-op case; in_shape is indexed as [lhs_shape, rhs_shape]."""
+    lhs_shape, rhs_shape = in_shape[0], in_shape[1]
+    model_def = _make_module(op, in_type, lhs_shape, rhs_shape)
+    runner = OnnxTestRunner(request.node.name, ['cpu', 'k210', 'k510'])
+    runner.run(runner.from_onnx_helper(model_def))
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_squeeze_binary.py'])
diff --git a/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py
new file mode 100644
index 0000000000..4057c6b8ce
--- /dev/null
+++ b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py
@@ -0,0 +1,57 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""System test: test transpose"""
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import tensorflow as tf
+import numpy as np
+from tflite_test_runner import TfliteTestRunner
+
+
+def _make_module(in_shape, perm):
+    """Return a tf.Module whose call transposes a float32 in_shape tensor by perm."""
+    class TransposeModule(tf.Module):
+        # No __init__ override: tf.Module's own initializer is inherited and runs as-is.
+        @tf.function(input_signature=[tf.TensorSpec(in_shape, tf.float32)])
+        def __call__(self, x):
+            return tf.transpose(x, perm=perm)
+
+    return TransposeModule()
+
+
+in_shapes = [
+    [8, 3, 64, 3, 4],
+    [1, 3, 8, 8, 4]
+]
+
+perms = [
+    [2, 1, 0, 4, 3], #CPU
+    [0, 1, 3, 4, 2]  #target
+]
+
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('perm', perms)
+def test_squeeze_transpose(in_shape, perm, request):
+    """Run one transpose case; cases whose perm length differs from the input rank do nothing."""
+    if len(perm) != len(in_shape):
+        return
+    module = _make_module(in_shape, perm)
+    runner = TfliteTestRunner(request.node.name, ['cpu', 'k210', 'k510'])
+    runner.run(runner.from_tensorflow(module))
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_squeeze_transpose.py'])
diff --git a/tests/importer/tflite_/basic/test_compare.py b/tests/importer/tflite_/basic/test_compare.py
index 321a8cc1bd..7c4b8caea0 100644
--- a/tests/importer/tflite_/basic/test_compare.py
+++ b/tests/importer/tflite_/basic/test_compare.py
@@ -23,7 +23,7 @@ def _make_module(compare_op, in_type_0, in_shape_0, in_type_1, in_shape_1):
     class CompareModule(tf.Module):
         def __init__(self):
             super(CompareModule).__init__()
-            self.v = tf.constant(np.random.rand(*in_shape_1).astype(in_type_1))
+            self.v = tf.constant((np.ones(in_shape_1)/2.0).astype(in_type_1))
 
         @tf.function(input_signature=[tf.TensorSpec(in_shape_0, in_type_0)])
         def __call__(self, x):
diff --git a/tests/importer/tflite_/basic/test_conv2d.py b/tests/importer/tflite_/basic/test_conv2d.py
index 8e31c43ed5..f1a6dba7d0 100644
--- a/tests/importer/tflite_/basic/test_conv2d.py
+++ b/tests/importer/tflite_/basic/test_conv2d.py
@@ -89,7 +89,7 @@ def __call__(self, x):
 @pytest.mark.parametrize('padding', paddings)
 @pytest.mark.parametrize('dilations', dilations)
 def test_conv2d(n, i_channels, i_size, k_size, o_channels, strides, padding, dilations, request):
-    if padding != 'VALID' or (k_size[0] <= i_size[0] and k_size[1] <= i_size[1]):
+    if k_size[0] <= i_size[0] and k_size[1] <= i_size[1] and strides[0] <= k_size[0] and strides[1] <= k_size[1]:
         module = _make_module(n, i_channels, i_size, k_size, o_channels,
                               strides, padding, dilations)
 
diff --git a/tests/importer/tflite_/basic/test_space_to_batch.py b/tests/importer/tflite_/basic/test_space_to_batch.py
new file mode 100644
index 0000000000..7e6559481b
--- /dev/null
+++ b/tests/importer/tflite_/basic/test_space_to_batch.py
@@ -0,0 +1,58 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+import pytest
+import os
+import tensorflow as tf
+import numpy as np
+import sys
+from tflite_test_runner import TfliteTestRunner
+
+
+def _make_module(in_shape, block_shape, paddings):
+    """Return a tf.Module applying tf.space_to_batch to a float32 in_shape tensor."""
+    class SpaceToBatchModule(tf.Module):
+        # No __init__ override: tf.Module's own initializer is inherited and runs as-is.
+        @tf.function(input_signature=[tf.TensorSpec(in_shape, tf.float32)])
+        def __call__(self, x):
+            return tf.space_to_batch(x, block_shape, paddings)
+
+    return SpaceToBatchModule()
+
+in_shapes = [
+    [1, 16, 16, 3]
+]
+
+block_shapes = [
+    [2, 2],
+]
+
+paddings = [
+    [[0, 0], [0, 0]],
+    [[0, 2], [0, 2]],
+    [[2, 0], [2, 0]],
+    [[2, 2], [2, 2]]
+]
+
+@pytest.mark.parametrize('in_shape', in_shapes)
+@pytest.mark.parametrize('block_shape', block_shapes)
+@pytest.mark.parametrize('padding', paddings)
+def test_space_to_batch(in_shape, block_shape, padding, request):
+    """Run one space_to_batch case through TfliteTestRunner."""
+    space_to_batch_module = _make_module(in_shape, block_shape, padding)
+    runner = TfliteTestRunner(request.node.name)
+    runner.run(runner.from_tensorflow(space_to_batch_module))
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_space_to_batch.py'])
diff --git a/tests/importer/tflite_/model/test_mobilenetv1.py b/tests/importer/tflite_/model/test_mobilenetv1.py
index e3f1ce3179..987a910e53 100644
--- a/tests/importer/tflite_/model/test_mobilenetv1.py
+++ b/tests/importer/tflite_/model/test_mobilenetv1.py
@@ -40,20 +40,55 @@ def _make_module(in_shape, alpha):
 def test_mobilenetv1(in_shape, alpha, request):
     module = _make_module(in_shape, alpha)
     overwrite_cfg = """
-     judge:
-       specifics:
-         - matchs:
-             target: [cpu, k510]
-             ptq: true
-           threshold: 0.98
-         - matchs:
-             target: [k210]
-             ptq: true
-           threshold: 0.94
-         - matchs:
-             target: [k510]
-             ptq: false
-           threshold: 0.99
+    case: 
+      preprocess_opt:
+        - name: preprocess
+          values:
+            - true
+        - name: swapRB
+          values:
+            - false
+        - name: input_shape
+          values:
+            - [1,224,224,3]
+        - name: mean
+          values:
+            - [0.5,0.5,0.5]
+        - name: std
+          values:
+            - [0.5,0.5,0.5]
+        - name: input_range
+          values:
+            - [0,1]
+        - name: input_type
+          values:
+            - float32
+        - name: model_layout
+          values:
+            - NHWC
+        - name: input_layout
+          values:
+            - NHWC
+        - name: output_layout
+          values:
+            - NHWC
+        - name: letterbox_value
+          values:
+            - 0.
+    judge:
+      specifics:
+        - matchs:
+            target: [cpu, k510]
+            ptq: true
+          threshold: 0.97
+        - matchs:
+            target: [k210]
+            ptq: true
+          threshold: 0.94
+        - matchs:
+            target: [k510]
+            ptq: false
+          threshold: 0.99
      """
     runner = TfliteTestRunner(request.node.name, overwrite_configs=overwrite_cfg)
     model_file = runner.from_tensorflow(module)
diff --git a/tests/onnx_test_runner.py b/tests/onnx_test_runner.py
index 3f7c9d28c6..559c6169ae 100644
--- a/tests/onnx_test_runner.py
+++ b/tests/onnx_test_runner.py
@@ -63,6 +63,7 @@ def run(self, model_file):
 
         model_file = os.path.join(
             os.path.dirname(model_file), 'simplified.onnx')
+        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
         onnx.save_model(onnx_model, model_file)
 
         super().run(model_file)
@@ -115,10 +116,10 @@ def parse_model_input_output(self, model_file: str):
             input_dict = {}
             input_dict['name'] = e.name
             input_dict['dtype'] = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[onnx_type.elem_type]
-            input_dict['shape'] = [(i.dim_value if i.dim_value != 0 else d) for i, d in zip(
-                onnx_type.shape.dim, [1, 3, 224, 224])]
-            input_dict['model_shape'] = [(i.dim_value if i.dim_value != 0 else d) for i, d in zip(
-                onnx_type.shape.dim, [1, 3, 224, 224])]
+            input_dict['shape'] = [(i.dim_value if i.dim_value != 0 else 10) for i in
+                                   onnx_type.shape.dim]
+            input_dict['model_shape'] = [(i.dim_value if i.dim_value != 0 else 10) for i in
+                                         onnx_type.shape.dim]
             self.inputs.append(input_dict)
             self.calibs.append(copy.deepcopy(input_dict))
             self.dump_range_data.append(copy.deepcopy(input_dict))
@@ -150,6 +151,7 @@ def cpu_infer(self, case_dir: str, model_file: bytes, type: str, mode: str):
                 onnx_model = onnx.load(model_file)
                 onnx_model = version_converter.convert_version(onnx_model, 8)
                 model_file = os.path.join(case_dir, 'converted.onnx')
+                onnx_model = onnx.shape_inference.infer_shapes(onnx_model)  # shape_inference is a module; infer_shapes is the callable
                 onnx.save_model(onnx_model, model_file)
                 sess = ort.InferenceSession(model_file)
 
diff --git a/tests/schedule/buffer_fusion/test_bitcast.py b/tests/schedule/buffer_fusion/test_bitcast.py
new file mode 100644
index 0000000000..5797e0c29e
--- /dev/null
+++ b/tests/schedule/buffer_fusion/test_bitcast.py
@@ -0,0 +1,42 @@
+# Copyright 2019-2021 Canaan Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel
+
+import pytest
+import tensorflow as tf
+import numpy as np
+from tflite_test_runner import TfliteTestRunner
+
+
+def _make_module():
+    """Return a tf.Module that reshapes a [1, 4, 4, 3] float32 tensor to [1, -1, 3]."""
+    class Module(tf.Module):
+        # No __init__ override: tf.Module's own initializer is inherited and runs as-is.
+        @tf.function(input_signature=[tf.TensorSpec([1, 4, 4, 3], tf.float32)])
+        def __call__(self, x):
+            return tf.reshape(x, [1, -1, 3])
+
+    return Module()
+
+
+def test_bitcast(request):
+    """Run the reshape-only module through TfliteTestRunner."""
+    reshape_module = _make_module()
+    runner = TfliteTestRunner(request.node.name)
+    tflite_file = runner.from_tensorflow(reshape_module)
+    runner.run(tflite_file)
+
+
+if __name__ == "__main__":
+    pytest.main(['-vv', 'test_bitcast.py'])
diff --git a/tests/test_runner.py b/tests/test_runner.py
index 220eeb89bf..f2605597b5 100644
--- a/tests/test_runner.py
+++ b/tests/test_runner.py
@@ -20,7 +20,8 @@
 from compare_util import compare
 from dataset_utils import *
 from models.preprocess.preprocess import preprocess
-
+import socket
+import json
 
 class Edict:
     def __init__(self, d: Dict[str, int]) -> None:
@@ -108,7 +109,7 @@ def generate_random(shape: List[int], dtype: np.dtype,
     elif dtype == np.bool:
         data = np.random.rand(*shape) > 0.5
     else:
-        data = np.random.rand(*shape)
+        data = np.random.uniform(0.01, 1, shape)
     data = data.astype(dtype=dtype)
     if abs:
         return np.abs(data)
@@ -231,7 +232,16 @@ def __init__(self, case_name, targets=None, overwrite_configs: Union[Dict, str]
             self.case_dir = os.path.join(self.cfg.setup.root, case_name)
         self.clear(self.case_dir)
 
-        self.validate_targets(targets)
+        # Board-CI settings come from the environment; each one stays None when unset.
+        self.kpu_target = os.getenv('KPU_TARGET')
+        self.port = os.getenv('PORT')
+        self.nncase_test_ci = os.getenv('NNCASE_TEST_CI')
+        board_env = (self.kpu_target, self.port, self.nncase_test_ci)
+        # Run only the board target when this is a CI case with random inputs, all three
+        # settings are present, and `targets` (if given) does not exclude that target.
+        board_only = self.in_ci and self.cfg.case.generate_inputs.name == 'generate_random' and all(
+            v is not None for v in board_env) and (targets is None or self.kpu_target in targets)
+        self.validate_targets([self.kpu_target] if board_only else targets)
 
         self.inputs: List[Dict] = []
         self.calibs: List[Dict] = []
@@ -316,7 +326,7 @@ def get_process_config(self, config):
 
     def data_pre_process(self, data):
         data = copy.deepcopy(data)
-        if self.pre_process[3]['input_type'] == "float32":
+        if self.pre_process[0]['preprocess'] and self.pre_process[3]['input_type'] == "float32":
             data = np.asarray(data, dtype=np.float32)
         if self.pre_process[0]['preprocess'] and len(data.shape) == 4:
             if self.pre_process[-1]['input_layout'] == 'NCHW':
@@ -462,6 +472,14 @@ def import_model(self, compiler, model_content, import_options):
     def run_single(self, cfg, case_dir: str, model_file: Union[List[str], str]):
         if not self.inputs:
             self.parse_model_input_output(model_file)
+        # Board run: CI mode, all three board settings present, and parsed model inputs/outputs.
+        board_env = (self.kpu_target, self.port, self.nncase_test_ci)
+        on_board = self.in_ci and all(v is not None for v in board_env) and len(self.inputs) > 0 and len(self.outputs) > 0
+        # On board, ImageNet-backed input and calibration generators use batch size 1.
+        for section in (('generate_inputs', 'generate_calibs') if on_board else ()):
+            if getattr(cfg, section).name == 'generate_imagenet_dataset':
+                getattr(cfg, section).batch_size = 1
+
         names, args = TestRunner.split_value(cfg.preprocess_opt)
         for combine_args in product(*args):
             dict_args = dict(zip(names, combine_args))
@@ -734,8 +752,13 @@ def nncase_infer(self, cfg, case_dir: str,
         if kwargs['ptq']:
             ptq_options = nncase.PTQTensorOptions()
             if cfg.generate_calibs.name == "generate_imagenet_dataset":
-                ptq_options.set_tensor_data(np.asarray(
-                    [sample['data'] for sample in self.calibs]).tobytes())
+                # Calibration bytes are the first array of every sample of the first
+                # calibration entry, concatenated in sample order. A single join avoids
+                # rebuilding the bytes object per sample and tolerates an empty sample list.
+                # NOTE(review): entries after self.calibs[0] are not sent; confirm that
+                # ImageNet PTQ cases only ever have one calibration entry.
+                byte_inputs = b''.join(np.asarray(sample[0]).tobytes() for sample in self.calibs[0]['data'])
+                ptq_options.set_tensor_data(byte_inputs)
                 ptq_options.calibrate_method = self.cfg.case.compile_opt.quant_method
             else:
                 raw_inputs = [self.transform_input(sample['data'], preprocess['input_type'], "infer") for sample in
@@ -754,6 +777,10 @@ def nncase_infer(self, cfg, case_dir: str,
             f.write(kmodel)
 
         infer_output_paths: List[np.ndarray] = []
+        # Board execution: CI mode, this target is KPU_TARGET, PORT and the test app are set, and I/O is parsed.
+        on_board = self.in_ci and kwargs['target'] == self.kpu_target and all(v is not None for v in (self.port, self.nncase_test_ci)) and len(self.inputs) > 0 and len(self.outputs) > 0
+        case_name = f'{os.path.basename(case_dir)}_{os.path.basename(infer_dir)}'
+
         if cfg.generate_inputs.name == "generate_imagenet_dataset":
             gnne_txt = "gnne_no_ptq" if kwargs['ptq'] is False else "gnne_ptq"
             infer_output_paths.append((
@@ -767,45 +794,157 @@ def nncase_infer(self, cfg, case_dir: str,
             result = []
             for in_data in self.inputs[0]['data']:
                 input_data = copy.deepcopy(in_data)
-                p.apply_async(sim_run, args=(
-                    kmodel, input_data, infer_output_paths, kwargs['target'], self.model_type,
-                    self.inputs[0]['model_shape']))
+                # NOTE(review): on_board_run is not defined in any hunk visible here; confirm it exists in this module.
+                if not on_board:
+                    p.apply_async(sim_run, args=(kmodel, input_data, infer_output_paths, kwargs['target'],
+                                                 self.model_type, self.inputs[0]['model_shape']))
+                else:
+                    on_board_run(kmodel, input_data, infer_output_paths, kwargs['target'], self.port, case_name,
+                                 self.nncase_test_ci, len(self.inputs), len(self.outputs), self.model_type, self.inputs[0]['model_shape'])
             p.close()
             p.join()
 
         else:
-            sim = nncase.Simulator()
-            sim.load_model(kmodel)
-            for i in range(len(self.inputs)):
-                data = self.transform_input(
-                    self.inputs[i]['data'], preprocess['input_type'], "infer")
-                dtype = preprocess['input_type']
-                if preprocess['preprocess']:
-                    data.tofile(os.path.join(case_dir, f'input_{i}_{dtype}.bin'))
-                    self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data)
-
-                sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(data))
-            sim.run()
-
-            for i in range(sim.outputs_size):
-                result = sim.get_output_tensor(i).to_numpy()
-                if preprocess['preprocess'] and len(result.shape) == 4:
-                    if (preprocess['output_layout'] == 'NHWC' and self.model_type in ['caffe', 'onnx']):
-                        result = np.transpose(result, [0, 3, 1, 2])
-                    elif (preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']):
-                        result = np.transpose(result, [0, 2, 3, 1])
-                infer_output_paths.append((
-                    os.path.join(infer_dir, f'nncase_result_{i}.bin'),
-                    os.path.join(infer_dir, f'nncase_result_{i}.txt')))
-                if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq":
-                    result.tofile(os.path.join(
-                        infer_dir, f'nncase_result_{cfg.compile_opt.output_type}_{i}.bin'))
-                    self.totxtfile(os.path.join(
-                        infer_dir, f'nncase_result_{cfg.compile_opt.output_type}_{i}.txt'), result)
-                    result = deq_output(os.path.join(
-                        infer_dir, f'kmodel_info.txt'), result)
-                result.tofile(infer_output_paths[-1][0])
-                self.totxtfile(infer_output_paths[-1][1], result)
+            if on_board:
+                # Board path: hand the test app, the kmodel and the inputs to the server on
+                # localhost:PORT, then pull the raw outputs back over the same connection.
+                def send_and_wait(sock, payload):
+                    """Send payload in full, then block for the server's reply (content unused)."""
+                    sock.sendall(payload)
+                    sock.recv(1024)
+
+                def send_file(sock, file_name, payload):
+                    """Announce a file as JSON (file_name, file_size), then send its bytes."""
+                    file_dict = {'file_name': file_name, 'file_size': len(payload)}
+                    send_and_wait(sock, json.dumps(file_dict).encode())
+                    send_and_wait(sock, payload)
+
+                def recv_exact(sock, size):
+                    """Read exactly size bytes; raise ConnectionError if the peer closes early."""
+                    received = bytearray()
+                    while len(received) < size:
+                        # Never ask for more than is still missing, so nothing past this file is consumed.
+                        chunk = sock.recv(min(4096, size - len(received)))
+                        if not chunk:
+                            raise ConnectionError(f'connection closed after {len(received)} of {size} bytes')
+                        received += chunk
+                    return received
+
+                # The with-block closes the socket on every exit path, including exceptions.
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:
+                    client_socket.connect(('localhost', int(self.port)))
+
+                    # send header
+                    header_dict = {
+                        'case': case_name,
+                        'app': 1,
+                        'kmodel': 1,
+                        'inputs': len(self.inputs),
+                        'outputs': len(self.outputs),
+                    }
+                    send_and_wait(client_socket, json.dumps(header_dict).encode())
+
+                    # send app
+                    with open(self.nncase_test_ci, 'rb') as f:
+                        send_file(client_socket, os.path.basename(self.nncase_test_ci), f.read())
+
+                    # send kmodel
+                    send_file(client_socket, 'test.kmodel', kmodel)
+
+                    # send inputs
+                    for i in range(len(self.inputs)):
+                        data = self.transform_input(
+                            self.inputs[i]['data'], preprocess['input_type'], "infer")
+                        dtype = preprocess['input_type']
+                        if preprocess['preprocess']:
+                            data.tofile(os.path.join(case_dir, f'input_{i}_{dtype}.bin'))
+                            self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data)
+                        # file_size is the length of the bytes actually sent, so the announcement
+                        # cannot disagree with the payload or depend on a file existing on disk.
+                        send_file(client_socket, f'input_0_{i}.bin', data.tobytes())
+
+                    # infer result
+                    cmd_result = client_socket.recv(1024).decode()
+                    # any report without 'succeed' aborts the case with the server's message
+                    if cmd_result.find('succeed') == -1:
+                        raise Exception(f'{cmd_result}')
+                    client_socket.sendall("pls send outputs".encode())
+
+                    # recv outputs
+                    for i in range(len(self.outputs)):
+                        # the server first sends the size of the next output as decimal text
+                        file_size = int(client_socket.recv(1024).decode())
+                        client_socket.sendall(f"pls send nncase_result_{i}.bin".encode())
+                        buffer = recv_exact(client_socket, file_size)
+
+                        # save nncase_result
+                        nncase_result = np.frombuffer(buffer, dtype=self.outputs[i]['dtype'])
+                        nncase_result.tofile(os.path.join(infer_dir, f'nncase_result_{i}.bin'))
+                        self.totxtfile(os.path.join(infer_dir, f'nncase_result_{i}.txt'), nncase_result)
+
+                        # save nncase_vs_cpu_result
+                        # In the two layout-swap cases the flat buffer is first viewed in the swapped
+                        # axis order and then transposed, so the result ends up in model_shape order.
+                        model_shape = self.outputs[i]['model_shape']
+                        nncase_vs_cpu_result = nncase_result.reshape(model_shape)
+                        if preprocess['preprocess'] and len(model_shape) == 4:
+                            if (preprocess['output_layout'] == 'NHWC' and self.model_type in ['caffe', 'onnx']):
+                                nncase_vs_cpu_result = nncase_result.reshape(model_shape[0], model_shape[2], model_shape[3], model_shape[1])
+                                nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 3, 1, 2])
+                            elif (preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']):
+                                nncase_vs_cpu_result = nncase_result.reshape(model_shape[0], model_shape[3], model_shape[1], model_shape[2])
+                                nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 2, 3, 1])
+                        infer_output_paths.append((
+                            os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.bin'),
+                            os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.txt')))
+                        if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq":
+                            nncase_vs_cpu_result.tofile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.bin'))
+                            self.totxtfile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.txt'), nncase_vs_cpu_result)
+                            nncase_vs_cpu_result = deq_output(os.path.join(infer_dir, f'kmodel_info.txt'), nncase_vs_cpu_result)
+                        nncase_vs_cpu_result.tofile(infer_output_paths[-1][0])
+                        self.totxtfile(infer_output_paths[-1][1], nncase_vs_cpu_result)
+
+                        client_socket.sendall(f"recv nncase_result_{i}.bin succeed".encode())
+            else:
+                # Simulator path: load the kmodel, bind every input, run once, then dump each output.
+                sim = nncase.Simulator()
+                sim.load_model(kmodel)
+                for i, model_input in enumerate(self.inputs):
+                    data = self.transform_input(model_input['data'], preprocess['input_type'], "infer")
+                    if preprocess['preprocess']:
+                        # keep a binary and a text dump of the tensor that is fed to the model
+                        dtype = preprocess['input_type']
+                        data.tofile(os.path.join(case_dir, f'input_{i}_{dtype}.bin'))
+                        self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data)
+                    sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(data))
+                sim.run()
+
+                for i in range(sim.outputs_size):
+                    # raw simulator output, dumped before any layout change
+                    nncase_result = sim.get_output_tensor(i).to_numpy()
+                    nncase_result.tofile(os.path.join(infer_dir, f'nncase_result_{i}.bin'))
+                    self.totxtfile(os.path.join(infer_dir, f'nncase_result_{i}.txt'), nncase_result)
+
+                    # pick the axis permutation (if any) applied to the nncase_vs_cpu_result copy
+                    perm = None
+                    model_shape = self.outputs[i]['model_shape']
+                    if preprocess['preprocess'] and len(model_shape) == 4:
+                        if preprocess['output_layout'] == 'NHWC' and self.model_type in ['caffe', 'onnx']:
+                            perm = [0, 3, 1, 2]
+                        elif preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']:
+                            perm = [0, 2, 3, 1]
+                    nncase_vs_cpu_result = nncase_result if perm is None else np.transpose(nncase_result, perm)
+                    bin_path = os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.bin')
+                    txt_path = os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.txt')
+                    infer_output_paths.append((bin_path, txt_path))
+                    if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq":
+                        # non-float32 PTQ output: keep the typed dump, then pass it to deq_output with kmodel_info.txt
+                        nncase_vs_cpu_result.tofile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.bin'))
+                        self.totxtfile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.txt'), nncase_vs_cpu_result)
+                        nncase_vs_cpu_result = deq_output(os.path.join(infer_dir, f'kmodel_info.txt'), nncase_vs_cpu_result)
+                    nncase_vs_cpu_result.tofile(bin_path)
+                    self.totxtfile(txt_path, nncase_vs_cpu_result)
+
         return infer_output_paths
 
     def on_test_start(self) -> None:
diff --git a/toolchains/k230.baremetal.toolchain.cmake b/toolchains/k230.baremetal.toolchain.cmake
index 5e8cde22c1..49163af48b 100644
--- a/toolchains/k230.baremetal.toolchain.cmake
+++ b/toolchains/k230.baremetal.toolchain.cmake
@@ -16,6 +16,10 @@ set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-elf-g++")
 
 set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-elf")
 
+# NOTE(review): these flags are appended to again at the end of this file with a different -march; confirm which one is intended.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany -static")   #-march=rv64imafdc_v0p7_zfh_zvamo0p7_zvlsseg0p7_xtheadc
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany -static") 
+
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
@@ -26,4 +30,7 @@ set(ENABLE_HALIDE OFF)
 set(DEFAULT_BUILTIN_RUNTIMES OFF)
 set(BUILD_PYTHON_BINDING OFF)
 set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF)
-set(BUILD_BENCHMARK OFF)
\ No newline at end of file
+set(BUILD_BENCHMARK OFF)
+# Appended after the earlier -march/-mabi/-mcmodel set in this file. NOTE(review): GCC normally honours the last -march given; confirm.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d -mcmodel=medany -mtune=c908")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d -mcmodel=medany -mtune=c908")
diff --git a/toolchains/k230.linux.toolchain.cmake b/toolchains/k230.linux.toolchain.cmake
index 730072c6e1..92ccf2aece 100644
--- a/toolchains/k230.linux.toolchain.cmake
+++ b/toolchains/k230.linux.toolchain.cmake
@@ -10,20 +10,18 @@ if(NOT RISCV_ROOT_PATH)
 endif()
 
 set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
-
-set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc")
-set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc")
-
-set(CMAKE_C_FLAGS "-march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906")
-set(CMAKE_CXX_FLAGS "-march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906")
-
-set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
+set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-musl-gcc")
+set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-musl-g++")
+set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-musl")
 
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(ENABLE_VULKAN_RUNTIME OFF)
 set(ENABLE_HALIDE OFF)
-# set(DEFAULT_BUILTIN_RUNTIMES OFF)
-# set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF)
+set(DEFAULT_BUILTIN_RUNTIMES OFF)
+set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF)
 set(BUILD_BENCHMARK OFF)
+# Append the rv64imafdcv / lp64d / medany options to whatever C and C++ flags are already set.
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany")
diff --git a/toolchains/x86_64.toolchain.cmake b/toolchains/x86_64.toolchain.cmake
new file mode 100644
index 0000000000..6808c70c84
--- /dev/null
+++ b/toolchains/x86_64.toolchain.cmake
@@ -0,0 +1,13 @@
+# Enables the x86-64 SIMD build: defines X86_64_SIMD_ON and turns on the SIMD
+# instruction-set options below. The branch is chosen by the build host's OS name.
+if(CMAKE_HOST_SYSTEM_NAME MATCHES "Windows")
+    add_definitions(/DX86_64_SIMD_ON)
+    # /arch:AVX2 alone is enough; also passing /arch:AVX only makes cl warn about the override.
+    add_compile_options(/arch:AVX2)
+elseif(CMAKE_HOST_SYSTEM_NAME MATCHES "Linux")
+    add_definitions(-DX86_64_SIMD_ON)
+    # NOTE(review): -msse4a is an AMD-only extension; confirm it is wanted for every x86-64 build.
+    add_compile_options(-mfma -msse -msse2 -msse3 -mssse3 -msse4 -msse4a -msse4.1 -msse4.2 -mavx -mavx2)
+else()
+    message("current platform: other ... ")
+endif()
diff --git a/tools/stackvm_gen/IsaGen/Instructions.cs b/tools/stackvm_gen/IsaGen/Instructions.cs
index ee4c76bc12..1ce60f6596 100644
--- a/tools/stackvm_gen/IsaGen/Instructions.cs
+++ b/tools/stackvm_gen/IsaGen/Instructions.cs
@@ -8,2248 +8,2456 @@
 
 namespace IsaGen
 {
-    [System.AttributeUsage(AttributeTargets.Enum, Inherited = false, AllowMultiple = false)]
-    public sealed class BitLengthAttribute : Attribute
-    {
-        public uint BitLength { get; }
-
-        public BitLengthAttribute(uint bitLength)
-        {
-            BitLength = bitLength;
-        }
-    }
-
-    [System.AttributeUsage(AttributeTargets.All, Inherited = false, AllowMultiple = false)]
-    public sealed class EnumNameAttribute : Attribute
-    {
-        public string Name { get; }
-
-        public EnumNameAttribute(string name)
-        {
-            Name = name;
-        }
-    }
-
-    [BitLength(8)]
-    [EnumName("opcode_t")]
-    public enum OpCode
-    {
-        NOP,
-        LDNULL,
-        LDC_I4,
-        LDC_I4_0,
-        LDC_I4_1,
-        LDC_R4,
-        LDIND_I1,
-        LDIND_I2,
-        LDIND_I4,
-        LDIND_I,
-        LDIND_U1,
-        LDIND_U2,
-        LDIND_U4,
-        LDIND_U,
-        LDIND_BR2,
-        LDIND_R4,
-        STIND_I1,
-        STIND_I2,
-        STIND_I4,
-        STIND_I,
-        STIND_BR2,
-        STIND_R4,
-        LEA_GP,
-        LEA_BUFFER,
-
-        LDELEM_I1,
-        LDELEM_I2,
-        LDELEM_I4,
-        LDELEM_I,
-        LDELEM_U1,
-        LDELEM_U2,
-        LDELEM_U4,
-        LDELEM_U,
-        LDELEM_BR2,
-        LDELEM_R4,
-        STELEM_I1,
-        STELEM_I2,
-        STELEM_I4,
-        STELEM_I,
-        STELEM_BR2,
-        STELEM_R4,
-
-        LDARG,
-        LDARG_0,
-        LDARG_1,
-        LDARG_2,
-        LDARG_3,
-        LDARG_4,
-        LDARG_5,
-
-        DUP,
-        POP,
-
-        STSHAPE,
-        STPADDINGS,
-
-        NEG,
-        ADD,
-        SUB,
-        MUL,
-        DIV,
-        DIV_U,
-        REM,
-        REM_U,
-        AND,
-        OR,
-        XOR,
-        NOT,
-        SHL,
-        SHR,
-        SHR_U,
-
-        CLT,
-        CLT_U,
-        CLE,
-        CLE_U,
-        CEQ,
-        CGE,
-        CGE_U,
-        CGT,
-        CGT_U,
-        CNE,
-
-        CONV_I1,
-        CONV_I2,
-        CONV_I4,
-        CONV_I,
-        CONV_U1,
-        CONV_U2,
-        CONV_U4,
-        CONV_U,
-        CONV_BR2,
-        CONV_R4,
-
-        BR,
-        BR_TRUE,
-        BR_FALSE,
-        RET,
-        CALL,
-        ECALL,
-        THROW,
-        BREAK,
-
-        TENSOR,
-    }
-
-    [BitLength(16)]
-    [EnumName("tensor_function_t")]
-    public enum TensorFunction
-    {
-        BATCH_TO_SPACE,
-        BINARY,
-        BROADCAST,
-        CALL,
-        COMPARE,
-        CLAMP,
-        CONV2D,
-        CONV2D_TRANSPOSE,
-        CONVERT,
-        COPY,
-        CUMSUM,
-        DEQUANTIZE,
-        GATHER,
-        GATHER_ND,
-        HARDMAX,
-        LOGISTIC,
-        LUT1D,
-        MATMUL,
-        ONEHOT,
-        PAD,
-        QUANTIZE,
-        RANDOM_NORMAL,
-        RANDOM_UNIFORM,
-        REDUCE,
-        REDUCE_ARG,
-        REDUCE_PROD,
-        REDUCE_WINDOW2D,
-        RESIZE_IMAGE,
-        ROI_ALIGN,
-        SIGMOID,
-        SLICE,
-        SOFTMAX,
-        SPACE_TO_BATCH,
-        TAKE,
-        TERNARY,
-        TOPK,
-        TRANSPOSE,
-        TRILU,
-        UNARY,
-    }
-
-    [BitLength(8)]
-    [EnumName("datatype_t")]
-    [Browsable(false)]
-    public enum DataType
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("onehot_mode_t")]
-    [Browsable(false)]
-    public enum OneHotMode
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("pad_mode_t")]
-    [Browsable(false)]
-    public enum PadMode
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("memory_location_t")]
-    [Browsable(false)]
-    public enum MemoryLocation
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("reduce_op_t")]
-    [Browsable(false)]
-    public enum ReduceOp
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("reduce_arg_op_t")]
-    [Browsable(false)]
-    public enum ReduceArgOp
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("image_resize_mode_t")]
-    [Browsable(false)]
-    public enum ImageResizeMode
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("binary_op_t")]
-    [Browsable(false)]
-    public enum BinaryOp
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("unary_op_t")]
-    [Browsable(false)]
-    public enum UnaryOp
-    {
-    }
-
-    [BitLength(8)]
-    [EnumName("compare_op_t")]
-    [Browsable(false)]
-    public enum CompareOp
-    {
-    }
-
-   [BitLength(8)]
-    [EnumName("roi_align_mode_t")]
-    [Browsable(false)]
-    public enum RoiAlignMode
-    {
-    }
-
-    public abstract class Instruction
-    {
-        [DisplayName("opcode")]
-        [Description("OpCode")]
-        public abstract OpCode OpCode { get; }
-    }
-
-    [DisplayName("NOP")]
-    [Category("Control and Status Instructions")]
-    [Description("No operation")]
-    public class NopInstruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.NOP;
-    }
-
-    [DisplayName("LDC_I4")]
-    [Category("Immediate Instructions")]
-    [Description("Load immedidate I4 to stack")]
-    public class LdcI4Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDC_I4;
-
-        [DisplayName("imm")]
-        [Description("Immedidate I4")]
-        public int Imm { get; set; }
-    }
-
-    [DisplayName("LDNULL")]
-    [Category("Immediate Instructions")]
-    [Description("Load immedidate nullptr as I to stack")]
-    public class LdNullInstruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDNULL;
-    }
-
-    [DisplayName("LDC_I4_0")]
-    [Category("Immediate Instructions")]
-    [Description("Load immedidate 0 as I4 to stack")]
-    public class LdcI4_0Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDC_I4_0;
-    }
-
-    [DisplayName("LDC_I4_1")]
-    [Category("Immediate Instructions")]
-    [Description("Load immedidate 1 as I4 to stack")]
-    public class LdcI4_1Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDC_I4_1;
-    }
-
-    [DisplayName("LDC_R4")]
-    [Category("Immediate Instructions")]
-    [Description("Load immedidate R4 to stack")]
-    public class LdcR4Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDC_R4;
-
-        [DisplayName("imm")]
-        [Description("Immedidate R4")]
-        public float Imm { get; set; }
-    }
-
-    [Category("Load Store Instructions")]
-    public abstract class LdStindInstruction : Instruction
-    {
-    }
-
-    [DisplayName("LDIND_I1")]
-    [Description("Load indirect I1 to stack")]
-    public class LdindI1Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_I1;
-    }
-
-    [DisplayName("LDIND_I2")]
-    [Description("Load indirect I2 to stack")]
-    public class LdindI2Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_I2;
-    }
-
-    [DisplayName("LDIND_I4")]
-    [Description("Load indirect I4 to stack")]
-    public class LdindI4Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_I4;
-    }
-
-    [DisplayName("LDIND_I")]
-    [Description("Load indirect I to stack")]
-    public class LdindIInstruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_I;
-    }
-
-    [DisplayName("LDIND_U1")]
-    [Description("Load indirect U1 to stack")]
-    public class LdindU1Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_U1;
-    }
-
-    [DisplayName("LDIND_U2")]
-    [Description("Load indirect U2 to stack")]
-    public class LdindU2Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_U2;
-    }
-
-    [DisplayName("LDIND_U4")]
-    [Description("Load indirect U4 to stack")]
-    public class LdindU4Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_U4;
-    }
-
-    [DisplayName("LDIND_U")]
-    [Description("Load indirect U to stack")]
-    public class LdindUInstruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_U;
-    }
-
-    [DisplayName("LDIND_BR2")]
-    [Description("Load indirect BR2 to stack")]
-    public class LdindBR2Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_BR2;
-    }
-
-    [DisplayName("LDIND_R4")]
-    [Description("Load indirect R4 to stack")]
-    public class LdindR4Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.LDIND_R4;
-    }
-
-    [DisplayName("STIND_I1")]
-    [Description("Store indirect I1 from stack")]
-    public class StindI1Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_I1;
-    }
-
-    [DisplayName("STIND_I2")]
-    [Description("Store indirect I2 from stack")]
-    public class StindI2Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_I2;
-    }
-
-    [DisplayName("STIND_I4")]
-    [Description("Store indirect I4 from stack")]
-    public class StindI4Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_I4;
-    }
-
-    [DisplayName("STIND_I")]
-    [Description("Store indirect I from stack")]
-    public class StindIInstruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_I;
-    }
-
-    [DisplayName("STIND_BR2")]
-    [Description("Store indirect BR2 from stack")]
-    public class StindBR2Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_BR2;
-    }
-
-    [DisplayName("STIND_R4")]
-    [Description("Store indirect R4 from stack")]
-    public class StindR4Instruction : LdStindInstruction
-    {
-        public override OpCode OpCode => OpCode.STIND_R4;
-    }
-
-    [DisplayName("LEA_GP")]
-    [Category("Load Store Instructions")]
-    [Description("Load a global pointer with offset to stack")]
-    public class LeaGPInstruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LEA_GP;
-
-        [DisplayName("gpid")]
-        [Description("Global pointer id")]
-        public byte GpId { get; set; }
-
-        [DisplayName("offset")]
-        [Description("Signed immediate offset")]
-        public int Offset { get; set; }
-    }
-
-    [DisplayName("LEA_BUFFER")]
-    [Category("Load Store Instructions")]
-    [Description("Load a buffer pointer with offset to stack")]
-    public class LeaBufferInstruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LEA_BUFFER;
-
-        [DisplayName("location")]
-        [Description("Location")]
-        public MemoryLocation Location { get; set; }
-
-        [DisplayName("subres_id")]
-        [Description("SubresourceId")]
-        public byte SubresourceId { get; set; }
-
-        [DisplayName("offset")]
-        [Description("Unsigned immediate offset")]
-        public uint Offset { get; set; }
-    }
-
-    [DisplayName("LDELEM_I1")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of I1 to stack")]
-    public class LdelemI1Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDELEM_I1;
-    }
-
-    [DisplayName("LDELEM_I2")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of I2 to stack")]
-    public class LdelemI2Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDELEM_I2;
-    }
-
-    [DisplayName("LDELEM_I4")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of I4 to stack")]
-    public class LdelemI4Instruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDELEM_I4;
-    }
-
-    [DisplayName("LDELEM_I")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of I to stack")]
-    public class LdelemIInstruction : Instruction
-    {
-        public override OpCode OpCode => OpCode.LDELEM_I;
-    }
-
-    /// <summary>LDELEM_U1 instruction: load an array element of U1 to stack.</summary>
-    [DisplayName("LDELEM_U1")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of U1 to stack")]
-    public class LdelemU1Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_U1</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_U1; }
-        }
-    }
-
-    /// <summary>LDELEM_U2 instruction: load an array element of U2 to stack.</summary>
-    [DisplayName("LDELEM_U2")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of U2 to stack")]
-    public class LdelemU2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_U2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_U2; }
-        }
-    }
-
-    /// <summary>LDELEM_U4 instruction: load an array element of U4 to stack.</summary>
-    [DisplayName("LDELEM_U4")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of U4 to stack")]
-    public class LdelemU4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_U4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_U4; }
-        }
-    }
-
-    /// <summary>LDELEM_U instruction: load an array element of U to stack.</summary>
-    [DisplayName("LDELEM_U")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of U to stack")]
-    public class LdelemUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_U; }
-        }
-    }
-
-    /// <summary>LDELEM_BR2 instruction: load an array element of BR2 to stack.</summary>
-    [DisplayName("LDELEM_BR2")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of BR2 to stack")]
-    public class LdelemBR2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_BR2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_BR2; }
-        }
-    }
-
-    /// <summary>LDELEM_R4 instruction: load an array element of R4 to stack.</summary>
-    [DisplayName("LDELEM_R4")]
-    [Category("Load Store Instructions")]
-    [Description("Load an array element of R4 to stack")]
-    public class LdelemR4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDELEM_R4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDELEM_R4; }
-        }
-    }
-
-    /// <summary>STELEM_I1 instruction: store an array element of I1 from stack.</summary>
-    [DisplayName("STELEM_I1")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of I1 from stack")]
-    public class StelemI1Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_I1</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_I1; }
-        }
-    }
-
-    /// <summary>STELEM_I2 instruction: store an array element of I2 from stack.</summary>
-    [DisplayName("STELEM_I2")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of I2 from stack")]
-    public class StelemI2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_I2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_I2; }
-        }
-    }
-
-    /// <summary>STELEM_I4 instruction: store an array element of I4 from stack.</summary>
-    [DisplayName("STELEM_I4")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of I4 from stack")]
-    public class StelemI4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_I4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_I4; }
-        }
-    }
-
-    /// <summary>STELEM_I instruction: store an array element of I from stack.</summary>
-    [DisplayName("STELEM_I")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of I from stack")]
-    public class StelemIInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_I</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_I; }
-        }
-    }
-
-    /// <summary>STELEM_BR2 instruction: store an array element of BR2 from stack.</summary>
-    [DisplayName("STELEM_BR2")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of BR2 from stack")]
-    public class StelemBR2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_BR2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_BR2; }
-        }
-    }
-
-    /// <summary>STELEM_R4 instruction: store an array element of R4 from stack.</summary>
-    [DisplayName("STELEM_R4")]
-    [Category("Load Store Instructions")]
-    [Description("Store an array element of R4 from stack")]
-    public class StelemR4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STELEM_R4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STELEM_R4; }
-        }
-    }
-
-    /// <summary>LDARG instruction: load an argument to stack.</summary>
-    [DisplayName("LDARG")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument to stack")]
-    public class LdargInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG; }
-        }
-
-        /// <summary>Gets or sets the argument index operand.</summary>
-        [DisplayName("index"), Description("Argument index")]
-        public uint Index { get; set; }
-    }
-
-    /// <summary>LDARG_0 instruction: load the argument with index 0 to stack.</summary>
-    [DisplayName("LDARG_0")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument with index of 0 to stack")]
-    public class Ldarg0Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_0</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG_0; }
-        }
-    }
-
-    /// <summary>LDARG_1 instruction: load the argument with index 1 to stack.</summary>
-    [DisplayName("LDARG_1")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument with index of 1 to stack")]
-    public class Ldarg1Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_1</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG_1; }
-        }
-    }
-
-    /// <summary>LDARG_2 instruction: load the argument with index 2 to stack.</summary>
-    [DisplayName("LDARG_2")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument with index of 2 to stack")]
-    public class Ldarg2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG_2; }
-        }
-    }
-
-    /// <summary>LDARG_3 instruction: load the argument with index 3 to stack.</summary>
-    [DisplayName("LDARG_3")]
-    [Category("Load Store Instructions")]
-    // Description previously read "index of 1" (copy-paste slip); this class models LDARG_3.
-    [Description("Load an argument with index of 3 to stack")]
-    public class Ldarg3Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_3</c>.</summary>
-        public override OpCode OpCode => OpCode.LDARG_3;
-    }
-
-    /// <summary>LDARG_4 instruction: load the argument with index 4 to stack.</summary>
-    [DisplayName("LDARG_4")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument with index of 4 to stack")]
-    public class Ldarg4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG_4; }
-        }
-    }
-
-    /// <summary>LDARG_5 instruction: load the argument with index 5 to stack.</summary>
-    [DisplayName("LDARG_5")]
-    [Category("Load Store Instructions")]
-    [Description("Load an argument with index of 5 to stack")]
-    public class Ldarg5Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.LDARG_5</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.LDARG_5; }
-        }
-    }
-
-    /// <summary>STSHAPE instruction: store a shape from stack.</summary>
-    [DisplayName("STSHAPE")]
-    [Category("Load Store Instructions")]
-    [Description("Store a shape from stack")]
-    public class StShapeInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STSHAPE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STSHAPE; }
-        }
-
-        /// <summary>Gets or sets the shape register index.</summary>
-        [DisplayName("rshape"), Description("Shape register index")]
-        public byte Rshape { get; set; }
-
-        /// <summary>Gets or sets the rank of the shape.</summary>
-        [DisplayName("rank"), Description("Shape's rank")]
-        public byte Rank { get; set; }
-    }
-
-    /// <summary>STPADDINGS instruction: store paddings from stack.</summary>
-    [DisplayName("STPADDINGS")]
-    [Category("Load Store Instructions")]
-    [Description("Store paddings from stack")]
-    public class StPaddingsInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.STPADDINGS</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.STPADDINGS; }
-        }
-
-        /// <summary>Gets or sets the paddings register index.</summary>
-        [DisplayName("rpaddings"), Description("Paddings register index")]
-        public byte Rpaddings { get; set; }
-
-        /// <summary>Gets or sets the rank of the paddings.</summary>
-        [DisplayName("rank"), Description("Paddings' rank")]
-        public byte Rank { get; set; }
-    }
-
-    /// <summary>DUP instruction: duplicate the top item of stack.</summary>
-    [DisplayName("DUP")]
-    [Category("Stack Instructions")]
-    [Description("Duplicate the top item of stack")]
-    public class DupInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.DUP</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.DUP; }
-        }
-    }
-
-    /// <summary>POP instruction: pop the top item of stack.</summary>
-    [DisplayName("POP")]
-    [Category("Stack Instructions")]
-    [Description("Pop the top item of stack")]
-    public class PopInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.POP</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.POP; }
-        }
-    }
-
-    /// <summary>NEG instruction: negates a value and pushes the result.</summary>
-    [DisplayName("NEG")]
-    [Category("Computational Instructions")]
-    [Description("Negates a value and pushes the result onto the evaluation stack")]
-    public class NegInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.NEG</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.NEG; }
-        }
-    }
-
-    /// <summary>ADD instruction: adds two values and pushes the result.</summary>
-    [DisplayName("ADD")]
-    [Category("Computational Instructions")]
-    [Description("Adds two values and pushes the result onto the evaluation stack")]
-    public class AddInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.ADD</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.ADD; }
-        }
-    }
-
-    /// <summary>SUB instruction: subtracts one value from another and pushes the result.</summary>
-    [DisplayName("SUB")]
-    [Category("Computational Instructions")]
-    [Description("Subtracts one value from another and pushes the result onto the evaluation stack")]
-    public class SubInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.SUB</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.SUB; }
-        }
-    }
-
-    /// <summary>MUL instruction: multiplies two values and pushes the result.</summary>
-    [DisplayName("MUL")]
-    [Category("Computational Instructions")]
-    [Description("Multiplies two values and pushes the result on the evaluation stack")]
-    public class MulInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.MUL</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.MUL; }
-        }
-    }
-
-    /// <summary>DIV instruction: divides two values and pushes the floating-point result or integer quotient.</summary>
-    [DisplayName("DIV")]
-    [Category("Computational Instructions")]
-    [Description("Divides two values and pushes the result as a floating-point (type F) or quotient (type int32) onto the evaluation stack")]
-    public class DivInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.DIV</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.DIV; }
-        }
-    }
-
-    /// <summary>DIV_U instruction: divides two unsigned integer values and pushes the int32 result.</summary>
-    [DisplayName("DIV_U")]
-    [Category("Computational Instructions")]
-    [Description("Divides two unsigned integer values and pushes the result (int32) onto the evaluation stack")]
-    public class DivUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.DIV_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.DIV_U; }
-        }
-    }
-
-    /// <summary>REM instruction: divides two values and pushes the remainder.</summary>
-    [DisplayName("REM")]
-    [Category("Computational Instructions")]
-    [Description("Divides two values and pushes the remainder onto the evaluation stack")]
-    public class RemInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.REM</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.REM; }
-        }
-    }
-
-    /// <summary>REM_U instruction: divides two unsigned values and pushes the remainder.</summary>
-    [DisplayName("REM_U")]
-    [Category("Computational Instructions")]
-    [Description("Divides two unsigned values and pushes the remainder onto the evaluation stack")]
-    public class RemUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.REM_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.REM_U; }
-        }
-    }
-
-    /// <summary>AND instruction: computes the bitwise AND of two values and pushes the result.</summary>
-    [DisplayName("AND")]
-    [Category("Computational Instructions")]
-    [Description("Computes the bitwise AND of two values and pushes the result onto the evaluation stack")]
-    public class AndInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.AND</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.AND; }
-        }
-    }
-
-    /// <summary>OR instruction: computes the bitwise OR of the two integer values on top of the stack and pushes the result.</summary>
-    [DisplayName("OR")]
-    [Category("Computational Instructions")]
-    // Description previously said "bitwise complement", which describes NOT rather than OR.
-    [Description("Computes the bitwise OR of the two integer values on top of the stack and pushes the result onto the evaluation stack")]
-    public class OrInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.OR</c>.</summary>
-        public override OpCode OpCode => OpCode.OR;
-    }
-
-    /// <summary>XOR instruction: computes the bitwise XOR of the top two stack values and pushes the result.</summary>
-    [DisplayName("XOR")]
-    [Category("Computational Instructions")]
-    [Description("Computes the bitwise XOR of the top two values on the evaluation stack, pushing the result onto the evaluation stack")]
-    public class XorInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.XOR</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.XOR; }
-        }
-    }
-
-    /// <summary>NOT instruction: computes the bitwise complement of the integer on top of the stack and pushes the result.</summary>
-    [DisplayName("NOT")]
-    [Category("Computational Instructions")]
-    [Description("Computes the bitwise complement of the integer value on top of the stack and pushes the result onto the evaluation stack as the same type")]
-    public class NotInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.NOT</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.NOT; }
-        }
-    }
-
-    /// <summary>SHL instruction: shifts an integer value left by a number of bits, filling with zeroes.</summary>
-    [DisplayName("SHL")]
-    [Category("Computational Instructions")]
-    [Description("Shifts an integer value to the left (in zeroes) by a specified number of bits, pushing the result onto the evaluation stack")]
-    public class ShlInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.SHL</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.SHL; }
-        }
-    }
-
-    /// <summary>SHR instruction: shifts an integer value right by a number of bits, filling with the sign.</summary>
-    [DisplayName("SHR")]
-    [Category("Computational Instructions")]
-    [Description("Shifts an integer value (in sign) to the right by a specified number of bits, pushing the result onto the evaluation stack")]
-    public class ShrInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.SHR</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.SHR; }
-        }
-    }
-
-    /// <summary>SHR_U instruction: shifts an unsigned integer value right by a number of bits, filling with zeroes.</summary>
-    [DisplayName("SHR_U")]
-    [Category("Computational Instructions")]
-    [Description("Shifts an unsigned integer value (in zeroes) to the right by a specified number of bits, pushing the result onto the evaluation stack")]
-    public class ShrUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.SHR_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.SHR_U; }
-        }
-    }
-
-    /// <summary>CLT instruction: pushes 1 (int32) if the first value is less than the second, otherwise 0.</summary>
-    [DisplayName("CLT")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If the first value is less than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CltInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CLT</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CLT; }
-        }
-    }
-
-    /// <summary>CLT_U instruction: unsigned or unordered less-than comparison; pushes 1 (int32) or 0.</summary>
-    [DisplayName("CLT_U")]
-    [Category("Computational Instructions")]
-    [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CltUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CLT_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CLT_U; }
-        }
-    }
-
-    /// <summary>CLE instruction: pushes 1 (int32) if the first value is less than or equal to the second, otherwise 0.</summary>
-    [DisplayName("CLE")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If the first value is less than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CleInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CLE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CLE; }
-        }
-    }
-
-    /// <summary>CLE_U instruction: unsigned or unordered less-than-or-equal comparison; pushes 1 (int32) or 0.</summary>
-    [DisplayName("CLE_U")]
-    [Category("Computational Instructions")]
-    [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CleUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CLE_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CLE_U; }
-        }
-    }
-
-    /// <summary>CEQ instruction: pushes 1 (int32) if the two values are equal, otherwise 0.</summary>
-    [DisplayName("CEQ")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If they are equal, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CeqInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CEQ</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CEQ; }
-        }
-    }
-
-    /// <summary>CGE instruction: pushes 1 (int32) if the first value is greater than or equal to the second, otherwise 0.</summary>
-    [DisplayName("CGE")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If the first value is greater than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CgeInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CGE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CGE; }
-        }
-    }
-
-    /// <summary>CGE_U instruction: unsigned or unordered greater-than-or-equal comparison; pushes 1 (int32) or 0.</summary>
-    [DisplayName("CGE_U")]
-    [Category("Computational Instructions")]
-    [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CgeUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CGE_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CGE_U; }
-        }
-    }
-
-    /// <summary>CGT instruction: pushes 1 (int32) if the first value is greater than the second, otherwise 0.</summary>
-    [DisplayName("CGT")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If the first value is greater than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CgtInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CGT</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CGT; }
-        }
-    }
-
-    /// <summary>CGT_U instruction: unsigned or unordered greater-than comparison; pushes 1 (int32) or 0.</summary>
-    [DisplayName("CGT_U")]
-    [Category("Computational Instructions")]
-    [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CgtUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CGT_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CGT_U; }
-        }
-    }
-
-    /// <summary>CNE instruction: pushes 1 (int32) if the first value is not equal to the second, otherwise 0.</summary>
-    [DisplayName("CNE")]
-    [Category("Computational Instructions")]
-    [Description("Compares two values. If the first value is not equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
-    public class CneInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CNE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CNE; }
-        }
-    }
-
-    /// <summary>CONV_I1 instruction: converts the top stack value to int8 and extends it to int32.</summary>
-    [DisplayName("CONV_I1")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to int8, and extends it to int32")]
-    public class ConvI1Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_I1</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_I1; }
-        }
-    }
-
-    /// <summary>CONV_I2 instruction: converts the top stack value to int16 and extends it to int32.</summary>
-    [DisplayName("CONV_I2")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to int16, and extends it to int32")]
-    public class ConvI2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_I2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_I2; }
-        }
-    }
-
-    /// <summary>CONV_I4 instruction: converts the top stack value to int32.</summary>
-    [DisplayName("CONV_I4")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to int32, and extends it to int32")]
-    public class ConvI4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_I4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_I4; }
-        }
-    }
-
-    /// <summary>CONV_I instruction: converts the top stack value to native int and extends it to int32.</summary>
-    [DisplayName("CONV_I")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to native int, and extends it to int32")]
-    public class ConvIInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_I</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_I; }
-        }
-    }
-
-    /// <summary>CONV_U1 instruction: converts the top stack value to unsigned int8 and extends it to int32.</summary>
-    [DisplayName("CONV_U1")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to unsigned int8, and extends it to int32")]
-    public class ConvU1Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_U1</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_U1; }
-        }
-    }
-
-    /// <summary>CONV_U2 instruction: converts the top stack value to unsigned int16 and extends it to int32.</summary>
-    [DisplayName("CONV_U2")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to unsigned int16, and extends it to int32")]
-    public class ConvU2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_U2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_U2; }
-        }
-    }
-
-    /// <summary>CONV_U4 instruction: converts the top stack value to unsigned int32 and extends it to int32.</summary>
-    [DisplayName("CONV_U4")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to unsigned int32, and extends it to int32")]
-    public class ConvU4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_U4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_U4; }
-        }
-    }
-
-    /// <summary>CONV_U instruction: converts the top stack value to unsigned native int and extends it to int32.</summary>
-    [DisplayName("CONV_U")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to unsigned native int, and extends it to int32")]
-    public class ConvUInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_U</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_U; }
-        }
-    }
-
-    /// <summary>CONV_BR2 instruction: converts the top stack value to bfloat16.</summary>
-    [DisplayName("CONV_BR2")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to bfloat16")]
-    public class ConvBR2Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_BR2</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_BR2; }
-        }
-    }
-
-    /// <summary>CONV_R4 instruction: converts the top stack value to float32.</summary>
-    [DisplayName("CONV_R4")]
-    [Category("Conversion Instructions")]
-    [Description("Converts the value on top of the evaluation stack to float32")]
-    public class ConvR4Instruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CONV_R4</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CONV_R4; }
-        }
-    }
-
-    /// <summary>BR instruction: unconditionally transfers control to a target instruction.</summary>
-    [DisplayName("BR")]
-    [Category("Control and Status Instructions")]
-    [Description("Unconditionally transfers control to a target instruction")]
-    public class BrInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.BR</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.BR; }
-        }
-
-        /// <summary>Gets or sets the offset of the branch target instruction.</summary>
-        [DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
-        public int Target { get; set; }
-    }
-
-    /// <summary>BR_TRUE instruction: transfers control to a target instruction if value is true, not null, or non-zero.</summary>
-    [DisplayName("BR_TRUE")]
-    [Category("Control and Status Instructions")]
-    [Description("Transfers control to a target instruction if value is true, not null, or non-zero")]
-    public class BrTrueInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.BR_TRUE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.BR_TRUE; }
-        }
-
-        /// <summary>Gets or sets the offset of the branch target instruction.</summary>
-        [DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
-        public int Target { get; set; }
-    }
-
-    /// <summary>BR_FALSE instruction: transfers control to a target instruction if value is false, null, or zero.</summary>
-    [DisplayName("BR_FALSE")]
-    [Category("Control and Status Instructions")]
-    [Description("Transfers control to a target instruction if value is false, null, or zero")]
-    public class BrFalseInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.BR_FALSE</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.BR_FALSE; }
-        }
-
-        /// <summary>Gets or sets the offset of the branch target instruction.</summary>
-        [DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
-        public int Target { get; set; }
-    }
-
-    /// <summary>RET instruction: return.</summary>
-    [DisplayName("RET")]
-    [Category("Control and Status Instructions")]
-    [Description("Return")]
-    public class RetInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.RET</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.RET; }
-        }
-    }
-
-    /// <summary>CALL instruction: call a target method.</summary>
-    [DisplayName("CALL")]
-    [Category("Control and Status Instructions")]
-    [Description("Call a target method")]
-    public class CallInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.CALL</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.CALL; }
-        }
-
-        /// <summary>Gets or sets the arguments count.</summary>
-        [DisplayName("args"), Description("Arguments count")]
-        public byte ArgsCount { get; set; }
-
-        /// <summary>Gets or sets the offset of the target method.</summary>
-        [DisplayName("target"), Description("Call a target method at the specified offset")]
-        public int Target { get; set; }
-    }
-
-    /// <summary>ECALL instruction: call an environment method.</summary>
-    [DisplayName("ECALL")]
-    [Category("Control and Status Instructions")]
-    // Article corrected from "a environment" to "an environment".
-    [Description("Call an environment method")]
-    public class ECallInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.ECALL</c>.</summary>
-        public override OpCode OpCode => OpCode.ECALL;
-
-        /// <summary>Gets or sets the arguments count.</summary>
-        [DisplayName("args")]
-        [Description("Arguments count")]
-        public byte ArgsCount { get; set; }
-    }
-
-    /// <summary>THROW instruction: throw an error code currently on the evaluation stack.</summary>
-    [DisplayName("THROW")]
-    [Category("Control and Status Instructions")]
-    // Article corrected from "a error code" to "an error code".
-    [Description("Throw an error code currently on the evaluation stack")]
-    public class ThrowInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.THROW</c>.</summary>
-        public override OpCode OpCode => OpCode.THROW;
-    }
-
-    /// <summary>BREAK instruction: inform the debugger that a break point has been tripped.</summary>
-    [DisplayName("BREAK")]
-    [Category("Control and Status Instructions")]
-    [Description("Inform the debugger that a break point has been tripped")]
-    public class BreakInstruction : Instruction
-    {
-        /// <summary>Gets the opcode of this instruction, <c>OpCode.BREAK</c>.</summary>
-        public override OpCode OpCode
-        {
-            get { return OpCode.BREAK; }
-        }
-    }
-
-    public static class TensorCalls
-    {
-        /// <summary>
-        /// Base class of the tensor call instructions: the opcode is fixed to
-        /// <c>OpCode.TENSOR</c> and each subclass supplies its tensor function.
-        /// </summary>
-        public abstract class TensorInstruction : Instruction
-        {
-            /// <summary>Gets the opcode, always <c>OpCode.TENSOR</c>; sealed so subclasses cannot override it.</summary>
-            public sealed override OpCode OpCode
-            {
-                get { return OpCode.TENSOR; }
-            }
-
-            /// <summary>Gets the tensor call function of the concrete instruction.</summary>
-            [DisplayName("funct"), Description("Tensor call function")]
-            public abstract TensorFunction Function { get; }
-        }
-
-        /// <summary>TENSOR.BATCH_TO_SPACE tensor call instruction.</summary>
-        [DisplayName("TENSOR.BATCH_TO_SPACE")]
-        [Category("Tensor Instructions")]
-        [Description("BatchToSpace")]
-        public class BatchToSpaceInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.BATCH_TO_SPACE</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.BATCH_TO_SPACE; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source shape register.</summary>
-            [DisplayName("rshape_src"), Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            /// <summary>Gets or sets the source stride register.</summary>
-            [DisplayName("rstride_src"), Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            /// <summary>Gets or sets the block shape register.</summary>
-            [DisplayName("rshape_block"), Description("Block shape register")]
-            public byte RshapeBlock { get; set; }
-
-            /// <summary>Gets or sets the crops paddings register.</summary>
-            [DisplayName("rpad_crops"), Description("Crops paddings register")]
-            public byte RpadCrops { get; set; }
-        }
-
-        /// <summary>TENSOR.BROADCAST tensor call instruction.</summary>
-        [DisplayName("TENSOR.BROADCAST")]
-        [Category("Tensor Instructions")]
-        [Description("Broadcast")]
-        public class BroadcastInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.BROADCAST</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.BROADCAST; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source shape register.</summary>
-            [DisplayName("rshape_src"), Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            /// <summary>Gets or sets the source stride register.</summary>
-            [DisplayName("rstride_src"), Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            /// <summary>Gets or sets the dest shape register.</summary>
-            [DisplayName("rshape_dest"), Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        /// <summary>TENSOR.BINARY tensor call instruction.</summary>
-        [DisplayName("TENSOR.BINARY")]
-        [Category("Tensor Instructions")]
-        [Description("Binary")]
-        public class BinaryInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.BINARY</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.BINARY; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source1 shape register.</summary>
-            [DisplayName("rshape_src1"), Description("Source1 shape register")]
-            public byte RshapeSrc1 { get; set; }
-
-            /// <summary>Gets or sets the source1 stride register.</summary>
-            [DisplayName("rstride_src1"), Description("Source1 stride register")]
-            public byte RstrideSrc1 { get; set; }
-
-            /// <summary>Gets or sets the source2 shape register.</summary>
-            [DisplayName("rshape_src2"), Description("Source2 shape register")]
-            public byte RshapeSrc2 { get; set; }
-
-            /// <summary>Gets or sets the source2 stride register.</summary>
-            [DisplayName("rstride_src2"), Description("Source2 stride register")]
-            public byte RstrideSrc2 { get; set; }
-
-            /// <summary>Gets or sets the dest shape register.</summary>
-            [DisplayName("rshape_dest"), Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            /// <summary>Gets or sets the binary operator.</summary>
-            [DisplayName("binary_op"), Description("Binary operator")]
-            public BinaryOp BinaryOp { get; set; }
-
-            /// <summary>Gets or sets the fused clamp low value.</summary>
-            [DisplayName("fused_clamp_low"), Description("FusedClampLow")]
-            public float FusedClampLow { get; set; }
-
-            /// <summary>Gets or sets the fused clamp high value.</summary>
-            [DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
-            public float FusedClampHigh { get; set; }
-        }
-
-        /// <summary>TENSOR.CALL tensor call instruction.</summary>
-        [DisplayName("TENSOR.CALL")]
-        [Category("Tensor Instructions")]
-        [Description("Call")]
-        public class CallInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.CALL</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.CALL; }
-            }
-
-            /// <summary>Gets or sets the function id.</summary>
-            [DisplayName("function_id"), Description("Function Id")]
-            public uint FunctionId { get; set; }
-
-            /// <summary>Gets or sets the module id.</summary>
-            [DisplayName("module_id"), Description("Module Id")]
-            public ushort ModuleId { get; set; }
-
-            /// <summary>Gets or sets the source count.</summary>
-            [DisplayName("num_src"), Description("Source count")]
-            public byte SrcCount { get; set; }
-
-            /// <summary>Gets or sets the dest count.</summary>
-            [DisplayName("num_dst"), Description("Dest count")]
-            public byte DstCount { get; set; }
-        }
-
-        /// <summary>TENSOR.COMPARE tensor call instruction.</summary>
-        [DisplayName("TENSOR.COMPARE")]
-        [Category("Tensor Instructions")]
-        [Description("Compare")]
-        public class CompareInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.COMPARE</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.COMPARE; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source1 shape register.</summary>
-            [DisplayName("rshape_src1"), Description("Source1 shape register")]
-            public byte RshapeSrc1 { get; set; }
-
-            /// <summary>Gets or sets the source1 stride register.</summary>
-            [DisplayName("rstride_src1"), Description("Source1 stride register")]
-            public byte RstrideSrc1 { get; set; }
-
-            /// <summary>Gets or sets the source2 shape register.</summary>
-            [DisplayName("rshape_src2"), Description("Source2 shape register")]
-            public byte RshapeSrc2 { get; set; }
-
-            /// <summary>Gets or sets the source2 stride register.</summary>
-            [DisplayName("rstride_src2"), Description("Source2 stride register")]
-            public byte RstrideSrc2 { get; set; }
-
-            /// <summary>Gets or sets the dest shape register.</summary>
-            [DisplayName("rshape_dest"), Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            /// <summary>Gets or sets the compare operator.</summary>
-            [DisplayName("compare_op"), Description("Compare operator")]
-            public CompareOp CompareOp { get; set; }
-        }
-        /// <summary>TENSOR.CONV2D tensor call instruction.</summary>
-        [DisplayName("TENSOR.CONV2D")]
-        [Category("Tensor Instructions")]
-        [Description("Conv2D")]
-        public class Conv2DInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.CONV2D</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.CONV2D; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source shape register.</summary>
-            [DisplayName("rshape_src"), Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            /// <summary>Gets or sets the source stride register.</summary>
-            [DisplayName("rstride_src"), Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            /// <summary>Gets or sets the kernel shape register.</summary>
-            [DisplayName("rshape_kernel"), Description("Kernel shape register")]
-            public byte RshapeKernel { get; set; }
-
-            /// <summary>Gets or sets the kernel stride register.</summary>
-            [DisplayName("rstride_kernel"), Description("Kernel stride register")]
-            public byte RstrideKernel { get; set; }
-
-            /// <summary>Gets or sets the bias stride register.</summary>
-            [DisplayName("rstride_bias"), Description("Bias stride register")]
-            public byte RstrideBias { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            /// <summary>Gets or sets the groups value.</summary>
-            [DisplayName("groups"), Description("Groups")]
-            public ushort Groups { get; set; }
-
-            /// <summary>Gets or sets the stride_h value.</summary>
-            [DisplayName("stride_h"), Description("StrideH")]
-            public ushort StrideH { get; set; }
-
-            /// <summary>Gets or sets the stride_w value.</summary>
-            [DisplayName("stride_w"), Description("StrideW")]
-            public ushort StrideW { get; set; }
-
-            /// <summary>Gets or sets the dilation_h value.</summary>
-            [DisplayName("dilation_h"), Description("DilationH")]
-            public ushort DilationH { get; set; }
-
-            /// <summary>Gets or sets the dilation_w value.</summary>
-            [DisplayName("dilation_w"), Description("DilationW")]
-            public ushort DilationW { get; set; }
-
-            /// <summary>Gets or sets the fused clamp low value.</summary>
-            [DisplayName("fused_clamp_low"), Description("FusedClampLow")]
-            public float FusedClampLow { get; set; }
-
-            /// <summary>Gets or sets the fused clamp high value.</summary>
-            [DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
-            public float FusedClampHigh { get; set; }
-        }
-
-        /// <summary>TENSOR.COPY tensor call instruction.</summary>
-        [DisplayName("TENSOR.COPY")]
-        [Category("Tensor Instructions")]
-        [Description("Copy")]
-        public class CopyInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.COPY</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.COPY; }
-            }
-
-            /// <summary>Gets or sets the datatype.</summary>
-            [DisplayName("datatype"), Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the shape register.</summary>
-            [DisplayName("rshape"), Description("Shape register")]
-            public byte Rshape { get; set; }
-
-            /// <summary>Gets or sets the source stride register.</summary>
-            [DisplayName("rstride_src"), Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest"), Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        /// <summary>TENSOR.CONVERT tensor call instruction.</summary>
-        [DisplayName("TENSOR.CONVERT")]
-        [Category("Tensor Instructions")]
-        [Description("Convert")]
-        public class ConvertInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.CONVERT</c>.</summary>
-            public override TensorFunction Function => TensorFunction.CONVERT;
-
-            /// <summary>Gets or sets the source datatype.</summary>
-            [DisplayName("in_datatype")]
-            [Description("Source Datatype")]
-            public DataType SrcDataType { get; set; }
-
-            /// <summary>Gets or sets the dest datatype.</summary>
-            [DisplayName("dst_datatype")]
-            [Description("Dest Datatype")]
-            public DataType DestDataType { get; set; }
-
-            /// <summary>Gets or sets the source shape register.</summary>
-            // Description previously read "Source1 shape register"; this instruction has a single source.
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            /// <summary>Gets or sets the source stride register.</summary>
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            /// <summary>Gets or sets the dest stride register.</summary>
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        /// <summary>TENSOR.CUMSUM tensor call instruction.</summary>
-        [DisplayName("TENSOR.CUMSUM")]
-        [Category("Tensor Instructions")]
-        [Description("CumSum")]
-        public class CumSumInstruction : TensorInstruction
-        {
-            /// <summary>Gets the tensor function, <c>TensorFunction.CUMSUM</c>.</summary>
-            public override TensorFunction Function
-            {
-                get { return TensorFunction.CUMSUM; }
-            }
-
-            /// <summary>Gets or sets the input/output datatype.</summary>
-            [DisplayName("datatype"), Description("Input/Output datatype")]
-            public DataType DataType { get; set; }
-
-            /// <summary>Gets or sets the source shape register.</summary>
-            [DisplayName("rshape_src"), Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            /// <summary>Gets or sets the axis.</summary>
-            [DisplayName("axis"), Description("Axis")]
-            public int Axis { get; set; }
-
-            /// <summary>Gets or sets the exclusive flag.</summary>
-            [DisplayName("exclusive"), Description("Exclusive")]
-            public bool Exclusive { get; set; }
-
-            /// <summary>Gets or sets the reverse flag.</summary>
-            [DisplayName("reverse"), Description("Reverse")]
-            public bool Reverse { get; set; }
-        }
-
-        [DisplayName("TENSOR.DEQUANTIZE")]
-        [Category("Tensor Instructions")]
-        [Description("Dequantize")]
-        public class DequantizeInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.DEQUANTIZE;
-
-            [DisplayName("in_datatype")]
-            [Description("Source Datatype")]
-            public DataType SrcDataType { get; set; }
-
-            [DisplayName("dst_datatype")]
-            [Description("Dest Datatype")]
-            public DataType DestDataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        [DisplayName("TENSOR.GATHER")]
-        [Category("Tensor Instructions")]
-        [Description("Gather")]
-        public class GatherInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.GATHER;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("rshape_indices")]
-            [Description("Indices shape register")]
-            public byte RshapeIndices { get; set; }
-
-            [DisplayName("axis")]
-            [Description("Axis")]
-            public byte Axis { get; set; }
-        }
-
-        [DisplayName("TENSOR.GATHER_ND")]
-        [Category("Tensor Instructions")]
-        [Description("GatherND")]
-        public class GatherNDInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.GATHER_ND;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("rshape_indices")]
-            [Description("Indices shape register")]
-            public byte RshapeIndices { get; set; }
-
-            [DisplayName("batch_dims")]
-            [Description("Batch Dims")]
-            public byte Batchdims { get; set; }
-        }
-
-        [DisplayName("TENSOR.HARDMAX")]
-        [Category("Tensor Instructions")]
-        [Description("Hardmax")]
-        public class HardmaxInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.HARDMAX;
-
-            [DisplayName("datatype")]
-            [Description("Input/Output datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("axis")]
-            [Description("Axis")]
-            public int Axis { get; set; }
-        }
-
-        [DisplayName("TENSOR.LUT1D")]
-        [Category("Tensor Instructions")]
-        [Description("Lut1D")]
-        public class LUT1DInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.LUT1D;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("table_len")]
-            [Description("Table length")]
-            public ushort TableLength { get; set; }
-        }
-
-        [DisplayName("TENSOR.MATMUL")]
+	[System.AttributeUsage(AttributeTargets.Enum, Inherited = false, AllowMultiple = false)]
+	public sealed class BitLengthAttribute : Attribute
+	{
+		public uint BitLength { get; }
+
+		public BitLengthAttribute(uint bitLength)
+		{
+			BitLength = bitLength;
+		}
+	}
+
+	[System.AttributeUsage(AttributeTargets.All, Inherited = false, AllowMultiple = false)]
+	public sealed class EnumNameAttribute : Attribute
+	{
+		public string Name { get; }
+
+		public EnumNameAttribute(string name)
+		{
+			Name = name;
+		}
+	}
+
+	[BitLength(8)]
+	[EnumName("opcode_t")]
+	public enum OpCode
+	{
+		NOP,
+		LDNULL,
+		LDC_I4,
+		LDC_I4_0,
+		LDC_I4_1,
+		LDC_R4,
+		LDIND_I1,
+		LDIND_I2,
+		LDIND_I4,
+		LDIND_I,
+		LDIND_U1,
+		LDIND_U2,
+		LDIND_U4,
+		LDIND_U,
+		LDIND_BR2,
+		LDIND_R4,
+		STIND_I1,
+		STIND_I2,
+		STIND_I4,
+		STIND_I,
+		STIND_BR2,
+		STIND_R4,
+		LEA_GP,
+		LEA_BUFFER,
+
+		LDELEM_I1,
+		LDELEM_I2,
+		LDELEM_I4,
+		LDELEM_I,
+		LDELEM_U1,
+		LDELEM_U2,
+		LDELEM_U4,
+		LDELEM_U,
+		LDELEM_BR2,
+		LDELEM_R4,
+		STELEM_I1,
+		STELEM_I2,
+		STELEM_I4,
+		STELEM_I,
+		STELEM_BR2,
+		STELEM_R4,
+
+		LDARG,
+		LDARG_0,
+		LDARG_1,
+		LDARG_2,
+		LDARG_3,
+		LDARG_4,
+		LDARG_5,
+
+		DUP,
+		POP,
+
+		STSHAPE,
+		STPADDINGS,
+
+		NEG,
+		ADD,
+		SUB,
+		MUL,
+		DIV,
+		DIV_U,
+		REM,
+		REM_U,
+		AND,
+		OR,
+		XOR,
+		NOT,
+		SHL,
+		SHR,
+		SHR_U,
+
+		CLT,
+		CLT_U,
+		CLE,
+		CLE_U,
+		CEQ,
+		CGE,
+		CGE_U,
+		CGT,
+		CGT_U,
+		CNE,
+
+		CONV_I1,
+		CONV_I2,
+		CONV_I4,
+		CONV_I,
+		CONV_U1,
+		CONV_U2,
+		CONV_U4,
+		CONV_U,
+		CONV_BR2,
+		CONV_R4,
+
+		BR,
+		BR_TRUE,
+		BR_FALSE,
+		RET,
+		CALL,
+		ECALL,
+		THROW,
+		BREAK,
+
+		TENSOR,
+	}
+
+	[BitLength(16)]
+	[EnumName("tensor_function_t")]
+	public enum TensorFunction
+	{
+		BATCH_TO_SPACE,
+		BINARY,
+		BROADCAST,
+		CALL,
+		COMPARE,
+		CLAMP,
+		CONV2D,
+		CONV2D_TRANSPOSE,
+		CONVERT,
+		COPY,
+		CUMSUM,
+		DEQUANTIZE,
+		GATHER,
+		GATHER_ND,
+		HARDMAX,
+		LOGISTIC,
+		LUT1D,
+		MATMUL,
+		ONEHOT,
+		PAD,
+		QUANTIZE,
+		RANDOM_NORMAL,
+		RANDOM_UNIFORM,
+		REDUCE,
+		REDUCE_ARG,
+		REDUCE_PROD,
+		REDUCE_WINDOW2D,
+		RESIZE_IMAGE,
+		ROI_ALIGN,
+		SIGMOID,
+		SLICE,
+		SOFTMAX,
+		SPACE_TO_BATCH,
+		TAKE,
+		TERNARY,
+		TOPK,
+		TRANSPOSE,
+		TRILU,
+		UNARY,
+		GRU,
+		TFLITE_DETECTION_POSTPROCESS,
+		LAYER_NORMALIZATION,
+		COMPRESS,
+		GATHER_ELEMENTS,
+		INSTANCE_NORMALIZATION
+	}
+
+	[BitLength(8)]
+	[EnumName("datatype_t")]
+	[Browsable(false)]
+	public enum DataType
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("onehot_mode_t")]
+	[Browsable(false)]
+	public enum OneHotMode
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("pad_mode_t")]
+	[Browsable(false)]
+	public enum PadMode
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("memory_location_t")]
+	[Browsable(false)]
+	public enum MemoryLocation
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("reduce_op_t")]
+	[Browsable(false)]
+	public enum ReduceOp
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("reduce_arg_op_t")]
+	[Browsable(false)]
+	public enum ReduceArgOp
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("image_resize_mode_t")]
+	[Browsable(false)]
+	public enum ImageResizeMode
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("binary_op_t")]
+	[Browsable(false)]
+	public enum BinaryOp
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("unary_op_t")]
+	[Browsable(false)]
+	public enum UnaryOp
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("compare_op_t")]
+	[Browsable(false)]
+	public enum CompareOp
+	{
+	}
+
+	[BitLength(8)]
+	[EnumName("roi_align_mode_t")]
+	[Browsable(false)]
+	public enum RoiAlignMode
+	{
+	}
+
+	public abstract class Instruction
+	{
+		[DisplayName("opcode")]
+		[Description("OpCode")]
+		public abstract OpCode OpCode { get; }
+	}
+
+	[DisplayName("NOP")]
+	[Category("Control and Status Instructions")]
+	[Description("No operation")]
+	public class NopInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.NOP;
+	}
+
+	[DisplayName("LDC_I4")]
+	[Category("Immediate Instructions")]
+	[Description("Load immediate I4 to stack")]
+	public class LdcI4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDC_I4;
+
+		[DisplayName("imm")]
+		[Description("Immediate I4")]
+		public int Imm { get; set; }
+	}
+
+	[DisplayName("LDNULL")]
+	[Category("Immediate Instructions")]
+	[Description("Load immediate nullptr as I to stack")]
+	public class LdNullInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDNULL;
+	}
+
+	[DisplayName("LDC_I4_0")]
+	[Category("Immediate Instructions")]
+	[Description("Load immediate 0 as I4 to stack")]
+	public class LdcI4_0Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDC_I4_0;
+	}
+
+	[DisplayName("LDC_I4_1")]
+	[Category("Immediate Instructions")]
+	[Description("Load immediate 1 as I4 to stack")]
+	public class LdcI4_1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDC_I4_1;
+	}
+
+	[DisplayName("LDC_R4")]
+	[Category("Immediate Instructions")]
+	[Description("Load immediate R4 to stack")]
+	public class LdcR4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDC_R4;
+
+		[DisplayName("imm")]
+		[Description("Immediate R4")]
+		public float Imm { get; set; }
+	}
+
+	[Category("Load Store Instructions")]
+	public abstract class LdStindInstruction : Instruction
+	{
+	}
+
+	[DisplayName("LDIND_I1")]
+	[Description("Load indirect I1 to stack")]
+	public class LdindI1Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_I1;
+	}
+
+	[DisplayName("LDIND_I2")]
+	[Description("Load indirect I2 to stack")]
+	public class LdindI2Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_I2;
+	}
+
+	[DisplayName("LDIND_I4")]
+	[Description("Load indirect I4 to stack")]
+	public class LdindI4Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_I4;
+	}
+
+	[DisplayName("LDIND_I")]
+	[Description("Load indirect I to stack")]
+	public class LdindIInstruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_I;
+	}
+
+	[DisplayName("LDIND_U1")]
+	[Description("Load indirect U1 to stack")]
+	public class LdindU1Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_U1;
+	}
+
+	[DisplayName("LDIND_U2")]
+	[Description("Load indirect U2 to stack")]
+	public class LdindU2Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_U2;
+	}
+
+	[DisplayName("LDIND_U4")]
+	[Description("Load indirect U4 to stack")]
+	public class LdindU4Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_U4;
+	}
+
+	[DisplayName("LDIND_U")]
+	[Description("Load indirect U to stack")]
+	public class LdindUInstruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_U;
+	}
+
+	[DisplayName("LDIND_BR2")]
+	[Description("Load indirect BR2 to stack")]
+	public class LdindBR2Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_BR2;
+	}
+
+	[DisplayName("LDIND_R4")]
+	[Description("Load indirect R4 to stack")]
+	public class LdindR4Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.LDIND_R4;
+	}
+
+	[DisplayName("STIND_I1")]
+	[Description("Store indirect I1 from stack")]
+	public class StindI1Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_I1;
+	}
+
+	[DisplayName("STIND_I2")]
+	[Description("Store indirect I2 from stack")]
+	public class StindI2Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_I2;
+	}
+
+	[DisplayName("STIND_I4")]
+	[Description("Store indirect I4 from stack")]
+	public class StindI4Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_I4;
+	}
+
+	[DisplayName("STIND_I")]
+	[Description("Store indirect I from stack")]
+	public class StindIInstruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_I;
+	}
+
+	[DisplayName("STIND_BR2")]
+	[Description("Store indirect BR2 from stack")]
+	public class StindBR2Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_BR2;
+	}
+
+	[DisplayName("STIND_R4")]
+	[Description("Store indirect R4 from stack")]
+	public class StindR4Instruction : LdStindInstruction
+	{
+		public override OpCode OpCode => OpCode.STIND_R4;
+	}
+
+	[DisplayName("LEA_GP")]
+	[Category("Load Store Instructions")]
+	[Description("Load a global pointer with offset to stack")]
+	public class LeaGPInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LEA_GP;
+
+		[DisplayName("gpid")]
+		[Description("Global pointer id")]
+		public byte GpId { get; set; }
+
+		[DisplayName("offset")]
+		[Description("Signed immediate offset")]
+		public int Offset { get; set; }
+	}
+
+	[DisplayName("LEA_BUFFER")]
+	[Category("Load Store Instructions")]
+	[Description("Load a buffer pointer with offset to stack")]
+	public class LeaBufferInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LEA_BUFFER;
+
+		[DisplayName("location")]
+		[Description("Location")]
+		public MemoryLocation Location { get; set; }
+
+		[DisplayName("subres_id")]
+		[Description("SubresourceId")]
+		public byte SubresourceId { get; set; }
+
+		[DisplayName("offset")]
+		[Description("Unsigned immediate offset")]
+		public uint Offset { get; set; }
+	}
+
+	[DisplayName("LDELEM_I1")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of I1 to stack")]
+	public class LdelemI1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_I1;
+	}
+
+	[DisplayName("LDELEM_I2")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of I2 to stack")]
+	public class LdelemI2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_I2;
+	}
+
+	[DisplayName("LDELEM_I4")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of I4 to stack")]
+	public class LdelemI4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_I4;
+	}
+
+	[DisplayName("LDELEM_I")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of I to stack")]
+	public class LdelemIInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_I;
+	}
+
+	[DisplayName("LDELEM_U1")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of U1 to stack")]
+	public class LdelemU1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_U1;
+	}
+
+	[DisplayName("LDELEM_U2")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of U2 to stack")]
+	public class LdelemU2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_U2;
+	}
+
+	[DisplayName("LDELEM_U4")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of U4 to stack")]
+	public class LdelemU4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_U4;
+	}
+
+	[DisplayName("LDELEM_U")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of U to stack")]
+	public class LdelemUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_U;
+	}
+
+	[DisplayName("LDELEM_BR2")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of BR2 to stack")]
+	public class LdelemBR2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_BR2;
+	}
+
+	[DisplayName("LDELEM_R4")]
+	[Category("Load Store Instructions")]
+	[Description("Load an array element of R4 to stack")]
+	public class LdelemR4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDELEM_R4;
+	}
+
+	[DisplayName("STELEM_I1")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of I1 from stack")]
+	public class StelemI1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_I1;
+	}
+
+	[DisplayName("STELEM_I2")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of I2 from stack")]
+	public class StelemI2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_I2;
+	}
+
+	[DisplayName("STELEM_I4")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of I4 from stack")]
+	public class StelemI4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_I4;
+	}
+
+	[DisplayName("STELEM_I")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of I from stack")]
+	public class StelemIInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_I;
+	}
+
+	[DisplayName("STELEM_BR2")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of BR2 from stack")]
+	public class StelemBR2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_BR2;
+	}
+
+	[DisplayName("STELEM_R4")]
+	[Category("Load Store Instructions")]
+	[Description("Store an array element of R4 from stack")]
+	public class StelemR4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STELEM_R4;
+	}
+
+	[DisplayName("LDARG")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument to stack")]
+	public class LdargInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG;
+
+		[DisplayName("index")]
+		[Description("Argument index")]
+		public uint Index { get; set; }
+	}
+
+	[DisplayName("LDARG_0")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 0 to stack")]
+	public class Ldarg0Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_0;
+	}
+
+	[DisplayName("LDARG_1")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 1 to stack")]
+	public class Ldarg1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_1;
+	}
+
+	[DisplayName("LDARG_2")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 2 to stack")]
+	public class Ldarg2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_2;
+	}
+
+	[DisplayName("LDARG_3")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 3 to stack")]
+	public class Ldarg3Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_3;
+	}
+
+	[DisplayName("LDARG_4")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 4 to stack")]
+	public class Ldarg4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_4;
+	}
+
+	[DisplayName("LDARG_5")]
+	[Category("Load Store Instructions")]
+	[Description("Load an argument with index of 5 to stack")]
+	public class Ldarg5Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.LDARG_5;
+	}
+
+	[DisplayName("STSHAPE")]
+	[Category("Load Store Instructions")]
+	[Description("Store a shape from stack")]
+	public class StShapeInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STSHAPE;
+
+		[DisplayName("rshape")]
+		[Description("Shape register index")]
+		public byte Rshape { get; set; }
+
+		[DisplayName("rank")]
+		[Description("Shape's rank")]
+		public byte Rank { get; set; }
+	}
+
+	[DisplayName("STPADDINGS")]
+	[Category("Load Store Instructions")]
+	[Description("Store paddings from stack")]
+	public class StPaddingsInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.STPADDINGS;
+
+		[DisplayName("rpaddings")]
+		[Description("Paddings register index")]
+		public byte Rpaddings { get; set; }
+
+		[DisplayName("rank")]
+		[Description("Paddings' rank")]
+		public byte Rank { get; set; }
+	}
+
+	[DisplayName("DUP")]
+	[Category("Stack Instructions")]
+	[Description("Duplicate the top item of stack")]
+	public class DupInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.DUP;
+	}
+
+	[DisplayName("POP")]
+	[Category("Stack Instructions")]
+	[Description("Pop the top item of stack")]
+	public class PopInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.POP;
+	}
+
+	[DisplayName("NEG")]
+	[Category("Computational Instructions")]
+	[Description("Negates a value and pushes the result onto the evaluation stack")]
+	public class NegInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.NEG;
+	}
+
+	[DisplayName("ADD")]
+	[Category("Computational Instructions")]
+	[Description("Adds two values and pushes the result onto the evaluation stack")]
+	public class AddInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.ADD;
+	}
+
+	[DisplayName("SUB")]
+	[Category("Computational Instructions")]
+	[Description("Subtracts one value from another and pushes the result onto the evaluation stack")]
+	public class SubInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.SUB;
+	}
+
+	[DisplayName("MUL")]
+	[Category("Computational Instructions")]
+	[Description("Multiplies two values and pushes the result on the evaluation stack")]
+	public class MulInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.MUL;
+	}
+
+	[DisplayName("DIV")]
+	[Category("Computational Instructions")]
+	[Description("Divides two values and pushes the result as a floating-point (type F) or quotient (type int32) onto the evaluation stack")]
+	public class DivInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.DIV;
+	}
+
+	[DisplayName("DIV_U")]
+	[Category("Computational Instructions")]
+	[Description("Divides two unsigned integer values and pushes the result (int32) onto the evaluation stack")]
+	public class DivUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.DIV_U;
+	}
+
+	[DisplayName("REM")]
+	[Category("Computational Instructions")]
+	[Description("Divides two values and pushes the remainder onto the evaluation stack")]
+	public class RemInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.REM;
+	}
+
+	[DisplayName("REM_U")]
+	[Category("Computational Instructions")]
+	[Description("Divides two unsigned values and pushes the remainder onto the evaluation stack")]
+	public class RemUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.REM_U;
+	}
+
+	[DisplayName("AND")]
+	[Category("Computational Instructions")]
+	[Description("Computes the bitwise AND of two values and pushes the result onto the evaluation stack")]
+	public class AndInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.AND;
+	}
+
+	[DisplayName("OR")]
+	[Category("Computational Instructions")]
+	[Description("Computes the bitwise OR of the two integer values on top of the stack and pushes the result onto the evaluation stack")]
+	public class OrInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.OR;
+	}
+
+	[DisplayName("XOR")]
+	[Category("Computational Instructions")]
+	[Description("Computes the bitwise XOR of the top two values on the evaluation stack, pushing the result onto the evaluation stack")]
+	public class XorInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.XOR;
+	}
+
+	[DisplayName("NOT")]
+	[Category("Computational Instructions")]
+	[Description("Computes the bitwise complement of the integer value on top of the stack and pushes the result onto the evaluation stack as the same type")]
+	public class NotInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.NOT;
+	}
+
+	[DisplayName("SHL")]
+	[Category("Computational Instructions")]
+	[Description("Shifts an integer value to the left (in zeroes) by a specified number of bits, pushing the result onto the evaluation stack")]
+	public class ShlInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.SHL;
+	}
+
+	[DisplayName("SHR")]
+	[Category("Computational Instructions")]
+	[Description("Shifts an integer value (in sign) to the right by a specified number of bits, pushing the result onto the evaluation stack")]
+	public class ShrInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.SHR;
+	}
+
+	[DisplayName("SHR_U")]
+	[Category("Computational Instructions")]
+	[Description("Shifts an unsigned integer value (in zeroes) to the right by a specified number of bits, pushing the result onto the evaluation stack")]
+	public class ShrUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.SHR_U;
+	}
+
+	[DisplayName("CLT")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If the first value is less than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CltInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CLT;
+	}
+
+	[DisplayName("CLT_U")]
+	[Category("Computational Instructions")]
+	[Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CltUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CLT_U;
+	}
+
+	[DisplayName("CLE")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If the first value is less than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CleInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CLE;
+	}
+
+	[DisplayName("CLE_U")]
+	[Category("Computational Instructions")]
+	[Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CleUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CLE_U;
+	}
+
+	[DisplayName("CEQ")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If they are equal, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CeqInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CEQ;
+	}
+
+	[DisplayName("CGE")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If the first value is greater than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CgeInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CGE;
+	}
+
+	[DisplayName("CGE_U")]
+	[Category("Computational Instructions")]
+	[Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CgeUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CGE_U;
+	}
+
+	[DisplayName("CGT")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If the first value is greater than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CgtInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CGT;
+	}
+
+	[DisplayName("CGT_U")]
+	[Category("Computational Instructions")]
+	[Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CgtUInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CGT_U;
+	}
+
+	[DisplayName("CNE")]
+	[Category("Computational Instructions")]
+	[Description("Compares two values. If the first value is not equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")]
+	public class CneInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CNE;
+	}
+
+	[DisplayName("CONV_I1")]
+	[Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to int8, and extends it to int32")]
+	public class ConvI1Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CONV_I1;
+	}
+
+	[DisplayName("CONV_I2")]
+	[Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to int16, and extends it to int32")]
+	public class ConvI2Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CONV_I2;
+	}
+
+	[DisplayName("CONV_I4")]
+	[Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to int32, and extends it to int32")]
+	public class ConvI4Instruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CONV_I4;
+	}
+
+	[DisplayName("CONV_I")]
+	[Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to native int, and extends it to int32")]
+	public class ConvIInstruction : Instruction
+	{
+		public override OpCode OpCode => OpCode.CONV_I;
+	}
+
+	/// <summary>Instruction definition for the CONV_U1 opcode (conversion to unsigned int8).</summary>
+	[DisplayName("CONV_U1"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to unsigned int8, and extends it to int32")]
+	public class ConvU1Instruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_U1; } }
+	}
+
+	/// <summary>Instruction definition for the CONV_U2 opcode (conversion to unsigned int16).</summary>
+	[DisplayName("CONV_U2"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to unsigned int16, and extends it to int32")]
+	public class ConvU2Instruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_U2; } }
+	}
+
+	/// <summary>Instruction definition for the CONV_U4 opcode (conversion to unsigned int32).</summary>
+	[DisplayName("CONV_U4"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to unsigned int32, and extends it to int32")]
+	public class ConvU4Instruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_U4; } }
+	}
+
+	/// <summary>Instruction definition for the CONV_U opcode (conversion to unsigned native int).</summary>
+	[DisplayName("CONV_U"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to unsigned native int, and extends it to int32")]
+	public class ConvUInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_U; } }
+	}
+
+	/// <summary>Instruction definition for the CONV_BR2 opcode (conversion to bfloat16).</summary>
+	[DisplayName("CONV_BR2"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to bfloat16")]
+	public class ConvBR2Instruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_BR2; } }
+	}
+
+	/// <summary>Instruction definition for the CONV_R4 opcode (conversion to float32).</summary>
+	[DisplayName("CONV_R4"), Category("Conversion Instructions")]
+	[Description("Converts the value on top of the evaluation stack to float32")]
+	public class ConvR4Instruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CONV_R4; } }
+	}
+
+	/// <summary>Instruction definition for the BR opcode (unconditional branch).</summary>
+	[DisplayName("BR"), Category("Control and Status Instructions")]
+	[Description("Unconditionally transfers control to a target instruction")]
+	public class BrInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.BR; } }
+
+		/// <summary>Offset of the instruction to branch to.</summary>
+		[DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
+		public int Target { get; set; }
+	}
+
+	/// <summary>Instruction definition for the BR_TRUE opcode (branch when the value is truthy).</summary>
+	[DisplayName("BR_TRUE"), Category("Control and Status Instructions")]
+	[Description("Transfers control to a target instruction if value is true, not null, or non-zero")]
+	public class BrTrueInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.BR_TRUE; } }
+
+		/// <summary>Offset of the instruction to branch to.</summary>
+		[DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
+		public int Target { get; set; }
+	}
+
+	/// <summary>Instruction definition for the BR_FALSE opcode (branch when the value is falsy).</summary>
+	[DisplayName("BR_FALSE"), Category("Control and Status Instructions")]
+	[Description("Transfers control to a target instruction if value is false, null, or zero")]
+	public class BrFalseInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.BR_FALSE; } }
+
+		/// <summary>Offset of the instruction to branch to.</summary>
+		[DisplayName("target"), Description("Branches to a target instruction at the specified offset")]
+		public int Target { get; set; }
+	}
+
+	/// <summary>Instruction definition for the RET opcode; it carries no fields of its own.</summary>
+	[DisplayName("RET"), Category("Control and Status Instructions")]
+	[Description("Return")]
+	public class RetInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.RET; } }
+	}
+
+	/// <summary>Instruction definition for the CALL opcode (call of a target method).</summary>
+	[DisplayName("CALL"), Category("Control and Status Instructions")]
+	[Description("Call a target method")]
+	public class CallInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.CALL; } }
+
+		/// <summary>Number of arguments passed to the call.</summary>
+		[DisplayName("args"), Description("Arguments count")]
+		public byte ArgsCount { get; set; }
+
+		/// <summary>Offset of the method being called.</summary>
+		[DisplayName("target"), Description("Call a target method at the specified offset")]
+		public int Target { get; set; }
+	}
+
+	/// <summary>Instruction definition for the ECALL opcode (call of an environment method).</summary>
+	[DisplayName("ECALL"), Category("Control and Status Instructions")]
+	[Description("Call an environment method")]
+	public class ECallInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.ECALL; } }
+
+		/// <summary>Number of arguments passed to the call.</summary>
+		[DisplayName("args"), Description("Arguments count")]
+		public byte ArgsCount { get; set; }
+	}
+
+	/// <summary>Instruction definition for the THROW opcode; it carries no fields of its own.</summary>
+	[DisplayName("THROW"), Category("Control and Status Instructions")]
+	[Description("Throw an error code currently on the evaluation stack")]
+	public class ThrowInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.THROW; } }
+	}
+
+	/// <summary>Instruction definition for the BREAK opcode (debugger break point notification).</summary>
+	[DisplayName("BREAK"), Category("Control and Status Instructions")]
+	[Description("Inform the debugger that a break point has been tripped")]
+	public class BreakInstruction : Instruction
+	{
+		public override OpCode OpCode { get { return OpCode.BREAK; } }
+	}
+
+	public static class TensorCalls
+	{
+		/// <summary>Base of the tensor call definitions: seals the opcode to OpCode.TENSOR and leaves the function to subclasses.</summary>
+		public abstract class TensorInstruction : Instruction
+		{
+			public sealed override OpCode OpCode { get { return OpCode.TENSOR; } }
+
+			[DisplayName("funct"), Description("Tensor call function")]
+			public abstract TensorFunction Function { get; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.BATCH_TO_SPACE.</summary>
+		[DisplayName("TENSOR.BATCH_TO_SPACE"), Category("Tensor Instructions")]
+		[Description("BatchToSpace")]
+		public class BatchToSpaceInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.BATCH_TO_SPACE; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Block shape register.</summary>
+			[DisplayName("rshape_block"), Description("Block shape register")]
+			public byte RshapeBlock { get; set; }
+
+			/// <summary>Crops paddings register.</summary>
+			[DisplayName("rpad_crops"), Description("Crops paddings register")]
+			public byte RpadCrops { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.BROADCAST.</summary>
+		[DisplayName("TENSOR.BROADCAST"), Category("Tensor Instructions")]
+		[Description("Broadcast")]
+		public class BroadcastInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.BROADCAST; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.BINARY.</summary>
+		[DisplayName("TENSOR.BINARY"), Category("Tensor Instructions")]
+		[Description("Binary")]
+		public class BinaryInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.BINARY; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Shape register of the first source.</summary>
+			[DisplayName("rshape_src1"), Description("Source1 shape register")]
+			public byte RshapeSrc1 { get; set; }
+
+			/// <summary>Stride register of the first source.</summary>
+			[DisplayName("rstride_src1"), Description("Source1 stride register")]
+			public byte RstrideSrc1 { get; set; }
+
+			/// <summary>Shape register of the second source.</summary>
+			[DisplayName("rshape_src2"), Description("Source2 shape register")]
+			public byte RshapeSrc2 { get; set; }
+
+			/// <summary>Stride register of the second source.</summary>
+			[DisplayName("rstride_src2"), Description("Source2 stride register")]
+			public byte RstrideSrc2 { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Binary operator to apply.</summary>
+			[DisplayName("binary_op"), Description("Binary operator")]
+			public BinaryOp BinaryOp { get; set; }
+
+			/// <summary>Low value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_low"), Description("FusedClampLow")]
+			public float FusedClampLow { get; set; }
+
+			/// <summary>High value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
+			public float FusedClampHigh { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.CALL.</summary>
+		[DisplayName("TENSOR.CALL"), Category("Tensor Instructions")]
+		[Description("Call")]
+		public class CallInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.CALL; } }
+
+			/// <summary>Id of the function to call.</summary>
+			[DisplayName("function_id"), Description("Function Id")]
+			public uint FunctionId { get; set; }
+
+			/// <summary>Id of the module involved in the call.</summary>
+			[DisplayName("module_id"), Description("Module Id")]
+			public ushort ModuleId { get; set; }
+
+			/// <summary>Number of sources.</summary>
+			[DisplayName("num_src"), Description("Source count")]
+			public byte SrcCount { get; set; }
+
+			/// <summary>Number of destinations.</summary>
+			[DisplayName("num_dst"), Description("Dest count")]
+			public byte DstCount { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.COMPARE.</summary>
+		[DisplayName("TENSOR.COMPARE"), Category("Tensor Instructions")]
+		[Description("Compare")]
+		public class CompareInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.COMPARE; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Shape register of the first source.</summary>
+			[DisplayName("rshape_src1"), Description("Source1 shape register")]
+			public byte RshapeSrc1 { get; set; }
+
+			/// <summary>Stride register of the first source.</summary>
+			[DisplayName("rstride_src1"), Description("Source1 stride register")]
+			public byte RstrideSrc1 { get; set; }
+
+			/// <summary>Shape register of the second source.</summary>
+			[DisplayName("rshape_src2"), Description("Source2 shape register")]
+			public byte RshapeSrc2 { get; set; }
+
+			/// <summary>Stride register of the second source.</summary>
+			[DisplayName("rstride_src2"), Description("Source2 stride register")]
+			public byte RstrideSrc2 { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Comparison operator to apply.</summary>
+			[DisplayName("compare_op"), Description("Compare operator")]
+			public CompareOp CompareOp { get; set; }
+		}
+		/// <summary>Tensor call definition bound to TensorFunction.CONV2D.</summary>
+		[DisplayName("TENSOR.CONV2D"), Category("Tensor Instructions")]
+		[Description("Conv2D")]
+		public class Conv2DInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.CONV2D; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Kernel shape register.</summary>
+			[DisplayName("rshape_kernel"), Description("Kernel shape register")]
+			public byte RshapeKernel { get; set; }
+
+			/// <summary>Kernel stride register.</summary>
+			[DisplayName("rstride_kernel"), Description("Kernel stride register")]
+			public byte RstrideKernel { get; set; }
+
+			/// <summary>Bias stride register.</summary>
+			[DisplayName("rstride_bias"), Description("Bias stride register")]
+			public byte RstrideBias { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Groups value of the convolution.</summary>
+			[DisplayName("groups"), Description("Groups")]
+			public ushort Groups { get; set; }
+
+			/// <summary>Stride in the H dimension.</summary>
+			[DisplayName("stride_h"), Description("StrideH")]
+			public ushort StrideH { get; set; }
+
+			/// <summary>Stride in the W dimension.</summary>
+			[DisplayName("stride_w"), Description("StrideW")]
+			public ushort StrideW { get; set; }
+
+			/// <summary>Dilation in the H dimension.</summary>
+			[DisplayName("dilation_h"), Description("DilationH")]
+			public ushort DilationH { get; set; }
+
+			/// <summary>Dilation in the W dimension.</summary>
+			[DisplayName("dilation_w"), Description("DilationW")]
+			public ushort DilationW { get; set; }
+
+			/// <summary>Low value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_low"), Description("FusedClampLow")]
+			public float FusedClampLow { get; set; }
+
+			/// <summary>High value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
+			public float FusedClampHigh { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.COPY.</summary>
+		[DisplayName("TENSOR.COPY"), Category("Tensor Instructions")]
+		[Description("Copy")]
+		public class CopyInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.COPY; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Shape register.</summary>
+			[DisplayName("rshape"), Description("Shape register")]
+			public byte Rshape { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.CONVERT.</summary>
+		[DisplayName("TENSOR.CONVERT"), Category("Tensor Instructions")]
+		[Description("Convert")]
+		public class ConvertInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.CONVERT; } }
+
+			/// <summary>Datatype of the source.</summary>
+			[DisplayName("in_datatype"), Description("Source Datatype")]
+			public DataType SrcDataType { get; set; }
+
+			/// <summary>Datatype of the destination.</summary>
+			[DisplayName("dst_datatype"), Description("Dest Datatype")]
+			public DataType DestDataType { get; set; }
+
+			/// <summary>Source shape register (this call has a single source).</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.CUMSUM.</summary>
+		[DisplayName("TENSOR.CUMSUM"), Category("Tensor Instructions")]
+		[Description("CumSum")]
+		public class CumSumInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.CUMSUM; } }
+
+			/// <summary>Datatype shared by input and output.</summary>
+			[DisplayName("datatype"), Description("Input/Output datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Axis value.</summary>
+			[DisplayName("axis"), Description("Axis")]
+			public int Axis { get; set; }
+
+			/// <summary>Exclusive flag.</summary>
+			[DisplayName("exclusive"), Description("Exclusive")]
+			public bool Exclusive { get; set; }
+
+			/// <summary>Reverse flag.</summary>
+			[DisplayName("reverse"), Description("Reverse")]
+			public bool Reverse { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.DEQUANTIZE.</summary>
+		[DisplayName("TENSOR.DEQUANTIZE"), Category("Tensor Instructions")]
+		[Description("Dequantize")]
+		public class DequantizeInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.DEQUANTIZE; } }
+
+			/// <summary>Datatype of the source.</summary>
+			[DisplayName("in_datatype"), Description("Source Datatype")]
+			public DataType SrcDataType { get; set; }
+
+			/// <summary>Datatype of the destination.</summary>
+			[DisplayName("dst_datatype"), Description("Dest Datatype")]
+			public DataType DestDataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.GATHER.</summary>
+		[DisplayName("TENSOR.GATHER"), Category("Tensor Instructions")]
+		[Description("Gather")]
+		public class GatherInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.GATHER; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Indices shape register.</summary>
+			[DisplayName("rshape_indices"), Description("Indices shape register")]
+			public byte RshapeIndices { get; set; }
+
+			/// <summary>Axis value.</summary>
+			[DisplayName("axis"), Description("Axis")]
+			public byte Axis { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.GATHER_ND.</summary>
+		[DisplayName("TENSOR.GATHER_ND"), Category("Tensor Instructions")]
+		[Description("GatherND")]
+		public class GatherNDInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.GATHER_ND; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Indices shape register.</summary>
+			[DisplayName("rshape_indices"), Description("Indices shape register")]
+			public byte RshapeIndices { get; set; }
+
+			/// <summary>Batch dims value.</summary>
+			[DisplayName("batch_dims"), Description("Batch Dims")]
+			public byte Batchdims { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.HARDMAX.</summary>
+		[DisplayName("TENSOR.HARDMAX"), Category("Tensor Instructions")]
+		[Description("Hardmax")]
+		public class HardmaxInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.HARDMAX; } }
+
+			/// <summary>Datatype shared by input and output.</summary>
+			[DisplayName("datatype"), Description("Input/Output datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Axis value.</summary>
+			[DisplayName("axis"), Description("Axis")]
+			public int Axis { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.LUT1D.</summary>
+		[DisplayName("TENSOR.LUT1D"), Category("Tensor Instructions")]
+		[Description("Lut1D")]
+		public class LUT1DInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.LUT1D; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Length of the lookup table.</summary>
+			[DisplayName("table_len"), Description("Table length")]
+			public ushort TableLength { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.MATMUL.</summary>
+		[DisplayName("TENSOR.MATMUL"), Category("Tensor Instructions")]
+		[Description("Matmul")]
+		public class MatmulInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.MATMUL; } }
+
+			/// <summary>Shape register of the first source.</summary>
+			[DisplayName("rshape_src1"), Description("Source1 shape register")]
+			public byte RshapeSrc1 { get; set; }
+
+			/// <summary>Stride register of the first source.</summary>
+			[DisplayName("rstride_src1"), Description("Source1 stride register")]
+			public byte RstrideSrc1 { get; set; }
+
+			/// <summary>Shape register of the second source.</summary>
+			[DisplayName("rshape_src2"), Description("Source2 shape register")]
+			public byte RshapeSrc2 { get; set; }
+
+			/// <summary>Stride register of the second source.</summary>
+			[DisplayName("rstride_src2"), Description("Source2 stride register")]
+			public byte RstrideSrc2 { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Low value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_low"), Description("FusedClampLow")]
+			public float FusedClampLow { get; set; }
+
+			/// <summary>High value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
+			public float FusedClampHigh { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.ONEHOT.</summary>
+		[DisplayName("TENSOR.ONEHOT"), Category("Tensor Instructions")]
+		[Description("OneHot")]
+		public class OneHotInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.ONEHOT; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Indices shape register.</summary>
+			[DisplayName("rshape_indices"), Description("Indices shape register")]
+			public byte RshapeIndices { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Axis value.</summary>
+			[DisplayName("axis"), Description("Axis")]
+			public byte Axis { get; set; }
+
+			/// <summary>One-hot mode selector.</summary>
+			[DisplayName("onehot_mode"), Description("OneHot Mode")]
+			public OneHotMode OneHotMode { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.PAD.</summary>
+		[DisplayName("TENSOR.PAD"), Category("Tensor Instructions")]
+		[Description("Pad")]
+		public class PadInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.PAD; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Paddings register.</summary>
+			[DisplayName("rpaddings"), Description("Paddings register")]
+			public byte Rpaddings { get; set; }
+
+			/// <summary>Pad mode selector.</summary>
+			[DisplayName("pad_mode"), Description("Pad mode")]
+			public PadMode PadMode { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.QUANTIZE.</summary>
+		[DisplayName("TENSOR.QUANTIZE"), Category("Tensor Instructions")]
+		[Description("Quantize")]
+		public class QuantizeInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.QUANTIZE; } }
+
+			/// <summary>Datatype of the source.</summary>
+			[DisplayName("in_datatype"), Description("Source Datatype")]
+			public DataType SrcDataType { get; set; }
+
+			/// <summary>Datatype of the destination.</summary>
+			[DisplayName("dst_datatype"), Description("Dest Datatype")]
+			public DataType DestDataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.RANDOM_NORMAL.</summary>
+		[DisplayName("TENSOR.RANDOM_NORMAL"), Category("Tensor Instructions")]
+		[Description("RandomNormal")]
+		public class RandomNormalInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.RANDOM_NORMAL; } }
+
+			/// <summary>Datatype of the output.</summary>
+			[DisplayName("datatype_dest"), Description("Output datatype")]
+			public DataType DataTypeDest { get; set; }
+
+			/// <summary>Output shape register.</summary>
+			[DisplayName("rshape_dest"), Description("output shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Mean value.</summary>
+			[DisplayName("mean"), Description("Mean")]
+			public float Mean { get; set; }
+
+			/// <summary>Std value.</summary>
+			[DisplayName("std"), Description("Std")]
+			public float Std { get; set; }
+
+			/// <summary>Seed value. NOTE(review): typed as float rather than an integer; confirm this is intended.</summary>
+			[DisplayName("seed"), Description("Seed")]
+			public float Seed { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.RANDOM_UNIFORM.</summary>
+		[DisplayName("TENSOR.RANDOM_UNIFORM"), Category("Tensor Instructions")]
+		[Description("RandomUniform")]
+		public class RandomUniformInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.RANDOM_UNIFORM; } }
+
+			/// <summary>Datatype of the output.</summary>
+			[DisplayName("datatype_dest"), Description("Output datatype")]
+			public DataType DataTypeDest { get; set; }
+
+			/// <summary>Output shape register.</summary>
+			[DisplayName("rshape_dest"), Description("output shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Low value.</summary>
+			[DisplayName("low"), Description("Low")]
+			public float Low { get; set; }
+
+			/// <summary>High value.</summary>
+			[DisplayName("high"), Description("High")]
+			public float High { get; set; }
+
+			/// <summary>Seed value. NOTE(review): typed as float rather than an integer; confirm this is intended.</summary>
+			[DisplayName("seed"), Description("Seed")]
+			public float Seed { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.REDUCE.</summary>
+		[DisplayName("TENSOR.REDUCE"), Category("Tensor Instructions")]
+		[Description("Reduce")]
+		public class ReduceInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.REDUCE; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Reduce operator to apply.</summary>
+			[DisplayName("reduce_op"), Description("Reduce operator")]
+			public ReduceOp ReduceOp { get; set; }
+
+			/// <summary>Axis shape register.</summary>
+			[DisplayName("rshape_axis"), Description("Axis shape register")]
+			public byte RshapeAxis { get; set; }
+
+			/// <summary>Keep-dimensions flag.</summary>
+			[DisplayName("keep_dims"), Description("Keep dimensions")]
+			public bool KeepDims { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.REDUCE_ARG.</summary>
+		[DisplayName("TENSOR.REDUCE_ARG"), Category("Tensor Instructions")]
+		[Description("ReduceArg")]
+		public class ReduceArgInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.REDUCE_ARG; } }
+
+			/// <summary>Datatype of the input.</summary>
+			[DisplayName("datatype_src"), Description("Input datatype")]
+			public DataType DataTypeSrc { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Datatype of the output.</summary>
+			[DisplayName("datatype_dest"), Description("Output datatype")]
+			public DataType DataTypeDest { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Reduce-arg operator to apply.</summary>
+			[DisplayName("reduce_arg_op"), Description("Reduce arg operator")]
+			public ReduceArgOp ReduceArgOp { get; set; }
+
+			/// <summary>Axis shape register.</summary>
+			[DisplayName("rshape_axis"), Description("Axis shape register")]
+			public byte RshapeAxis { get; set; }
+
+			/// <summary>Keep-dimensions flag.</summary>
+			[DisplayName("keep_dims"), Description("Keep dimensions")]
+			public bool KeepDims { get; set; }
+
+			/// <summary>Select-last-index flag.</summary>
+			[DisplayName("select_last_idx"), Description("select last index")]
+			public bool SelectLastIdx { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.REDUCE_PROD.</summary>
+		[DisplayName("TENSOR.REDUCE_PROD"), Category("Tensor Instructions")]
+		[Description("ReduceProd")]
+		public class ReduceProdInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.REDUCE_PROD; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Axes shape register.</summary>
+			[DisplayName("rshape_axes"), Description("Axes shape register")]
+			public byte RshapeAxes { get; set; }
+
+			/// <summary>Keep-dimensions flag.</summary>
+			[DisplayName("keep_dims"), Description("Keep dimensions")]
+			public bool KeepDims { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.REDUCE_WINDOW2D.</summary>
+		[DisplayName("TENSOR.REDUCE_WINDOW2D"), Category("Tensor Instructions")]
+		[Description("REDUCE_WINDOW2D")]
+		public class ReduceWindow2DInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.REDUCE_WINDOW2D; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Reduce operator to apply.</summary>
+			[DisplayName("reduce_op"), Description("Reduce operator")]
+			public ReduceOp ReduceOp { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Filter size in the H dimension.</summary>
+			[DisplayName("filter_h"), Description("FilterH")]
+			public ushort FilterH { get; set; }
+
+			/// <summary>Filter size in the W dimension.</summary>
+			[DisplayName("filter_w"), Description("FilterW")]
+			public ushort FilterW { get; set; }
+
+			/// <summary>Stride in the H dimension.</summary>
+			[DisplayName("stride_h"), Description("StrideH")]
+			public ushort StrideH { get; set; }
+
+			/// <summary>Stride in the W dimension.</summary>
+			[DisplayName("stride_w"), Description("StrideW")]
+			public ushort StrideW { get; set; }
+
+			/// <summary>Dilation in the H dimension.</summary>
+			[DisplayName("dilation_h"), Description("DilationH")]
+			public ushort DilationH { get; set; }
+
+			/// <summary>Dilation in the W dimension.</summary>
+			[DisplayName("dilation_w"), Description("DilationW")]
+			public ushort DilationW { get; set; }
+
+			/// <summary>Low value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_low"), Description("FusedClampLow")]
+			public float FusedClampLow { get; set; }
+
+			/// <summary>High value of the fused clamp.</summary>
+			[DisplayName("fused_clamp_high"), Description("FusedClampHigh")]
+			public float FusedClampHigh { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.RESIZE_IMAGE.</summary>
+		[DisplayName("TENSOR.RESIZE_IMAGE"), Category("Tensor Instructions")]
+		[Description("RESIZE_IMAGE")]
+		public class ResizeImageInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.RESIZE_IMAGE; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Align-corners flag.</summary>
+			[DisplayName("align_corners"), Description("Align Corners")]
+			public bool AlignCorners { get; set; }
+
+			/// <summary>Half-pixel-centers flag.</summary>
+			[DisplayName("half_pixel_centers"), Description("Half Pixel Centers")]
+			public bool HalfPixelCenters { get; set; }
+
+			/// <summary>Image resize mode selector.</summary>
+			[DisplayName("image_resize_mode"), Description("Image Resize Mode")]
+			public ImageResizeMode ImageResizeMode { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.ROI_ALIGN.</summary>
+		[DisplayName("TENSOR.ROI_ALIGN"), Category("Tensor Instructions")]
+		[Description("RoiAlign")]
+		public class RoiAlignInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.ROI_ALIGN; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Destination shape register.</summary>
+			[DisplayName("rshape_dest"), Description("Dest shape register")]
+			public byte RshapeDest { get; set; }
+
+			/// <summary>Mode selector. NOTE(review): the lowercase name departs from the PascalCase of sibling properties; kept so existing callers still compile.</summary>
+			[DisplayName("mode"), Description("Mode")]
+			public RoiAlignMode mode { get; set; }
+
+			/// <summary>Spatial scale value.</summary>
+			[DisplayName("spatial_scale"), Description("Spatial Scale")]
+			public float SpatialScale { get; set; }
+
+			/// <summary>Sampling ratio value.</summary>
+			[DisplayName("sampling_ratio"), Description("Sampling Ratio")]
+			public long SamplingRatio { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.SIGMOID.</summary>
+		[DisplayName("TENSOR.SIGMOID"), Category("Tensor Instructions")]
+		[Description("Sigmoid")]
+		public class SigmoidInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.SIGMOID; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		/// <summary>Tensor call definition bound to TensorFunction.SLICE.</summary>
+		[DisplayName("TENSOR.SLICE"), Category("Tensor Instructions")]
+		[Description("Slice")]
+		public class SliceInstruction : TensorInstruction
+		{
+			public override TensorFunction Function { get { return TensorFunction.SLICE; } }
+
+			/// <summary>Datatype of the operation.</summary>
+			[DisplayName("datatype"), Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			/// <summary>Source shape register.</summary>
+			[DisplayName("rshape_src"), Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			/// <summary>Source stride register.</summary>
+			[DisplayName("rstride_src"), Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			/// <summary>Destination stride register.</summary>
+			[DisplayName("rstride_dest"), Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			/// <summary>Begins shape register.</summary>
+			[DisplayName("rbegins"), Description("Begins shape register")]
+			public byte Rbegins { get; set; }
+
+			/// <summary>Ends shape register.</summary>
+			[DisplayName("rends"), Description("Ends shape register")]
+			public byte Rends { get; set; }
+
+			/// <summary>Strides shape register. NOTE(review): unlike Rbegins/Rends the name has no "R" prefix; kept so existing callers still compile.</summary>
+			[DisplayName("rstrides"), Description("Strides shape register")]
+			public byte Strides { get; set; }
+		}
+
+		[DisplayName("TENSOR.SOFTMAX")]
+		[Category("Tensor Instructions")]
+		[Description("Softmax")]
+		public class SoftmaxInstruction : TensorInstruction // Operand declaration for the TENSOR.SOFTMAX instruction.
+		{
+			public override TensorFunction Function => TensorFunction.SOFTMAX; // Tags this record with the SOFTMAX tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			[DisplayName("rstride_src")]
+			[Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			[DisplayName("rstride_dest")]
+			[Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			[DisplayName("axis")]
+			[Description("Axis")]
+			public int Axis { get; set; } // Signed 32-bit value, unlike the byte-sized register members above.
+
+			[DisplayName("beta")]
+			[Description("Beta")]
+			public float Beta { get; set; } // NOTE(review): how beta enters the softmax is not visible in this file - see the kernel implementation.
+		}
+
+		[DisplayName("TENSOR.SPACE_TO_BATCH")]
+		[Category("Tensor Instructions")]
+		[Description("SpaceToBatch")]
+		public class SpaceToBatchInstruction : TensorInstruction // Operand declaration for the TENSOR.SPACE_TO_BATCH instruction.
+		{
+			public override TensorFunction Function => TensorFunction.SPACE_TO_BATCH; // Tags this record with the SPACE_TO_BATCH tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			[DisplayName("rstride_src")]
+			[Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			[DisplayName("rstride_dest")]
+			[Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			[DisplayName("rshape_block")]
+			[Description("Block shape register")]
+			public byte RshapeBlock { get; set; }
+
+			[DisplayName("rpad_crops")]
+			[Description("Crops paddings register")]
+			public byte RpadCrops { get; set; } // NOTE(review): description mentions both crops and paddings; which one applies is not visible here - confirm.
+		}
+
+		[DisplayName("TENSOR.TERNARY")]
+		[Category("Tensor Instructions")]
+		[Description("Ternary")]
+		public class TernaryInstruction : TensorInstruction // Operand declaration for the TENSOR.TERNARY instruction (three sources, one dest stride).
+		{
+			public override TensorFunction Function => TensorFunction.TERNARY; // Tags this record with the TERNARY tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; } // Single datatype member; no per-source datatype is declared.
+
+			[DisplayName("rshape_src1")]
+			[Description("Source1 shape register")]
+			public byte RshapeSrc1 { get; set; }
+
+			[DisplayName("rstride_src1")]
+			[Description("Source1 stride register")]
+			public byte RstrideSrc1 { get; set; }
+
+			[DisplayName("rshape_src2")]
+			[Description("Source2 shape register")]
+			public byte RshapeSrc2 { get; set; }
+
+			[DisplayName("rstride_src2")]
+			[Description("Source2 stride register")]
+			public byte RstrideSrc2 { get; set; }
+
+			[DisplayName("rshape_src3")]
+			[Description("Source3 shape register")]
+			public byte RshapeSrc3 { get; set; }
+
+			[DisplayName("rstride_src3")]
+			[Description("Source3 stride register")]
+			public byte RstrideSrc3 { get; set; }
+
+			[DisplayName("rstride_dest")]
+			[Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+		}
+
+		[DisplayName("TENSOR.TOPK")]
+		[Category("Tensor Instructions")]
+		[Description("Topk")]
+		public class TopKInstruction : TensorInstruction // Operand declaration for the TENSOR.TOPK instruction (one source, two destinations).
+		{
+			public override TensorFunction Function => TensorFunction.TOPK; // Tags this record with the TOPK tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			[DisplayName("rstride_src")]
+			[Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			[DisplayName("rshape_dest1")]
+			[Description("Dest1 shape register")]
+			public byte RshapeDest1 { get; set; } // NOTE(review): which output (values vs. indices) is dest1 is not visible here - confirm.
+
+			[DisplayName("rstride_dest1")]
+			[Description("Dest1 stride register")]
+			public byte RstrideDest1 { get; set; }
+
+			[DisplayName("rshape_dest2")]
+			[Description("Dest2 shape register")]
+			public byte RshapeDest2 { get; set; }
+
+			[DisplayName("rstride_dest2")]
+			[Description("Dest2 stride register")]
+			public byte RstrideDest2 { get; set; }
+
+			[DisplayName("k")]
+			[Description("K")]
+			public long K { get; set; } // 64-bit value, wider than the int used for Axis below.
+
+			[DisplayName("axis")]
+			[Description("Axis")]
+			public int Axis { get; set; }
+
+			[DisplayName("largest")]
+			[Description("Largest")]
+			public bool Largest { get; set; }
+
+			[DisplayName("sorted")]
+			[Description("Sorted")]
+			public bool Sorted { get; set; }
+		}
+
+		[DisplayName("TENSOR.TRILU")]
+		[Category("Tensor Instructions")]
+		[Description("Trilu")]
+		public class TriluInstruction : TensorInstruction // Operand declaration for the TENSOR.TRILU instruction.
+		{
+			public override TensorFunction Function => TensorFunction.TRILU; // Tags this record with the TRILU tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; } // NOTE(review): no rstride_src/rstride_dest members here, unlike most sibling instructions - confirm intentional.
+
+			[DisplayName("upper")]
+			[Description("Upper")]
+			public bool Upper { get; set; }
+
+			[DisplayName("k")]
+			[Description("K")]
+			public long K { get; set; } // 64-bit value; its meaning (presumably a diagonal offset) is not established in this file - confirm.
+		}
+
+		[DisplayName("TENSOR.UNARY")]
+		[Category("Tensor Instructions")]
+		[Description("Unary")]
+		public class UnaryInstruction : TensorInstruction // Operand declaration for the TENSOR.UNARY instruction (single source).
+		{
+			public override TensorFunction Function => TensorFunction.UNARY; // Tags this record with the UNARY tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; } // Description aligned with the other single-source instructions (was "Source1").
+
+			[DisplayName("rstride_src")]
+			[Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			[DisplayName("rstride_dest")]
+			[Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			[DisplayName("unary_op")]
+			[Description("Unary operator")]
+			public UnaryOp UnaryOp { get; set; } // Selects the operator; the UnaryOp enum is declared elsewhere.
+		}
+
+		[DisplayName("TENSOR.TRANSPOSE")]
+		[Category("Tensor Instructions")]
+		[Description("Transpose")]
+		public class TransposeInstruction : TensorInstruction // Operand declaration for the TENSOR.TRANSPOSE instruction.
+		{
+			public override TensorFunction Function => TensorFunction.TRANSPOSE; // Tags this record with the TRANSPOSE tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("rshape_src")]
+			[Description("Source shape register")]
+			public byte RshapeSrc { get; set; }
+
+			[DisplayName("rstride_src")]
+			[Description("Source stride register")]
+			public byte RstrideSrc { get; set; }
+
+			[DisplayName("rstride_dest")]
+			[Description("Dest stride register")]
+			public byte RstrideDest { get; set; }
+
+			[DisplayName("rshape_perm")]
+			[Description("Perm shape register")]
+			public byte RshapePerm { get; set; } // NOTE(review): presumably the permutation is passed through a shape register - confirm against the emitter.
+		}
+		[DisplayName("TENSOR.GRU")]
+		[Category("Tensor Instructions")]
+		[Description("Gru")]
+		public class GruInstruction : TensorInstruction // Operand declaration for the TENSOR.GRU instruction.
+		{
+			public override TensorFunction Function => TensorFunction.GRU; // Tags this record with the GRU tensor function.
+
+			[DisplayName("input_shape_src")]
+			[Description("Input shape register")]
+			public byte RshapeSrc1 { get; set; } // Property is RshapeSrc1 while the display name is "input_shape_src".
+
+			[DisplayName("w_shape_src")]
+			[Description("W shape register")]
+			public byte RshapeSrc2 { get; set; } // Property is RshapeSrc2 while the display name is "w_shape_src".
+
+			[DisplayName("direction")]
+			[Description("direction register")]
+			public byte Direction { get; set; } // NOTE(review): raw byte rather than an enum; the value meaning is not visible here - confirm.
+
+			[DisplayName("linear_before_reset")]
+			[Description("LBR register")]
+			public bool LinearBeforeReset { get; set; } // NOTE(review): description says "register" but this is a bool flag - confirm wording.
+
+		}
+		[DisplayName("TENSOR.TFLITE_DETECTION_POSTPROCESS")]
+		[Category("Tensor Instructions")]
+		[Description("Tflite_Detection_Postprocess")]
+		public class TfliteDetectionPostprocessInstruction : TensorInstruction // Operand declaration for the TENSOR.TFLITE_DETECTION_POSTPROCESS instruction.
+		{
+			public override TensorFunction Function => TensorFunction.TFLITE_DETECTION_POSTPROCESS; // Tags this record with the TFLITE_DETECTION_POSTPROCESS tensor function.
+
+			[DisplayName("box_shape_src")]
+			[Description("Box shape register")]
+			public byte RshapeSrc1 { get; set; } // Property is RshapeSrc1 while the display name is "box_shape_src".
+
+			[DisplayName("score_shape_src")]
+			[Description("Score shape register")]
+			public byte RshapeSrc2 { get; set; } // Property is RshapeSrc2 while the display name is "score_shape_src".
+
+			[DisplayName("anchor_shape_src")]
+			[Description("Anchor shape register")]
+			public byte RshapeSrc3 { get; set; } // Property is RshapeSrc3 while the display name is "anchor_shape_src".
+
+			[DisplayName("max_detections")]
+			[Description("max_detections register")]
+			public int MaxDetections { get; set; } // NOTE(review): description says "register" but this and the members below are plain int/bool/float values - confirm wording.
+
+			[DisplayName("max_classes_per_detection")]
+			[Description("max_classes_per_detection register")]
+			public int MaxClassesPerDetection { get; set; }
+
+			[DisplayName("detections_per_class")]
+			[Description("detections_per_class register")]
+			public int DetectionsPerClass { get; set; }
+
+			[DisplayName("use_regular_non_max_suppression")]
+			[Description("use_regular_non_max_suppression register")]
+			public bool UseRegularNonMaxSuppression { get; set; }
+
+			[DisplayName("nms_score_threshold")]
+			[Description("nms_score_threshold register")]
+			public float NmsScoreThreshold { get; set; }
+
+			[DisplayName("nms_iou_threshold")]
+			[Description("nms_iou_threshold register")]
+			public float NmsIouThreshold { get; set; }
+
+			[DisplayName("num_classes")]
+			[Description("num_classes register")]
+			public int NumClasses { get; set; }
+
+			[DisplayName("y_scale")]
+			[Description("y_scale register")]
+			public float YScale { get; set; }
+
+			[DisplayName("x_scale")]
+			[Description("x_scale register")]
+			public float XScale { get; set; }
+
+			[DisplayName("h_scale")]
+			[Description("h_scale register")]
+			public float HScale { get; set; }
+
+			[DisplayName("w_scale")]
+			[Description("w_scale register")]
+			public float WScale { get; set; }
+		}
+
+		[DisplayName("TENSOR.LAYER_NORMALIZATION")]
+		[Category("Tensor Instructions")]
+		[Description("LAYER_NORMALIZATION")]
+		public class LayerNormInstruction : TensorInstruction // Operand declaration for the TENSOR.LAYER_NORMALIZATION instruction.
+		{
+			public override TensorFunction Function => TensorFunction.LAYER_NORMALIZATION; // Tags this record with the LAYER_NORMALIZATION tensor function.
+
+			[DisplayName("datatype")]
+			[Description("Datatype")]
+			public DataType DataType { get; set; }
+
+			[DisplayName("input_shape")]
+			[Description("input_shape")]
+			public byte input_shape { get; set; } // NOTE(review): snake_case property name, unlike the PascalCase used by sibling instructions - confirm before renaming.
+			[DisplayName("axis")]
+			[Description("axis")]
+			public int axis { get; set; } // Lowercase name; same type as the int Axis on SoftmaxInstruction and TopKInstruction.
+
+			[DisplayName("epsilon")]
+			[Description("epsilon")]
+			public float epsilon { get; set; } // NOTE(review): how epsilon is applied is not visible in this file - see the kernel implementation.
+		}
+
+		[DisplayName("TENSOR.COMPRESS")]
+		[Category("Tensor Instructions")]
+		[Description("Compress")]
+		public class CompressInstruction : TensorInstruction // Operand declaration for the TENSOR.COMPRESS instruction.
+		{
+			public override TensorFunction Function => TensorFunction.COMPRESS; // Tags this record with the COMPRESS tensor function.
+
+			[DisplayName("input_shape_src")]
+			[Description("Input shape register")]
+			public byte RshapeSrc1 { get; set; } // Property is RshapeSrc1 while the display name is "input_shape_src".
+
+			[DisplayName("condition_shape_src")]
+			[Description("Condition shape register")]
+			public byte RshapeSrc2 { get; set; } // Property is RshapeSrc2 while the display name is "condition_shape_src".
+
+			[DisplayName("axis")]
+			[Description("axis register")]
+			public float axis { get; set; } // NOTE(review): declared float, whereas Softmax/TopK/LayerNorm declare axis as int, and the description says "register" - confirm intended type.
+		}
+
+		[DisplayName("TENSOR.GATHER_ELEMENTS")]
         [Category("Tensor Instructions")]
-        [Description("Matmul")]
-        public class MatmulInstruction : TensorInstruction
+        [Description("Gather_Elements")]
+        public class Gather_ElementsInstruction : TensorInstruction
         {
-            public override TensorFunction Function => TensorFunction.MATMUL;
+            public override TensorFunction Function => TensorFunction.GATHER_ELEMENTS;
 
-            [DisplayName("rshape_src1")]
-            [Description("Source1 shape register")]
+            [DisplayName("input_shape_src")]
+            [Description("Input shape register")]
             public byte RshapeSrc1 { get; set; }
 
-            [DisplayName("rstride_src1")]
-            [Description("Source1 stride register")]
-            public byte RstrideSrc1 { get; set; }
-
-            [DisplayName("rshape_src2")]
-            [Description("Source2 shape register")]
-            public byte RshapeSrc2 { get; set; }
-
-            [DisplayName("rstride_src2")]
-            [Description("Source2 stride register")]
-            public byte RstrideSrc2 { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("fused_clamp_low")]
-            [Description("FusedClampLow")]
-            public float FusedClampLow { get; set; }
-
-            [DisplayName("fused_clamp_high")]
-            [Description("FusedClampHigh")]
-            public float FusedClampHigh { get; set; }
-        }
-
-        [DisplayName("TENSOR.ONEHOT")]
-        [Category("Tensor Instructions")]
-        [Description("OneHot")]
-        public class OneHotInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.ONEHOT;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_indices")]
+            [DisplayName("indices_shape_src")]
             [Description("Indices shape register")]
-            public byte RshapeIndices { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("axis")]
-            [Description("Axis")]
-            public byte Axis { get; set; }
-
-            [DisplayName("onehot_mode")]
-            [Description("OneHot Mode")]
-            public OneHotMode OneHotMode { get; set; }
-        }
-
-        [DisplayName("TENSOR.PAD")]
-        [Category("Tensor Instructions")]
-        [Description("Pad")]
-        public class PadInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.PAD;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("rpaddings")]
-            [Description("Paddings register")]
-            public byte Rpaddings { get; set; }
-
-            [DisplayName("pad_mode")]
-            [Description("Pad mode")]
-            public PadMode PadMode { get; set; }
-        }
-
-        [DisplayName("TENSOR.QUANTIZE")]
-        [Category("Tensor Instructions")]
-        [Description("Quantize")]
-        public class QuantizeInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.QUANTIZE;
-
-            [DisplayName("in_datatype")]
-            [Description("Source Datatype")]
-            public DataType SrcDataType { get; set; }
-
-            [DisplayName("dst_datatype")]
-            [Description("Dest Datatype")]
-            public DataType DestDataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        [DisplayName("TENSOR.RANDOM_NORMAL")]
-        [Category("Tensor Instructions")]
-        [Description("RandomNormal")]
-        public class RandomNormalInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.RANDOM_NORMAL;
-
-            [DisplayName("datatype_dest")]
-            [Description("Output datatype")]
-            public DataType DataTypeDest { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("output shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("mean")]
-            [Description("Mean")]
-            public float Mean { get; set; }
-
-            [DisplayName("std")]
-            [Description("Std")]
-            public float Std { get; set; }
-
-            [DisplayName("seed")]
-            [Description("Seed")]
-            public float Seed { get; set; }
-        }
-
-        [DisplayName("TENSOR.RANDOM_UNIFORM")]
-        [Category("Tensor Instructions")]
-        [Description("RandomUniform")]
-        public class RandomUniformInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.RANDOM_UNIFORM;
-
-            [DisplayName("datatype_dest")]
-            [Description("Output datatype")]
-            public DataType DataTypeDest { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("output shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("low")]
-            [Description("Low")]
-            public float Low { get; set; }
-
-            [DisplayName("high")]
-            [Description("High")]
-            public float High { get; set; }
-
-            [DisplayName("seed")]
-            [Description("Seed")]
-            public float Seed { get; set; }
-        }
-
-        [DisplayName("TENSOR.REDUCE")]
-        [Category("Tensor Instructions")]
-        [Description("Reduce")]
-        public class ReduceInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.REDUCE;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("reduce_op")]
-            [Description("Reduce operator")]
-            public ReduceOp ReduceOp { get; set; }
-
-            [DisplayName("rshape_axis")]
-            [Description("Axis shape register")]
-            public byte RshapeAxis { get; set; }
-
-            [DisplayName("keep_dims")]
-            [Description("Keep dimensions")]
-            public bool KeepDims { get; set; }
-        }
-
-        [DisplayName("TENSOR.REDUCE_ARG")]
-        [Category("Tensor Instructions")]
-        [Description("ReduceArg")]
-        public class ReduceArgInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.REDUCE_ARG;
-
-            [DisplayName("datatype_src")]
-            [Description("Input datatype")]
-            public DataType DataTypeSrc { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("datatype_dest")]
-            [Description("Output datatype")]
-            public DataType DataTypeDest { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("reduce_arg_op")]
-            [Description("Reduce arg operator")]
-            public ReduceArgOp ReduceArgOp { get; set; }
-
-            [DisplayName("rshape_axis")]
-            [Description("Axis shape register")]
-            public byte RshapeAxis { get; set; }
-
-            [DisplayName("keep_dims")]
-            [Description("Keep dimensions")]
-            public bool KeepDims { get; set; }
-
-            [DisplayName("select_last_idx")]
-            [Description("select last index")]
-            public bool SelectLastIdx { get; set; }
-        }
-
-        [DisplayName("TENSOR.REDUCE_PROD")]
-        [Category("Tensor Instructions")]
-        [Description("ReduceProd")]
-        public class ReduceProdInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.REDUCE_PROD;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("rshape_axes")]
-            [Description("Axes shape register")]
-            public byte RshapeAxes { get; set; }
-
-            [DisplayName("keep_dims")]
-            [Description("Keep dimensions")]
-            public bool KeepDims { get; set; }
-        }
-
-        [DisplayName("TENSOR.REDUCE_WINDOW2D")]
-        [Category("Tensor Instructions")]
-        [Description("REDUCE_WINDOW2D")]
-        public class ReduceWindow2DInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.REDUCE_WINDOW2D;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("reduce_op")]
-            [Description("Reduce operator")]
-            public ReduceOp ReduceOp { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("filter_h")]
-            [Description("FilterH")]
-            public ushort FilterH { get; set; }
-
-            [DisplayName("filter_w")]
-            [Description("FilterW")]
-            public ushort FilterW { get; set; }
-
-            [DisplayName("stride_h")]
-            [Description("StrideH")]
-            public ushort StrideH { get; set; }
-
-            [DisplayName("stride_w")]
-            [Description("StrideW")]
-            public ushort StrideW { get; set; }
-
-            [DisplayName("dilation_h")]
-            [Description("DilationH")]
-            public ushort DilationH { get; set; }
-
-            [DisplayName("dilation_w")]
-            [Description("DilationW")]
-            public ushort DilationW { get; set; }
-
-            [DisplayName("fused_clamp_low")]
-            [Description("FusedClampLow")]
-            public float FusedClampLow { get; set; }
-
-            [DisplayName("fused_clamp_high")]
-            [Description("FusedClampHigh")]
-            public float FusedClampHigh { get; set; }
-        }
-
-        [DisplayName("TENSOR.RESIZE_IMAGE")]
-        [Category("Tensor Instructions")]
-        [Description("RESIZE_IMAGE")]
-        public class ResizeImageInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.RESIZE_IMAGE;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("align_corners")]
-            [Description("Align Corners")]
-            public bool AlignCorners { get; set; }
-
-            [DisplayName("half_pixel_centers")]
-            [Description("Half Pixel Centers")]
-            public bool HalfPixelCenters { get; set; }
-
-            [DisplayName("image_resize_mode")]
-            [Description("Image Resize Mode")]
-            public ImageResizeMode ImageResizeMode { get; set; }
-        }
-
-        [DisplayName("TENSOR.ROI_ALIGN")]
-        [Category("Tensor Instructions")]
-        [Description("RoiAlign")]
-        public class RoiAlignInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.ROI_ALIGN;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rshape_dest")]
-            [Description("Dest shape register")]
-            public byte RshapeDest { get; set; }
-
-            [DisplayName("mode")]
-            [Description("Mode")]
-            public RoiAlignMode mode { get; set; }
-
-            [DisplayName("spatial_scale")]
-            [Description("Spatial Scale")]
-            public float SpatialScale { get; set; }
-
-            [DisplayName("sampling_ratio")]
-            [Description("Sampling Ratio")]
-            public long SamplingRatio { get; set; }
-        }
-
-        [DisplayName("TENSOR.SIGMOID")]
-        [Category("Tensor Instructions")]
-        [Description("Sigmoid")]
-        public class SigmoidInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.SIGMOID;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        [DisplayName("TENSOR.SLICE")]
-        [Category("Tensor Instructions")]
-        [Description("Slice")]
-        public class SliceInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.SLICE;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("rbegins")]
-            [Description("Begins shape register")]
-            public byte Rbegins { get; set; }
-
-            [DisplayName("rends")]
-            [Description("Ends shape register")]
-            public byte Rends { get; set; }
-
-            [DisplayName("rstrides")]
-            [Description("Strides shape register")]
-            public byte Strides { get; set; }
-        }
-
-        [DisplayName("TENSOR.SOFTMAX")]
-        [Category("Tensor Instructions")]
-        [Description("Softmax")]
-        public class SoftmaxInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.SOFTMAX;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("axis")]
-            [Description("Axis")]
-            public int Axis { get; set; }
-
-            [DisplayName("beta")]
-            [Description("Beta")]
-            public float Beta { get; set; }
-        }
-
-        [DisplayName("TENSOR.TERNARY")]
-        [Category("Tensor Instructions")]
-        [Description("Ternary")]
-        public class TernaryInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.TERNARY;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src1")]
-            [Description("Source1 shape register")]
-            public byte RshapeSrc1 { get; set; }
-
-            [DisplayName("rstride_src1")]
-            [Description("Source1 stride register")]
-            public byte RstrideSrc1 { get; set; }
-
-            [DisplayName("rshape_src2")]
-            [Description("Source2 shape register")]
             public byte RshapeSrc2 { get; set; }
 
-            [DisplayName("rstride_src2")]
-            [Description("Source2 stride register")]
-            public byte RstrideSrc2 { get; set; }
-
-            [DisplayName("rshape_src3")]
-            [Description("Source3 shape register")]
-            public byte RshapeSrc3 { get; set; }
-
-            [DisplayName("rstride_src3")]
-            [Description("Source3 stride register")]
-            public byte RstrideSrc3 { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-        }
-
-        [DisplayName("TENSOR.TOPK")]
-        [Category("Tensor Instructions")]
-        [Description("Topk")]
-        public class TopKInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.TOPK;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rshape_dest1")]
-            [Description("Dest1 shape register")]
-            public byte RshapeDest1 { get; set; }
-
-            [DisplayName("rstride_dest1")]
-            [Description("Dest1 stride register")]
-            public byte RstrideDest1 { get; set; }
-
-            [DisplayName("rshape_dest2")]
-            [Description("Dest2 shape register")]
-            public byte RshapeDest2 { get; set; }
-
-            [DisplayName("rstride_dest2")]
-            [Description("Dest2 stride register")]
-            public byte RstrideDest2 { get; set; }
-
-            [DisplayName("k")]
-            [Description("K")]
-            public long K { get; set; }
-
             [DisplayName("axis")]
             [Description("Axis")]
             public int Axis { get; set; }
-
-            [DisplayName("largest")]
-            [Description("Largest")]
-            public bool Largest { get; set; }
-
-            [DisplayName("sorted")]
-            [Description("Sorted")]
-            public bool Sorted { get; set; }
-        }
-
-        [DisplayName("TENSOR.TRILU")]
-        [Category("Tensor Instructions")]
-        [Description("Trilu")]
-        public class TriluInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.TRILU;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("upper")]
-            [Description("Upper")]
-            public bool Upper { get; set; }
-
-            [DisplayName("k")]
-            [Description("K")]
-            public long K { get; set; }
         }
 
-        [DisplayName("TENSOR.UNARY")]
+        [DisplayName("TENSOR.INSTANCE_NORMALIZATION")]
         [Category("Tensor Instructions")]
-        [Description("Unary")]
-        public class UnaryInstruction : TensorInstruction
+        [Description("INSTANCE_NORMALIZATION")]
+        public class InstanceNormInstruction : TensorInstruction
         {
-            public override TensorFunction Function => TensorFunction.UNARY;
+            public override TensorFunction Function => TensorFunction.INSTANCE_NORMALIZATION;
 
             [DisplayName("datatype")]
             [Description("Datatype")]
             public DataType DataType { get; set; }
 
-            [DisplayName("rshape_src")]
-            [Description("Source1 shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
-
-            [DisplayName("unary_op")]
-            [Description("Unary operator")]
-            public UnaryOp UnaryOp { get; set; }
-        }
-
-        [DisplayName("TENSOR.TRANSPOSE")]
-        [Category("Tensor Instructions")]
-        [Description("Transpose")]
-        public class TransposeInstruction : TensorInstruction
-        {
-            public override TensorFunction Function => TensorFunction.TRANSPOSE;
-
-            [DisplayName("datatype")]
-            [Description("Datatype")]
-            public DataType DataType { get; set; }
-
-            [DisplayName("rshape_src")]
-            [Description("Source shape register")]
-            public byte RshapeSrc { get; set; }
-
-            [DisplayName("rstride_src")]
-            [Description("Source stride register")]
-            public byte RstrideSrc { get; set; }
-
-            [DisplayName("rstride_dest")]
-            [Description("Dest stride register")]
-            public byte RstrideDest { get; set; }
+            [DisplayName("input_shape")]
+            [Description("input_shape")]
+            public byte input_shape { get; set; }
 
-            [DisplayName("rshape_perm")]
-            [Description("Perm shape register")]
-            public byte RshapePerm { get; set; }
+            [DisplayName("epsilon")]
+            [Description("epsilon")]
+            public float epsilon { get; set; }
         }
-    }
+	}
 }
diff --git a/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor b/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor
index 78a5277698..809cbf766a 100644
--- a/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor
+++ b/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor
@@ -78,10 +78,9 @@ result<void> op_visitor::visit(gsl::span<const gsl::byte> text) noexcept
     while (!interrupted_ && !reader_.empty())
         try_(next());
 
-    #ifdef ENABLE_OP_PROFILE
-        op_profile profile_time;
-        profile_time.print();
-    #endif
+#ifdef ENABLE_OP_PROFILE
+    op_profile::print();
+#endif
 
     return ok();
 }