diff --git a/.github/workflows/compiler-build.yml b/.github/workflows/compiler-build.yml index d6c526e4f8..d34934dd55 100644 --- a/.github/workflows/compiler-build.yml +++ b/.github/workflows/compiler-build.yml @@ -2,206 +2,161 @@ name: compiler-build on: [push, pull_request] -env: - BUILD_TYPE: Release - jobs: build: - runs-on: ${{ matrix.os }} + name: build-${{matrix.config.name}} + runs-on: ${{matrix.config.os}} strategy: matrix: - os: [ubuntu-18.04,windows-2019,macos-10.15] + config: + - {name: x86_64-macos, os: macos-11, cmakeArgs: '', buildType: Release} + - {name: x86_64-linux, os: ubuntu-20.04, cmakeArgs: '', buildType: Release} + - {name: x86_64-windows, os: windows-latest, arch: x64, cmakeArgs: '', buildType: Release} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: seanmiddleditch/gha-setup-ninja@master - - name: Install System Requirements - if: runner.os == 'Macos' - shell: bash - run: | - brew install sunnycase/core/libomp@11.1.0 - - - name: Add msbuild to PATH + - name: Set up build environment (Windows, Visual Studio) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: ${{matrix.config.arch}} if: runner.os == 'Windows' - uses: ilammy/msvc-dev-cmd@v1.10.0 + + - name: Set up build environment (Macos) + run: | + brew install sunnycase/core/libomp@14.0.6 + if: runner.os == 'Macos' - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Conan - shell: bash - run: | - pip install conan + run: pip install conan==1.59.0 - - name: Configure Conan - if: runner.os == 'Linux' - shell: bash + - name: Configure Conan (Linux) run: | conan profile new default --detect conan profile update settings.compiler.libcxx=libstdc++11 default - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - env: - CC: gcc-10 - CXX: g++-10 + echo "CC=gcc-10" >> $GITHUB_ENV + echo "CXX=g++-10" >> $GITHUB_ENV if: runner.os 
== 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - + - name: Configure CMake - if: runner.os != 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build shell: bash - working-directory: ${{github.workspace}}/build run: | - cmake --build . --config $BUILD_TYPE + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . --prefix ../install + - name: Build & Install + run: | + cmake --build build --config ${{matrix.config.buildType}} + cmake --install build --prefix install - name: CTest - shell: bash working-directory: ${{github.workspace}}/build/tests/kernels - run: ctest -C $BUILD_TYPE + run: ctest -C ${{matrix.config.buildType}} - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 + - name: Upload nncase Build Artifact + uses: actions/upload-artifact@v3 with: - name: nncase-${{matrix.os}}-x86_64 + name: nncase-${{matrix.config.name}} path: ${{github.workspace}}/install if-no-files-found: error test-compiler: needs: [build] - runs-on: ${{ matrix.os }} + name: test-${{matrix.config.name}} + runs-on: ${{matrix.config.os}} strategy: matrix: - os: [ubuntu-18.04,windows-2019,macos-10.15] + config: + - {name: x86_64-macos, os: macos-11, shell: bash} + - {name: x86_64-linux, os: ubuntu-20.04, shell: bash} + - {name: x86_64-windows, os: windows-latest, shell: bash} - steps: - - uses: actions/checkout@v2 + env: + VULKANSDK_VER: 1.2.182.0 - - name: Install System Requirements - if: runner.os == 'Macos' - shell: bash + steps: + - uses: 
actions/checkout@v3 + + - name: Set up test environment (macOS) run: | - brew install sunnycase/core/libomp@11.1.0 - - - name: Install Vulkan SDK - if: runner.os == 'Linux' - shell: bash - env: - VULKANSDK_VER: 1.2.182.0 + brew install sunnycase/core/libomp@14.0.6 + aria2c --parameterized-uri=true https://{sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/mac,distfiles.macports.org/MoltenVK}/vulkansdk-macos-${VULKANSDK_VER}.dmg + hdiutil attach ./vulkansdk-macos-*.dmg + sudo /Volumes/vulkansdk-macos-*/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $HOME/VulkanSDK --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-* + echo "VULKAN_SDK=$HOME/VulkanSDK/macOS" >> $GITHUB_ENV + wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-macos-10.15-x86_64.zip -O swiftshader.zip + unzip swiftshader.zip + sudo cmake -E make_directory /usr/local/share/vulkan/icd.d + sudo cp lib/* /usr/local/share/vulkan/icd.d + echo "PYTHONPATH=$GITHUB_WORKSPACE/install/lib:$GITHUB_WORKSPACE/install/python:$GITHUB_WORKSPACE/tests" >> $GITHUB_ENV + if: runner.os == 'macOS' + + - name: Set up test environment (Linux) run: | - wget https://sdk.lunarg.com/sdk/download/1.2.182.0/linux/vulkansdk-linux-x86_64-${VULKANSDK_VER}.tar.gz -O vulkansdk.tar.gz + wget https://sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/linux/vulkansdk-linux-x86_64-${VULKANSDK_VER}.tar.gz -O vulkansdk.tar.gz tar xf vulkansdk.tar.gz sudo cp -P ${VULKANSDK_VER}/x86_64/lib/libvulkan.so* /usr/local/lib/ - - - name: Install Vulkan SDK - if: runner.os == 'Windows' - shell: pwsh - run: | - Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.2.182.0/windows/VulkanSDK-1.2.182.0-Installer.exe -O VulkanSDK-Installer.exe - .\VulkanSDK-Installer.exe /S - - - name: Install SwiftShader - if: runner.os != 'Windows' - shell: bash - run: | - wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-${{matrix.os}}-x86_64.zip -O 
swiftshader.zip + wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-ubuntu-18.04-x86_64.zip -O swiftshader.zip unzip swiftshader.zip sudo cmake -E make_directory /usr/local/share/vulkan/icd.d sudo cp lib/* /usr/local/share/vulkan/icd.d - - - name: Install SwiftShader - if: runner.os == 'Windows' + echo "PYTHONPATH=$GITHUB_WORKSPACE/install/lib:$GITHUB_WORKSPACE/install/python:$GITHUB_WORKSPACE/tests" >> $GITHUB_ENV + if: runner.os == 'Linux' + + - name: Set up test environment (Windows) shell: pwsh run: | - Invoke-WebRequest -Uri https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-${{matrix.os}}-x86_64.zip -OutFile swiftshader.zip + Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/${env:VULKANSDK_VER}/windows/VulkanSDK-${env:VULKANSDK_VER}-Installer.exe -O VulkanSDK-Installer.exe + .\VulkanSDK-Installer.exe /S + Invoke-WebRequest -Uri https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-windows-2019-x86_64.zip -OutFile swiftshader.zip Expand-Archive swiftshader.zip Copy-Item swiftshader\lib\vk_swiftshader_icd.json swiftshader\bin\ + echo "VK_ICD_FILENAMES=${env:GITHUB_WORKSPACE}/swiftshader/bin/vk_swiftshader_icd.json" >> $env:GITHUB_ENV + echo "PYTHONPATH=${env:GITHUB_WORKSPACE}/install/lib;${env:GITHUB_WORKSPACE}/install/python;${env:GITHUB_WORKSPACE}/tests" >> $env:GITHUB_ENV + if: runner.os == 'Windows' - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Python Packages - if: runner.os == 'Linux' - shell: bash - run: | - pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python - pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install imageio==2.15.0 - pip install 
https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-manylinux_2_24_x86_64.whl - pip install pytest - - - name: Install Python Packages - if: runner.os == 'Windows' - shell: bash - run: | - pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python - pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install imageio==2.15.0 - pip install https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-win_amd64.whl - pip install pytest - - - name: Install Python Packages - if: runner.os == 'Macos' - shell: bash - run: | - pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python - pip install torch==1.9.0 torchvision==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html - pip install imageio==2.15.0 - pip install https://github.com/kendryte/caffe/releases/download/v1.0.0.20210829/kendryte_caffe-1.0.0.20210829-cp37-cp37m-macosx_10_9_x86_64.whl - pip install pytest + run: pip install -r requirements.test.txt - name: Create Test Environment - working-directory: ${{github.workspace}} - shell: bash - run: | - mkdir test_results + run: mkdir test_results - - name: Download nncase Artifact - uses: actions/download-artifact@v2.0.9 + - name: Install nncase + uses: actions/download-artifact@v3 with: - name: nncase-${{matrix.os}}-x86_64 + name: nncase-${{matrix.config.name}} path: ${{github.workspace}}/install - - name: Test - working-directory: ${{github.workspace}} - if: runner.os != 'Windows' - shell: bash + - name: Generate benchmark kmodels + working-directory: ${{github.workspace}}/benchmark env: - PYTHONPATH: ${{github.workspace}}/install/lib:${{github.workspace}}/install/python:${{github.workspace}}/tests + PATH: 
${{github.workspace}}/install/bin run: | - pytest tests/other --doctest-modules --junitxml=test_results/other.xml - pytest tests/importer --doctest-modules --junitxml=test_results/importer.xml - pytest tests/schedule --doctest-modules --junitxml=test_results/schedule.xml - pytest tests/graph_partition --doctest-modules --junitxml=test_results/graph_partition.xml - pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml + python gen_kmodel.py + if: matrix.config.name == 'x86_64-linux' + + - uses: stefanzweifel/git-auto-commit-action@v4 + with: + commit_message: Update benchmark kmodels + file_pattern: 'benchmark/models/*' + if: matrix.config.name == 'x86_64-linux' - name: Test working-directory: ${{github.workspace}} - if: runner.os == 'Windows' shell: bash env: PATH: ${{github.workspace}}/install/bin - PYTHONPATH: ${{github.workspace}}/install/lib;${{github.workspace}}/install/python;${{github.workspace}}/tests - VK_ICD_FILENAMES: ${{github.workspace}}/swiftshader/bin/vk_swiftshader_icd.json run: | pytest tests/other --doctest-modules --junitxml=test_results/other.xml pytest tests/importer --doctest-modules --junitxml=test_results/importer.xml @@ -210,7 +165,7 @@ jobs: pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action@v1 - if: always() && runner.os == 'Linux' + uses: EnricoMi/publish-unit-test-result-action@v2 + if: always() && matrix.config.name == 'x86_64-linux' with: files: test_results/*.xml diff --git a/.github/workflows/compiler-python-release.yml b/.github/workflows/compiler-python-release.yml index 8b8b7fdaf5..6ac04e6a27 100644 --- a/.github/workflows/compiler-python-release.yml +++ b/.github/workflows/compiler-python-release.yml @@ -5,61 +5,58 @@ on: tags: - '*' -env: - BUILD_TYPE: Release - jobs: build: - runs-on: ${{ matrix.os }} + name: ${{matrix.config.name}} + runs-on: ${{matrix.config.os}} strategy: matrix: - os: 
[ubuntu-18.04,windows-2019,macos-10.15] + config: + - {name: x86_64-macos, os: macos-11} + - {name: x86_64-linux, os: ubuntu-20.04} + - {name: x86_64-windows, os: windows-latest, arch: x64} + + env: + VULKANSDK_VER: 1.2.182.0 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: seanmiddleditch/gha-setup-ninja@master + - name: Set up build environment (Windows, Visual Studio) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: ${{matrix.config.arch}} + if: runner.os == 'Windows' + + - name: Set up build environment (Macos) + run: | + brew install sunnycase/core/libomp@14.0.6 + aria2c --parameterized-uri=true https://{sdk.lunarg.com/sdk/download/${VULKANSDK_VER}/mac,distfiles.macports.org/MoltenVK}/vulkansdk-macos-${VULKANSDK_VER}.dmg + hdiutil attach ./vulkansdk-macos-*.dmg + sudo /Volumes/vulkansdk-macos-*/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $HOME/VulkanSDK --accept-licenses --default-answer --confirm-command install + hdiutil detach /Volumes/vulkansdk-macos-* + echo "VULKAN_SDK=$HOME/VulkanSDK/macOS" >> $GITHUB_ENV + wget https://github.com/sunnycase/swiftshader/releases/download/v1.0/swiftshader-macos-10.15-x86_64.zip -O swiftshader.zip + unzip swiftshader.zip + sudo cmake -E make_directory /usr/local/share/vulkan/icd.d + sudo cp lib/* /usr/local/share/vulkan/icd.d + if: runner.os == 'Macos' + - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install cibuildwheel run: pip install cibuildwheel - - name: Install System Requirements - if: runner.os == 'Macos' - shell: bash - run: | - brew install sunnycase/core/libomp@11.1.0 - - - name: Add msbuild to PATH - if: runner.os == 'Windows' - uses: ilammy/msvc-dev-cmd@v1.10.0 - - name: Build wheel run: python -m cibuildwheel --output-dir wheelhouse - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - if: runner.os == 'Windows' - with: - name: nncase-python-windows - path: 
${{github.workspace}}/wheelhouse - if-no-files-found: error - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - if: runner.os == 'Linux' - with: - name: nncase-python-linux - path: ${{github.workspace}}/wheelhouse - if-no-files-found: error - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - if: runner.os == 'Macos' + - name: Upload nncase-python Build Artifact + uses: actions/upload-artifact@v3 with: - name: nncase-python-macos + name: nncase-python-${{matrix.config.name}} path: ${{github.workspace}}/wheelhouse if-no-files-found: error diff --git a/.github/workflows/compiler-test.yml b/.github/workflows/compiler-test.yml index 623b778646..45a84fc791 100644 --- a/.github/workflows/compiler-test.yml +++ b/.github/workflows/compiler-test.yml @@ -2,112 +2,96 @@ name: compiler-test on: [push, pull_request] -env: - BUILD_TYPE: Release - jobs: build: + name: build-${{matrix.config.name}} runs-on: [self-hosted] + strategy: + matrix: + config: + - {name: x86_64-linux, shell: bash, cmakeArgs: '', buildType: Release} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Conan - shell: bash - run: | - pip install conan + run: pip install conan==1.59.0 - - name: Configure Conan - if: runner.os == 'Linux' - shell: bash + - name: Configure Conan (Linux) run: | conan profile update settings.compiler.libcxx=libstdc++11 default - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - env: - CC: gcc-10 - CXX: g++-10 + echo "CC=gcc-10" >> $GITHUB_ENV + echo "CXX=g++-10" >> $GITHUB_ENV if: runner.os == 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - - - name: Configure CMake - 
if: runner.os != 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - name: Build - shell: bash - working-directory: ${{github.workspace}}/build run: | - cmake --build . --config $BUILD_TYPE + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} + cmake --build build --config ${{matrix.config.buildType}} - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . --prefix /tmp/nncase + run: cmake --install build --prefix /tmp/nncase - name: CTest - shell: bash working-directory: ${{github.workspace}}/build/tests/kernels - run: ctest -C $BUILD_TYPE + run: ctest -C ${{matrix.config.buildType}} test-compiler: needs: [build] + name: test-${{matrix.config.name}} runs-on: [self-hosted] + strategy: + matrix: + config: + - {name: x86_64-linux, shell: bash} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Python Packages - if: runner.os != 'Macos' - shell: bash - run: | - pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python - pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install imageio==2.15.0 - pip install kendryte_caffe - pip install pytest + run: pip install -r requirements.test.txt - name: Create Test Environment - working-directory: ${{github.workspace}} - shell: bash - run: | - pip install pytest pytest-xdist - mkdir test_results + run: mkdir test_results - name: Test working-directory: ${{github.workspace}} - if: runner.os != 'Windows' - shell: bash env: PYTHONPATH: 
/tmp/nncase/lib:/tmp/nncase/python:${{github.workspace}}/tests ONNX_MODELS_DIR: /compiler/github-runner/onnx-models TFLITE_MODELS_DIR: /compiler/github-runner/tflite-models DATASET_DIR: /compiler/share run: | - pytest -n 50 --dist=load tests/other --doctest-modules --junitxml=test_results/other.xml pytest -n 50 --dist=load tests/importer --doctest-modules --junitxml=test_results/importer.xml pytest -n 50 --dist=load tests/schedule --doctest-modules --junitxml=test_results/schedule.xml pytest -n 50 --dist=load tests/graph_partition --doctest-modules --junitxml=test_results/graph_partition.xml pytest -n 50 --dist=load tests/transform --doctest-modules --junitxml=test_results/transform.xml - pytest -n 8 tests/models/onnx-model-zoo --doctest-modules --junitxml=test_results/models.xml + pytest -n 8 tests/models/onnx-model-zoo --doctest-modules --junitxml=test_results/onnx-models.xml pytest tests/examples --doctest-modules --junitxml=test_results/examples.xml for dir in `ls dataset_tests_output`; do cat dataset_tests_output/$dir/dataset_test_result.txt; done - + if: runner.os != 'Windows' + + - name: Upload Test Results + uses: actions/upload-artifact@v3 + with: + name: nncase-test_results-${{matrix.config.name}} + path: ${{github.workspace}}/test_results + if-no-files-found: error + + - name: Upload Dataset Test Results + uses: actions/upload-artifact@v3 + with: + name: nncase-dataset_test_results-${{matrix.config.name}} + path: ${{github.workspace}}/dataset_tests_output + if-no-files-found: error diff --git a/.github/workflows/dataset-test.yml b/.github/workflows/dataset-test.yml index 1ce1cbbbe5..5c81bcee83 100644 --- a/.github/workflows/dataset-test.yml +++ b/.github/workflows/dataset-test.yml @@ -5,99 +5,72 @@ on: - cron: '0 17 * * 6' # 1:00 am -env: - BUILD_TYPE: Release - jobs: build: + name: build-${{matrix.config.name}} runs-on: [self-hosted] + strategy: + matrix: + config: + - {name: x86_64-linux, shell: bash, cmakeArgs: '', buildType: Release} steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Conan - shell: bash - run: | - pip install conan + run: pip install conan==1.59.0 - - name: Configure Conan - if: runner.os == 'Linux' - shell: bash + - name: Configure Conan (Linux) run: | conan profile update settings.compiler.libcxx=libstdc++11 default + if: runner.os == 'Linux' - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake + - name: Build env: CC: gcc-10 CXX: g++-10 - if: runner.os == 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - - - name: Configure CMake - if: runner.os != 'Linux' - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build - shell: bash - working-directory: ${{github.workspace}}/build run: | - cmake --build . --config $BUILD_TYPE + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILD_TESTING=ON -DPython3_ROOT_DIR=${pythonLocation} + cmake --build build --config ${{matrix.config.buildType}} - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . 
--prefix /tmp/nncase + run: cmake --install build --prefix /tmp/nncase - name: CTest - shell: bash working-directory: ${{github.workspace}}/build/tests/kernels - run: ctest -C $BUILD_TYPE + run: ctest -C ${{matrix.config.buildType}} dataset-test: needs: [build] + name: test-${{matrix.config.name}} runs-on: [self-hosted] timeout-minutes: 4320 + strategy: + matrix: + config: + - {name: x86_64-linux, shell: bash} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python - uses: actions/setup-python@v2.2.1 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Install Python Packages - if: runner.os != 'Macos' - shell: bash - run: | - pip install conan tensorflow==2.5.0 matplotlib pillow onnx==1.9.0 onnx-simplifier==0.3.6 onnxoptimizer==0.2.6 onnxruntime==1.8.0 opencv-python - pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html - pip install imageio==2.15.0 - pip install kendryte_caffe - pip install pytest + run: pip install -r requirements.test.txt - name: Create Test Environment - working-directory: ${{github.workspace}} - shell: bash - run: | - pip install pytest pytest-xdist - mkdir test_results + run: mkdir test_results - name: Test working-directory: ${{github.workspace}} - if: runner.os != 'Windows' shell: bash env: PYTHONPATH: /tmp/nncase/lib:/tmp/nncase/python:${{github.workspace}}/tests @@ -105,6 +78,13 @@ jobs: TFLITE_MODELS_DIR: /compiler/github-runner/tflite-models DATASET_DIR: /compiler/share run: | - - pytest -n 8 tests/models/tflite-model-zoo --doctest-modules --junitxml=test_results/models-dataset.xml + pytest -n 8 tests/models/tflite-model-zoo --doctest-modules --junitxml=test_results/tflite-models.xml for dir in `ls dataset_tests_output`; do cat dataset_tests_output/$dir/dataset_test_result.txt; done + if: runner.os != 'Windows' + + - name: Upload Dataset Test Results + uses: actions/upload-artifact@v3 + with: + name: 
nncase-dataset_test_results-${{matrix.config.name}} + path: ${{github.workspace}}/dataset_tests_output + if-no-files-found: error diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index cb2ccbcaa1..41417d9ffc 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -5,25 +5,12 @@ on: pull_request: types: [opened, synchronize, reopened] paths: - - '**.h' - - '**.c' - - '**.cpp' - '**.py' jobs: - check_clang_format: - name: Check clang-format - runs-on: ubuntu-18.04 - steps: - - uses: actions/checkout@v2 - - uses: DoozyX/clang-format-lint-action@v0.11 - with: - source: 'tests src include modules python targets' - extensions: 'h,c,cc,cxx,cpp,hpp,cppm' - clangFormatVersion: 11 check_autopep8_format: name: Check autopep8-format - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - name: autopep8 id: autopep8 @@ -33,4 +20,4 @@ jobs: - name: Fail if autopep8 made changes if: steps.autopep8.outputs.exit-code == 2 - run: exit 1 \ No newline at end of file + run: exit 1 diff --git a/.github/workflows/runtime-build.yml b/.github/workflows/runtime-build.yml new file mode 100644 index 0000000000..331040e40b --- /dev/null +++ b/.github/workflows/runtime-build.yml @@ -0,0 +1,124 @@ +name: runtime-build + +on: [push, pull_request] + +jobs: + build-native: + name: ${{matrix.config.name}} + runs-on: ${{matrix.config.os}} + strategy: + matrix: + config: + - {name: x86_64-macos, os: macos-11, cmakeArgs: -G Ninja, buildType: Release} + - {name: x86_64-linux, os: ubuntu-20.04, cmakeArgs: -G Ninja, buildType: Release} + - {name: x86_64-windows, os: windows-latest, arch: x64, cmakeArgs: -G Ninja, buildType: Release} + + steps: + - uses: actions/checkout@v3 + - uses: seanmiddleditch/gha-setup-ninja@master + + - name: Set up build environment (Windows, Visual Studio) + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: ${{matrix.config.arch}} + if: runner.os == 'Windows' + + - name: Set up build environment (Macos) + run: | + brew 
install sunnycase/core/libomp@14.0.6 + if: runner.os == 'Macos' + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.7 + + - name: Install Conan + run: pip install conan==1.59.0 + + - name: Configure Conan (Linux) + run: | + conan profile new default --detect + conan profile update settings.compiler.libcxx=libstdc++11 default + echo "CC=gcc-10" >> $GITHUB_ENV + echo "CXX=g++-10" >> $GITHUB_ENV + if: runner.os == 'Linux' + + - name: Configure CMake + shell: bash + run: | + cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DPython3_ROOT_DIR=${pythonLocation} + + - name: Build & Install + run: | + cmake --build build --config ${{matrix.config.buildType}} + cmake --install build --prefix install + + - name: Benchmark + run: | + ${{github.workspace}}/install/bin/benchnncase > benchnncase.log + cat benchnncase.log + + - name: Upload nncaseruntime Build Artifact + uses: actions/upload-artifact@v3 + with: + name: nncaseruntime-${{matrix.config.name}} + path: ${{github.workspace}}/install + if-no-files-found: error + + - name: Upload nncaseruntime Benchmark + uses: actions/upload-artifact@v3 + with: + name: nncaseruntime-benchmark-${{matrix.config.name}} + path: ${{github.workspace}}/benchnncase.log + if-no-files-found: error + + build-cross: + name: ${{matrix.config.name}} + runs-on: ubuntu-latest + strategy: + matrix: + config: + - {name: riscv64-none-k210, shell: bash, toolchain: k210, cmakeArgs: -DK210_SDK_DIR=$GITHUB_WORKSPACE/kendryte-standalone-sdk-develop, buildType: Release} + + steps: + - uses: actions/checkout@v3 + - uses: seanmiddleditch/gha-setup-ninja@master + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.7 + + - name: Install K210 Baremetal SDK + shell: bash + run: | + wget 
https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz + sudo tar xf $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz -C $GITHUB_WORKSPACE + wget https://github.com/kendryte/kendryte-standalone-sdk/archive/refs/heads/develop.tar.gz -O $GITHUB_WORKSPACE/k210-sdk.tar.gz + sudo tar xf $GITHUB_WORKSPACE/k210-sdk.tar.gz -C $GITHUB_WORKSPACE + echo "RISCV_ROOT_PATH=$GITHUB_WORKSPACE/kendryte-toolchain" >> $GITHUB_ENV + if: matrix.config.name == 'riscv64-none-k210' + + - name: Install Conan + run: pip install conan==1.59.0 + + - name: Configure Conan (Linux) + run: | + conan profile new default --detect + conan profile update settings.compiler.libcxx=libstdc++11 default + + - name: Build + run: | + cmake -B build -G Ninja -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/${{matrix.config.toolchain}}.toolchain.cmake -DCMAKE_BUILD_TYPE=${{matrix.config.buildType}} ${{matrix.config.cmakeArgs}} -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DPython3_ROOT_DIR=${pythonLocation} + cmake --build build --config ${{matrix.config.buildType}} + + - name: Install + run: cmake --install build --prefix install + + - name: Upload nncaseruntime Build Artifact + uses: actions/upload-artifact@v3 + with: + name: nncaseruntime-${{matrix.config.name}} + path: ${{github.workspace}}/install + if-no-files-found: error diff --git a/.github/workflows/runtime-k210.yml b/.github/workflows/runtime-k210.yml deleted file mode 100644 index cc98ee8c3b..0000000000 --- a/.github/workflows/runtime-k210.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: runtime-k210 - -on: [push, pull_request] - -env: - BUILD_TYPE: Release - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-18.04] - - steps: - - uses: actions/checkout@v2 - - uses: seanmiddleditch/gha-setup-ninja@master - - - name: Download K210 Toolchains - if: runner.os == 'Linux' - shell: bash - run: | - 
wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz - - - name: Install K210 Toolchains - shell: bash - run: | - sudo tar xf $GITHUB_WORKSPACE/kendryte-toolchain.tar.xz -C $GITHUB_WORKSPACE - - - name: Download K210 SDK - shell: bash - run: | - wget https://github.com/kendryte/kendryte-standalone-sdk/archive/refs/heads/develop.tar.gz -O $GITHUB_WORKSPACE/k210-sdk.tar.gz - - - name: Install K210 SDK - shell: bash - run: | - sudo tar xf $GITHUB_WORKSPACE/k210-sdk.tar.gz -C $GITHUB_WORKSPACE - - - name: Setup Python - uses: actions/setup-python@v2.2.1 - with: - python-version: 3.7 - - - name: Install Conan - run: | - pip install conan - - - name: Configure Conan - shell: bash - run: | - conan profile new default --detect - conan profile update settings.compiler.libcxx=libstdc++11 default - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - env: - RISCV_ROOT_PATH: ${{github.workspace}}/kendryte-toolchain - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DK210_SDK_DIR=$GITHUB_WORKSPACE/kendryte-standalone-sdk-develop -DBUILDING_RUNTIME=TRUE -DBUILD_PYTHON_BINDING=OFF -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/k210.toolchain.cmake -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build - working-directory: ${{github.workspace}}/build - shell: bash - run: cmake --build . --config $BUILD_TYPE - - - name: Install - working-directory: ${{github.workspace}}/build - shell: bash - run: cmake --install . 
--prefix ../install - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - with: - name: nncaseruntime-k210 - path: ${{github.workspace}}/install - if-no-files-found: error diff --git a/.github/workflows/runtime-linux-x64-gcc.yml b/.github/workflows/runtime-linux-x64-gcc.yml deleted file mode 100644 index fe755cd81a..0000000000 --- a/.github/workflows/runtime-linux-x64-gcc.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: runtime-linux-x64-gcc - -on: [push, pull_request] - -env: - BUILD_TYPE: Release - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-18.04] - - steps: - - uses: actions/checkout@v2 - - uses: seanmiddleditch/gha-setup-ninja@master - - - name: Setup Python - uses: actions/setup-python@v2.2.1 - with: - python-version: 3.7 - - - name: Install Conan - run: | - pip install conan - - - name: Configure Conan - shell: bash - run: | - conan profile new default --detect - conan profile update settings.compiler.libcxx=libstdc++11 default - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - env: - CC: gcc-7 - CXX: g++-7 - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --build . --config $BUILD_TYPE - - - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . 
--prefix ../install - - - name: Benchmark - shell: bash - working-directory: ${{github.workspace}} - run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - with: - name: nncaseruntime-linux-x64-gcc - path: ${{github.workspace}}/install - if-no-files-found: error - - - name: Upload Benchmark Result - uses: actions/upload-artifact@v2.2.2 - with: - name: nncasebenchmark-linux-x64-gcc - path: ${{github.workspace}}/benchnncase.log - if-no-files-found: error diff --git a/.github/workflows/runtime-macos-x64-appleclang.yml b/.github/workflows/runtime-macos-x64-appleclang.yml deleted file mode 100644 index 45b4a489dc..0000000000 --- a/.github/workflows/runtime-macos-x64-appleclang.yml +++ /dev/null @@ -1,69 +0,0 @@ -name: runtime-macos-x64-appleclang - -on: [push, pull_request] - -env: - BUILD_TYPE: Release - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [macos-10.15] - - steps: - - uses: actions/checkout@v2 - - uses: seanmiddleditch/gha-setup-ninja@master - - - name: Install System Requirements - shell: bash - run: | - brew install sunnycase/core/libomp@11.1.0 - - - name: Setup Python - uses: actions/setup-python@v2.2.1 - with: - python-version: 3.7 - - - name: Install Conan - shell: bash - run: | - pip install conan - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --build . --config $BUILD_TYPE - - - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . 
--prefix ../install - - - name: Benchmark - shell: bash - working-directory: ${{github.workspace}} - run: ${{github.workspace}}/install/bin/benchnncase > benchnncase.log - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - with: - name: nncaseruntime-macos-x64-appleclang - path: ${{github.workspace}}/install - if-no-files-found: error - - - name: Upload Benchmark Result - uses: actions/upload-artifact@v2.2.2 - with: - name: nncasebenchmark-macos-x64-appleclang - path: ${{github.workspace}}/benchnncase.log - if-no-files-found: error diff --git a/.github/workflows/runtime-win-x64-msvc.yml b/.github/workflows/runtime-win-x64-msvc.yml deleted file mode 100644 index 5cef593961..0000000000 --- a/.github/workflows/runtime-win-x64-msvc.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: runtime-win-x64-msvc - -on: [push, pull_request] - -env: - BUILD_TYPE: Release - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [windows-2019] - - steps: - - uses: actions/checkout@v2 - - uses: seanmiddleditch/gha-setup-ninja@master - - - name: Add msbuild to PATH - if: runner.os == 'Windows' - uses: ilammy/msvc-dev-cmd@v1.10.0 - - - name: Setup Python - uses: actions/setup-python@v2.2.1 - with: - python-version: 3.7 - - - name: Install Conan - shell: bash - run: | - pip install conan - - - name: Create Build Environment - run: cmake -E make_directory ${{github.workspace}}/build - - - name: Configure CMake - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake -G Ninja $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILDING_RUNTIME=TRUE -DPython3_ROOT_DIR=${pythonLocation} - - - name: Build - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --build . --config $BUILD_TYPE - - - name: Install - shell: bash - working-directory: ${{github.workspace}}/build - run: cmake --install . 
--prefix ../install - - - name: Benchmark - shell: pwsh - working-directory: ${{github.workspace}} - run: .\install\bin\benchnncase.exe > benchnncase.log - - - name: Upload a Build Artifact - uses: actions/upload-artifact@v2.2.2 - with: - name: nncaseruntime-win-x64-msvc - path: ${{github.workspace}}/install - if-no-files-found: error - - - name: Upload Benchmark Result - uses: actions/upload-artifact@v2.2.2 - with: - name: nncasebenchmark-win-x64-msvc - path: ${{github.workspace}}/benchnncase.log - if-no-files-found: error diff --git a/CMakeLists.txt b/CMakeLists.txt index e7d77a2090..275db97f00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,31 +1,31 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.13) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules) if(NOT DEFINED NNCASE_VERSION) - set(NNCASE_VERSION "1.0.0") + set(NNCASE_VERSION "1.9.0") endif() if(DEFINED ENV{NNCASE_VERSION_SUFFIX}) - set(NNCASE_VERSION_SUFFIX $ENV{NNCASE_VERSION_SUFFIX}) + set(NNCASE_VERSION_SUFFIX $ENV{NNCASE_VERSION_SUFFIX}) endif() if(NOT DEFINED NNCASE_VERSION_SUFFIX) - find_package (Git) - execute_process( - COMMAND ${GIT_EXECUTABLE} describe --always --dirty - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_DESC - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(NNCASE_VERSION_SUFFIX "-${GIT_DESC}") + find_package(Git) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --always --dirty + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_DESC + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(NNCASE_VERSION_SUFFIX "-${GIT_DESC}") endif() -if (NOT PACKAGE_VERSION) - set(PACKAGE_VERSION - "${NNCASE_VERSION}${NNCASE_VERSION_SUFFIX}") +if(NOT PACKAGE_VERSION) + set(PACKAGE_VERSION "${NNCASE_VERSION}${NNCASE_VERSION_SUFFIX}") endif() -project(nncase +project( + nncase VERSION ${NNCASE_VERSION} LANGUAGES C CXX ASM) @@ -35,208 +35,259 @@ option(BUILD_PYTHON_BINDING "Build python binding" ON) option(BUILD_BENCHMARK "Build benchmark 
programs" ON) option(BUILD_TESTING "Build test programs" OFF) option(ENABLE_OP_PROFILE "Profile ops cast time" OFF) -if (ENABLE_OP_PROFILE) - add_definitions(-DENABLE_OP_PROFILE) +if(ENABLE_OP_PROFILE) + add_definitions(-DENABLE_OP_PROFILE) endif() -if (BUILDING_RUNTIME) - option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" ON) - option(ENABLE_K210_RUNTIME "Enable k210 runtime" OFF) - option(DEFAULT_BUILTIN_RUNTIMES "Use default builtin runtimes" ON) - option(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL "Use default shared memory platform impl" ON) +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES + "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)|(X86_64)") + if(NOT TURNOFF_SIMD_OPTIMIZE) + include(toolchains/x86_64.toolchain.cmake) + endif() +endif() + +if(BUILDING_RUNTIME) + option(ENABLE_VULKAN_RUNTIME "Enable Vulkan runtime" ON) + option(ENABLE_K210_RUNTIME "Enable k210 runtime" OFF) + option(DEFAULT_BUILTIN_RUNTIMES "Use default builtin runtimes" ON) + option(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL + "Use default shared memory platform impl" ON) endif() include(cmake/configure-conan.cmake) include(cmake/conan.cmake) -if(NOT CONAN_EXPORTED) - conan_check() - conan_add_remote(NAME sunnycase URL https://conan.sunnycase.moe INDEX 0) +if(NOT CONAN_EXPORTED) + conan_check() + conan_add_remote(NAME sunnycase URL https://conan.sunnycase.moe INDEX 0) endif() if(CONAN_EXPORTED) # in conan local cache - message(STATUS "Standard Conan Installation") - include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) - conan_basic_setup() # NOTE need manmul set cppstd in conanfile.py + message(STATUS "Standard Conan Installation") + include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) + conan_basic_setup() # NOTE need manmul set cppstd in conanfile.py else() # in user space - message(STATUS "Auto Cmake Conan Installation") - include(${CMAKE_SOURCE_DIR}/cmake/conan.cmake) - conan_cmake_run(CONANFILE conanfile.py - BASIC_SETUP - OPTIONS ${CONAN_OPTS} - SETTINGS ${CONAN_SETTINGS} - BUILD missing) + 
message(STATUS "Auto Cmake Conan Installation") + include(${CMAKE_SOURCE_DIR}/cmake/conan.cmake) + conan_cmake_run( + CONANFILE + conanfile.py + BASIC_SETUP + OPTIONS + ${CONAN_OPTS} + SETTINGS + ${CONAN_SETTINGS} + BUILD + missing) endif() include(${CMAKE_BINARY_DIR}/conan_paths.cmake) include(cmake/dependencies.cmake) -if (BUILDING_RUNTIME) - set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include) - set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) - set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party) - set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON) - - if (MSVC) - add_definitions(/D_CRT_SECURE_NO_WARNINGS /DNOMINMAX) - add_compile_options(/wd4267 /wd4251 /wd4244 /FC /utf-8 /W3 /WX) +if(BUILDING_RUNTIME) + set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include) + set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) + set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party) + set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON) + + if(MSVC) + add_definitions(/D_CRT_SECURE_NO_WARNINGS /DNOMINMAX) + add_compile_options( + /wd4267 + /wd4251 + /wd4244 + /FC + /utf-8 + /W3 + /WX) + else() + add_compile_options( + -Wall + -Wextra + -pedantic + -Werror + -Wno-multichar + -Wno-missing-field-initializers + -Wno-unused-function + -Wno-type-limits) + if(APPLE) + add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + add_compile_options(-Wno-uninitialized -Wno-unused-private-field) else() - add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits) - if (APPLE) - add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - add_compile_options(-Wno-uninitialized -Wno-unused-private-field) - else () - add_compile_options(-Wno-maybe-uninitialized -Wno-unused-private-field) - endif() - endif() - - 
include_directories(${NNCASE_MAIN_INCLUDE_DIR}) - include_directories(${NNCASE_INCLUDE_DIR}) - - add_subdirectory(include/nncase) - add_subdirectory(src/kernels) - add_subdirectory(src/runtime) - add_subdirectory(src/functional) - if(BUILD_BENCHMARK) - add_subdirectory(benchmark) + add_compile_options(-Wno-maybe-uninitialized -Wno-unused-private-field) endif() - - # Python binding - if(BUILD_PYTHON_BINDING) - add_subdirectory(python/nncaseruntime/native) - endif() - - install(DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase - DESTINATION include - COMPONENT nncase-headers - FILES_MATCHING - PATTERN "*version.h" - PATTERN "CMakeFiles" EXCLUDE - ) - - install(DIRECTORY include/nncase/kernels - DESTINATION include/nncase - COMPONENT nncase-headers - FILES_MATCHING - PATTERN "*.def" - PATTERN "*.h" - PATTERN "*.hpp" - PATTERN "*.td" - PATTERN "*.inc" - PATTERN "LICENSE.TXT" - ) - - install(DIRECTORY include/nncase/runtime - DESTINATION include/nncase - COMPONENT nncase-headers - FILES_MATCHING - PATTERN "*.def" - PATTERN "*.h" - PATTERN "*.hpp" - PATTERN "*.td" - PATTERN "*.inc" - PATTERN "LICENSE.TXT" - ) + endif() + + include_directories(${NNCASE_MAIN_INCLUDE_DIR}) + include_directories(${NNCASE_INCLUDE_DIR}) + + add_subdirectory(include/nncase) + add_subdirectory(src/kernels) + add_subdirectory(src/runtime) + add_subdirectory(src/functional) + if(BUILD_BENCHMARK) + add_subdirectory(benchmark) + endif() + + # Python binding + if(BUILD_PYTHON_BINDING) + add_subdirectory(python/nncaseruntime/native) + endif() + + install( + DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase + DESTINATION include + COMPONENT nncase-headers + FILES_MATCHING + PATTERN "*version.h" + PATTERN "CMakeFiles" EXCLUDE) + + install( + DIRECTORY include/nncase/kernels + DESTINATION include/nncase + COMPONENT nncase-headers + FILES_MATCHING + PATTERN "*.def" + PATTERN "*.h" + PATTERN "*.hpp" + PATTERN "*.td" + PATTERN "*.inc" + PATTERN "LICENSE.TXT") + + install( + DIRECTORY include/nncase/runtime + DESTINATION 
include/nncase + COMPONENT nncase-headers + FILES_MATCHING + PATTERN "*.def" + PATTERN "*.h" + PATTERN "*.hpp" + PATTERN "*.td" + PATTERN "*.inc" + PATTERN "LICENSE.TXT") else() - set(CMAKE_SKIP_RPATH OFF) - - set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include) - set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) - set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party) - set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON) - if (APPLE) - set(CMAKE_MACOSX_RPATH TRUE) - set(CMAKE_INSTALL_RPATH "@loader_path") - set(CMAKE_INSTALL_NAME_DIR "@rpath") - else () - set(CMAKE_INSTALL_RPATH "$ORIGIN") - endif() - - if (MSVC) - add_definitions(/D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /D_CRT_SECURE_NO_WARNINGS /DNOMINMAX) - add_compile_options(/wd4267 /wd4251 /wd4244 /FC /utf-8 /W3 /WX) - set(PYBIND11_CPP_STANDARD "/std:c++latest") + set(CMAKE_SKIP_RPATH OFF) + + set(NNCASE_MAIN_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include) + set(NNCASE_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) + set(THIRD_PARTY ${CMAKE_CURRENT_LIST_DIR}/third_party) + set_property(GLOBAL PROPERTY POSITION_INDEPENDENT_CODE ON) + if(APPLE) + set(CMAKE_MACOSX_RPATH TRUE) + set(CMAKE_INSTALL_RPATH "@loader_path") + set(CMAKE_INSTALL_NAME_DIR "@rpath") + else() + set(CMAKE_INSTALL_RPATH "$ORIGIN") + endif() + + if(MSVC) + add_definitions(/D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS + /D_CRT_SECURE_NO_WARNINGS /DNOMINMAX) + add_compile_options( + /wd4267 + /wd4251 + /wd4244 + /FC + /utf-8 + /W3 + /WX) + set(PYBIND11_CPP_STANDARD "/std:c++latest") + else() + add_compile_options(-fvisibility=hidden) + add_compile_options( + -Wall + -Wextra + -pedantic + -Werror + -Wno-multichar + -Wno-missing-field-initializers + -Wno-unused-function + -Wno-type-limits + -Wno-unused-local-typedefs + -Wno-sign-compare) + if(APPLE) + add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized + -Wno-deprecated) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + 
add_compile_options(-Wno-uninitialized) else() - add_compile_options(-fvisibility=hidden) - add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-multichar -Wno-missing-field-initializers -Wno-unused-function -Wno-type-limits -Wno-unused-local-typedefs -Wno-sign-compare) - if (APPLE) - add_compile_options(-Wno-four-char-constants -Wno-sometimes-uninitialized -Wno-deprecated) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - add_compile_options(-Wno-uninitialized) - else () - add_compile_options(-Wno-maybe-uninitialized -Wno-deprecated-copy) - add_link_options(-Wl,--exclude-libs,ALL) - endif() - endif() - - include_directories(${NNCASE_MAIN_INCLUDE_DIR}) - include_directories(${NNCASE_INCLUDE_DIR}) - - add_subdirectory(include/nncase) - add_subdirectory(src/nncase) - add_subdirectory(src/data) - add_subdirectory(src/ir) - add_subdirectory(src/importer) - add_subdirectory(src/schedule) - add_subdirectory(src/evaluator) - add_subdirectory(src/functional) - add_subdirectory(src/transforms) - add_subdirectory(src/codegen) - add_subdirectory(src/kernels) - add_subdirectory(src/runtime) - add_subdirectory(src/targets) - add_subdirectory(src/plugin) - add_subdirectory(src/cli) - - if(BUILD_TESTING) - add_subdirectory(tests/kernels) - endif() - - # Python binding - if(BUILD_PYTHON_BINDING) - add_subdirectory(python/nncase/native) + add_compile_options(-Wno-maybe-uninitialized -Wno-deprecated-copy) + if(CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL "12.0") + add_compile_options(-Wno-array-bounds -Wno-deprecated-declarations + -Wno-restrict) + endif() + add_link_options(-Wl,--exclude-libs,ALL) endif() - - # Thrid party - add_subdirectory(third_party/onnx) - - install(DIRECTORY include/nncase - DESTINATION include - COMPONENT nncase-headers - FILES_MATCHING - PATTERN "*.def" - PATTERN "*.h" - PATTERN "*.hpp" - PATTERN "*.td" - PATTERN "*.inc" - PATTERN "LICENSE.TXT" - ) - - install(DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase - DESTINATION include - COMPONENT nncase-headers - 
FILES_MATCHING - PATTERN "*.def" - PATTERN "*.h" - PATTERN "*.hpp" - PATTERN "*.td" - PATTERN "*.inc" - PATTERN "CMakeFiles" EXCLUDE - PATTERN "config.h" EXCLUDE - ) - - install(DIRECTORY python/nncase - DESTINATION python - COMPONENT nncase-python - FILES_MATCHING - PATTERN "*.py" - ) - - # Targets - add_subdirectory(targets/cpu) - add_subdirectory(targets/k210) - add_subdirectory(targets/vulkan) + endif() + + include_directories(${NNCASE_MAIN_INCLUDE_DIR}) + include_directories(${NNCASE_INCLUDE_DIR}) + + add_subdirectory(include/nncase) + add_subdirectory(src/nncase) + add_subdirectory(src/data) + add_subdirectory(src/ir) + add_subdirectory(src/importer) + add_subdirectory(src/schedule) + add_subdirectory(src/evaluator) + add_subdirectory(src/functional) + add_subdirectory(src/transforms) + add_subdirectory(src/codegen) + add_subdirectory(src/kernels) + add_subdirectory(src/runtime) + add_subdirectory(src/targets) + add_subdirectory(src/plugin) + add_subdirectory(src/cli) + + if(BUILD_TESTING) + add_subdirectory(tests/kernels) + endif() + + # Python binding + if(BUILD_PYTHON_BINDING) + add_subdirectory(python/nncase/native) + endif() + + # Thrid party + add_subdirectory(third_party/onnx) + + install( + DIRECTORY include/nncase + DESTINATION include + COMPONENT nncase-headers + FILES_MATCHING + PATTERN "*.def" + PATTERN "*.h" + PATTERN "*.hpp" + PATTERN "*.td" + PATTERN "*.inc" + PATTERN "LICENSE.TXT") + + install( + DIRECTORY ${NNCASE_INCLUDE_DIR}/nncase + DESTINATION include + COMPONENT nncase-headers + FILES_MATCHING + PATTERN "*.def" + PATTERN "*.h" + PATTERN "*.hpp" + PATTERN "*.td" + PATTERN "*.inc" + PATTERN "CMakeFiles" EXCLUDE + PATTERN "config.h" EXCLUDE) + + install( + DIRECTORY python/nncase + DESTINATION python + COMPONENT nncase-python + FILES_MATCHING + PATTERN "*.py") + + # Targets + add_subdirectory(targets/cpu) + add_subdirectory(targets/k210) + add_subdirectory(targets/vulkan) endif() # Modules diff --git a/benchmark/gen_kmodel.py 
b/benchmark/gen_kmodel.py index 3f5765b89e..6068ea5563 100644 --- a/benchmark/gen_kmodel.py +++ b/benchmark/gen_kmodel.py @@ -31,12 +31,12 @@ MODELS = { "mnist": { - "url": "https://media.githubusercontent.com/media/onnx/models/master/vision/classification/mnist/model/mnist-8.onnx", + "url": "https://github.com/onnx/models/raw/main/vision/classification/mnist/model/mnist-8.onnx", "in_shapes": {"Input3": [1, 1, 28, 28]} }, "mobilenet_v2": { - "url": "https://github.com/onnx/models/raw/master/vision/classification/mobilenet/model/mobilenetv2-7.onnx", - "in_shapes": {"input": [1, 3, 224, 224]} + "url": "https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx", + "in_shapes": {"data": [1, 3, 224, 224]} } } @@ -47,7 +47,7 @@ def _download(url, name, in_shapes): req = requests.get(url) onnx_model, check = onnxsim.simplify( onnx.load_model(BytesIO(req.content)), check_n=3, input_shapes=in_shapes) - assert check, "Simplified ONNX model could not be validated" + # assert check, "Simplified ONNX model could not be validated" onnx.save(onnx_model, filename) with open(filename, "rb") as file: @@ -65,10 +65,10 @@ def _make_module(name, target): compile_options.input_layout = "NCHW" compile_options.output_layout = "NCHW" compile_options.dump_dir = os.path.join(TEMP_DIR, name) - compile_options.dump_ir = True - compile_options.dump_asm = True - compile_options.dump_quant_error = True - compile_options.dump_import_op_range = True + compile_options.dump_ir = False + compile_options.dump_asm = False + compile_options.dump_quant_error = False + compile_options.dump_import_op_range = False compile_options.use_mse_quant_w = True compile_options.split_w_to_act = False compile_options.benchmark_only = True diff --git a/benchmark/models/cpu/mnist.kmodel b/benchmark/models/cpu/mnist.kmodel index bc3ec30a86..2e7a5e1db9 100644 Binary files a/benchmark/models/cpu/mnist.kmodel and b/benchmark/models/cpu/mnist.kmodel differ diff --git 
a/benchmark/models/cpu/mobilenet_v2.kmodel b/benchmark/models/cpu/mobilenet_v2.kmodel index d070a1c43a..8aec579391 100644 Binary files a/benchmark/models/cpu/mobilenet_v2.kmodel and b/benchmark/models/cpu/mobilenet_v2.kmodel differ diff --git a/cmake/conan.cmake b/cmake/conan.cmake index 208ce24855..66b381dbfd 100644 --- a/cmake/conan.cmake +++ b/cmake/conan.cmake @@ -33,7 +33,7 @@ # but it is only necessary on the end-user side. It is not necessary to create conan # packages, in fact it shouldn't be use for that. Check the project documentation. -# version: 0.18.0-dev +# version: 0.19.0-dev include(CMakeParseArguments) @@ -95,7 +95,7 @@ macro(_conan_check_system_name) endif() if(${CMAKE_SYSTEM_NAME} STREQUAL "QNX") set(CONAN_SYSTEM_NAME Neutrino) - endif() + endif() set(CONAN_SUPPORTED_PLATFORMS Windows Linux Macos Android iOS FreeBSD WindowsStore WindowsCE watchOS tvOS FreeBSD SunOS AIX Arduino Emscripten Neutrino) list (FIND CONAN_SUPPORTED_PLATFORMS "${CONAN_SYSTEM_NAME}" _index) if (${_index} GREATER -1) @@ -132,18 +132,28 @@ macro(_conan_detect_compiler) set(_CONAN_SETTING_COMPILER_CPPSTD ${CMAKE_CXX_STANDARD}) endif() - if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU) - # using GCC + if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU OR ${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL QCC) + # using GCC or QCC # TODO: Handle other params string(REPLACE "." 
";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION}) list(GET VERSION_LIST 0 MAJOR) list(GET VERSION_LIST 1 MINOR) - set(COMPILER_VERSION ${MAJOR}.${MINOR}) - if(${MAJOR} GREATER 4) - set(COMPILER_VERSION ${MAJOR}) - endif() - set(_CONAN_SETTING_COMPILER gcc) + + if (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL GNU) + set(_CONAN_SETTING_COMPILER gcc) + # mimic Conan client autodetection + if (${MAJOR} GREATER_EQUAL 5) + set(COMPILER_VERSION ${MAJOR}) + else() + set(COMPILER_VERSION ${MAJOR}.${MINOR}) + endif() + elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL QCC) + set(_CONAN_SETTING_COMPILER qcc) + set(COMPILER_VERSION ${MAJOR}.${MINOR}) + endif () + set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION}) + if (USING_CXX) conan_cmake_detect_unix_libcxx(_LIBCXX) set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX}) @@ -152,7 +162,7 @@ macro(_conan_detect_compiler) string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION}) list(GET VERSION_LIST 0 MAJOR) list(GET VERSION_LIST 1 MINOR) - set(COMPILER_VERSION ${MAJOR}.${MINOR}) + set(COMPILER_VERSION ${MAJOR}) set(_CONAN_SETTING_COMPILER intel) set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION}) if (USING_CXX) @@ -164,18 +174,39 @@ macro(_conan_detect_compiler) string(REPLACE "." 
";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION}) list(GET VERSION_LIST 0 MAJOR) list(GET VERSION_LIST 1 MINOR) + + # mimic Conan client autodetection + if (${MAJOR} GREATER_EQUAL 13) + set(COMPILER_VERSION ${MAJOR}) + else() + set(COMPILER_VERSION ${MAJOR}.${MINOR}) + endif() + + set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION}) + set(_CONAN_SETTING_COMPILER apple-clang) - set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR}.${MINOR}) if (USING_CXX) conan_cmake_detect_unix_libcxx(_LIBCXX) set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX}) endif () - elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang) + elseif (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang + AND NOT "${CMAKE_${LANGUAGE}_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC" + AND NOT "${CMAKE_${LANGUAGE}_SIMULATE_ID}" STREQUAL "MSVC") + string(REPLACE "." ";" VERSION_LIST ${CMAKE_${LANGUAGE}_COMPILER_VERSION}) list(GET VERSION_LIST 0 MAJOR) list(GET VERSION_LIST 1 MINOR) set(_CONAN_SETTING_COMPILER clang) - set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR}.${MINOR}) + + # mimic Conan client autodetection + if (${MAJOR} GREATER_EQUAL 8) + set(COMPILER_VERSION ${MAJOR}) + else() + set(COMPILER_VERSION ${MAJOR}.${MINOR}) + endif() + + set(_CONAN_SETTING_COMPILER_VERSION ${COMPILER_VERSION}) + if(APPLE) cmake_policy(GET CMP0025 APPLE_CLANG_POLICY) if(NOT APPLE_CLANG_POLICY STREQUAL NEW) @@ -183,14 +214,15 @@ macro(_conan_detect_compiler) set(_CONAN_SETTING_COMPILER apple-clang) endif() endif() - if(${_CONAN_SETTING_COMPILER} STREQUAL clang AND ${MAJOR} GREATER 7) - set(_CONAN_SETTING_COMPILER_VERSION ${MAJOR}) - endif() if (USING_CXX) conan_cmake_detect_unix_libcxx(_LIBCXX) set(_CONAN_SETTING_COMPILER_LIBCXX ${_LIBCXX}) endif () - elseif(${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL MSVC) + elseif(${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL MSVC + OR (${CMAKE_${LANGUAGE}_COMPILER_ID} STREQUAL Clang + AND "${CMAKE_${LANGUAGE}_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC" + AND "${CMAKE_${LANGUAGE}_SIMULATE_ID}" 
STREQUAL "MSVC")) + set(_VISUAL "Visual Studio") _get_msvc_ide_version(_VISUAL_VERSION) if("${_VISUAL_VERSION}" STREQUAL "") @@ -281,7 +313,7 @@ function(conan_cmake_settings result) string(REGEX MATCH "[^=]*" MANUAL_SETTING "${ARG}") message(STATUS "Conan: ${MANUAL_SETTING} was added as an argument. Not using the autodetected one.") list(REMOVE_ITEM ARGUMENTS_PROFILE_AUTO "${MANUAL_SETTING}") - endforeach() + endforeach() # Automatic from CMake foreach(ARG ${ARGUMENTS_PROFILE_AUTO}) @@ -398,7 +430,7 @@ function(conan_cmake_detect_vs_runtime result) if(build_type) string(TOUPPER "${build_type}" build_type) - endif() + endif() set(variables CMAKE_CXX_FLAGS_${build_type} CMAKE_C_FLAGS_${build_type} CMAKE_CXX_FLAGS CMAKE_C_FLAGS) foreach(variable ${variables}) if(NOT "${${variable}}" STREQUAL "") @@ -443,17 +475,18 @@ function(conan_cmake_autodetect detected_settings) endfunction() macro(conan_parse_arguments) - set(options BASIC_SETUP CMAKE_TARGETS UPDATE KEEP_RPATHS NO_LOAD NO_OUTPUT_DIRS OUTPUT_QUIET NO_IMPORTS SKIP_STD) - set(oneValueArgs CONANFILE ARCH BUILD_TYPE INSTALL_FOLDER CONAN_COMMAND) - set(multiValueArgs DEBUG_PROFILE RELEASE_PROFILE RELWITHDEBINFO_PROFILE MINSIZEREL_PROFILE - PROFILE REQUIRES OPTIONS IMPORTS SETTINGS BUILD ENV GENERATORS PROFILE_AUTO - INSTALL_ARGS CONFIGURATION_TYPES PROFILE_BUILD BUILD_REQUIRES) - cmake_parse_arguments(ARGUMENTS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(options BASIC_SETUP CMAKE_TARGETS UPDATE KEEP_RPATHS NO_LOAD NO_OUTPUT_DIRS + OUTPUT_QUIET NO_IMPORTS SKIP_STD) + set(oneValueArgs CONANFILE ARCH BUILD_TYPE INSTALL_FOLDER OUTPUT_FOLDER CONAN_COMMAND) + set(multiValueArgs DEBUG_PROFILE RELEASE_PROFILE RELWITHDEBINFO_PROFILE MINSIZEREL_PROFILE + PROFILE REQUIRES OPTIONS IMPORTS SETTINGS BUILD ENV GENERATORS PROFILE_AUTO + INSTALL_ARGS CONFIGURATION_TYPES PROFILE_BUILD BUILD_REQUIRES) + cmake_parse_arguments(ARGUMENTS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) endmacro() 
function(old_conan_cmake_install) # Calls "conan install" - # Argument BUILD is equivalant to --build={missing, PkgName,...} or + # Argument BUILD is equivalent to --build={missing, PkgName,...} or # --build when argument is 'BUILD all' (which builds all packages from source) # Argument CONAN_COMMAND, to specify the conan path, e.g. in case of running from source # cmake does not identify conan as command, even if it is +x and it is in the path @@ -502,6 +535,10 @@ function(old_conan_cmake_install) if(ARGUMENTS_INSTALL_FOLDER) set(CONAN_INSTALL_FOLDER -if=${ARGUMENTS_INSTALL_FOLDER}) endif() + set(CONAN_OUTPUT_FOLDER "") + if(ARGUMENTS_OUTPUT_FOLDER) + set(CONAN_OUTPUT_FOLDER -of=${ARGUMENTS_OUTPUT_FOLDER}) + endif() foreach(ARG ${ARGUMENTS_GENERATORS}) set(CONAN_GENERATORS ${CONAN_GENERATORS} -g=${ARG}) endforeach() @@ -539,9 +576,9 @@ function(conan_cmake_install) endif() set(installOptions UPDATE NO_IMPORTS OUTPUT_QUIET ERROR_QUIET) - set(installOneValueArgs PATH_OR_REFERENCE REFERENCE REMOTE LOCKFILE LOCKFILE_OUT LOCKFILE_NODE_ID INSTALL_FOLDER) + set(installOneValueArgs PATH_OR_REFERENCE REFERENCE REMOTE LOCKFILE LOCKFILE_OUT LOCKFILE_NODE_ID INSTALL_FOLDER OUTPUT_FOLDER) set(installMultiValueArgs GENERATOR BUILD ENV ENV_HOST ENV_BUILD OPTIONS_HOST OPTIONS OPTIONS_BUILD PROFILE - PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD) + PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD CONF CONF_HOST CONF_BUILD) cmake_parse_arguments(ARGS "${installOptions}" "${installOneValueArgs}" "${installMultiValueArgs}" ${ARGN}) foreach(arg ${installOptions}) if(ARGS_${arg}) @@ -560,6 +597,8 @@ function(conan_cmake_install) set(flag "--lockfile-node-id") elseif("${arg}" STREQUAL "INSTALL_FOLDER") set(flag "--install-folder") + elseif("${arg}" STREQUAL "OUTPUT_FOLDER") + set(flag "--output-folder") endif() set(${arg} ${${arg}} ${flag} ${ARGS_${arg}}) endif() @@ -594,6 +633,12 @@ function(conan_cmake_install) set(flag "--settings:host") 
elseif("${arg}" STREQUAL "SETTINGS_BUILD") set(flag "--settings:build") + elseif("${arg}" STREQUAL "CONF") + set(flag "--conf") + elseif("${arg}" STREQUAL "CONF_HOST") + set(flag "--conf:host") + elseif("${arg}" STREQUAL "CONF_BUILD") + set(flag "--conf:build") endif() list(LENGTH ARGS_${arg} numargs) foreach(item ${ARGS_${arg}}) @@ -611,13 +656,16 @@ function(conan_cmake_install) if(DEFINED NO_IMPORTS) set(NO_IMPORTS --no-imports) endif() - set(install_args install ${PATH_OR_REFERENCE} ${REFERENCE} ${UPDATE} ${NO_IMPORTS} ${REMOTE} ${LOCKFILE} ${LOCKFILE_OUT} ${LOCKFILE_NODE_ID} ${INSTALL_FOLDER} - ${GENERATOR} ${BUILD} ${ENV} ${ENV_HOST} ${ENV_BUILD} ${OPTIONS} ${OPTIONS_HOST} ${OPTIONS_BUILD} - ${PROFILE} ${PROFILE_HOST} ${PROFILE_BUILD} ${SETTINGS} ${SETTINGS_HOST} ${SETTINGS_BUILD}) + set(install_args install ${PATH_OR_REFERENCE} ${REFERENCE} ${UPDATE} ${NO_IMPORTS} ${REMOTE} + ${LOCKFILE} ${LOCKFILE_OUT} ${LOCKFILE_NODE_ID} ${INSTALL_FOLDER} + ${OUTPUT_FOLDER} ${GENERATOR} ${BUILD} ${ENV} ${ENV_HOST} ${ENV_BUILD} + ${OPTIONS} ${OPTIONS_HOST} ${OPTIONS_BUILD} ${PROFILE} ${PROFILE_HOST} + ${PROFILE_BUILD} ${SETTINGS} ${SETTINGS_HOST} ${SETTINGS_BUILD} + ${CONF} ${CONF_HOST} ${CONF_BUILD}) string(REPLACE ";" " " _install_args "${install_args}") message(STATUS "Conan executing: ${CONAN_CMD} ${_install_args}") - + if(ARGS_OUTPUT_QUIET) set(OUTPUT_OPT OUTPUT_QUIET) endif() @@ -641,6 +689,109 @@ function(conan_cmake_install) endfunction() +function(conan_cmake_lock_create) + if(DEFINED CONAN_COMMAND) + set(CONAN_CMD ${CONAN_COMMAND}) + else() + conan_check(REQUIRED) + endif() + + set(lockCreateOptions UPDATE BASE OUTPUT_QUIET ERROR_QUIET) + set(lockCreateOneValueArgs PATH REFERENCE REMOTE LOCKFILE LOCKFILE_OUT) + set(lockCreateMultiValueArgs BUILD ENV ENV_HOST ENV_BUILD OPTIONS_HOST OPTIONS OPTIONS_BUILD PROFILE + PROFILE_HOST PROFILE_BUILD SETTINGS SETTINGS_HOST SETTINGS_BUILD) + cmake_parse_arguments(ARGS "${lockCreateOptions}" "${lockCreateOneValueArgs}" 
"${lockCreateMultiValueArgs}" ${ARGN}) + foreach(arg ${lockCreateOptions}) + if(ARGS_${arg}) + set(${arg} ${${arg}} ${ARGS_${arg}}) + endif() + endforeach() + foreach(arg ${lockCreateOneValueArgs}) + if(DEFINED ARGS_${arg}) + if("${arg}" STREQUAL "REMOTE") + set(flag "--remote") + elseif("${arg}" STREQUAL "LOCKFILE") + set(flag "--lockfile") + elseif("${arg}" STREQUAL "LOCKFILE_OUT") + set(flag "--lockfile-out") + endif() + set(${arg} ${${arg}} ${flag} ${ARGS_${arg}}) + endif() + endforeach() + foreach(arg ${lockCreateMultiValueArgs}) + if(DEFINED ARGS_${arg}) + if("${arg}" STREQUAL "BUILD") + set(flag "--build") + elseif("${arg}" STREQUAL "ENV") + set(flag "--env") + elseif("${arg}" STREQUAL "ENV_HOST") + set(flag "--env:host") + elseif("${arg}" STREQUAL "ENV_BUILD") + set(flag "--env:build") + elseif("${arg}" STREQUAL "OPTIONS") + set(flag "--options") + elseif("${arg}" STREQUAL "OPTIONS_HOST") + set(flag "--options:host") + elseif("${arg}" STREQUAL "OPTIONS_BUILD") + set(flag "--options:build") + elseif("${arg}" STREQUAL "PROFILE") + set(flag "--profile") + elseif("${arg}" STREQUAL "PROFILE_HOST") + set(flag "--profile:host") + elseif("${arg}" STREQUAL "PROFILE_BUILD") + set(flag "--profile:build") + elseif("${arg}" STREQUAL "SETTINGS") + set(flag "--settings") + elseif("${arg}" STREQUAL "SETTINGS_HOST") + set(flag "--settings:host") + elseif("${arg}" STREQUAL "SETTINGS_BUILD") + set(flag "--settings:build") + endif() + list(LENGTH ARGS_${arg} numargs) + foreach(item ${ARGS_${arg}}) + if(${item} STREQUAL "all" AND ${arg} STREQUAL "BUILD") + set(${arg} "--build") + break() + endif() + set(${arg} ${${arg}} ${flag} ${item}) + endforeach() + endif() + endforeach() + if(DEFINED UPDATE) + set(UPDATE --update) + endif() + if(DEFINED BASE) + set(BASE --base) + endif() + set(lock_create_Args lock create ${PATH} ${REFERENCE} ${UPDATE} ${BASE} ${REMOTE} ${LOCKFILE} ${LOCKFILE_OUT} ${LOCKFILE_NODE_ID} ${INSTALL_FOLDER} + ${GENERATOR} ${BUILD} ${ENV} ${ENV_HOST} ${ENV_BUILD} 
${OPTIONS} ${OPTIONS_HOST} ${OPTIONS_BUILD} + ${PROFILE} ${PROFILE_HOST} ${PROFILE_BUILD} ${SETTINGS} ${SETTINGS_HOST} ${SETTINGS_BUILD}) + + string(REPLACE ";" " " _lock_create_Args "${lock_create_Args}") + message(STATUS "Conan executing: ${CONAN_CMD} ${_lock_create_Args}") + + if(ARGS_OUTPUT_QUIET) + set(OUTPUT_OPT OUTPUT_QUIET) + endif() + if(ARGS_ERROR_QUIET) + set(ERROR_OPT ERROR_QUIET) + endif() + + execute_process(COMMAND ${CONAN_CMD} ${lock_create_Args} + RESULT_VARIABLE return_code + ${OUTPUT_OPT} + ${ERROR_OPT} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + if(NOT "${return_code}" STREQUAL "0") + if (ARGS_ERROR_QUIET) + message(WARNING "Conan lock create failed='${return_code}'") + else() + message(FATAL_ERROR "Conan lock create failed='${return_code}'") + endif() + endif() +endfunction() + function(conan_cmake_setup_conanfile) conan_parse_arguments(${ARGV}) if(ARGUMENTS_CONANFILE) @@ -734,7 +885,7 @@ endmacro() macro(conan_cmake_run) conan_parse_arguments(${ARGV}) - + if(ARGUMENTS_CONFIGURATION_TYPES AND NOT CMAKE_CONFIGURATION_TYPES) message(WARNING "CONFIGURATION_TYPES should only be specified for multi-configuration generators") elseif(ARGUMENTS_CONFIGURATION_TYPES AND ARGUMENTS_BUILD_TYPE) @@ -785,6 +936,30 @@ macro(conan_cmake_run) endif() endmacro() +function(conan_version result) + set(${result} "" PARENT_SCOPE) + + if(NOT CONAN_CMD) + find_program(CONAN_CMD conan) + if(NOT CONAN_CMD AND CONAN_REQUIRED) + message(FATAL_ERROR "Conan executable not found! 
Please install conan.") + endif() + endif() + + execute_process(COMMAND ${CONAN_CMD} --version + RESULT_VARIABLE return_code + OUTPUT_VARIABLE CONAN_VERSION_OUTPUT + ERROR_VARIABLE CONAN_VERSION_OUTPUT) + + if(NOT "${return_code}" STREQUAL "0") + message(FATAL_ERROR "Conan --version failed='${return_code}'") + endif() + + string(REGEX MATCH ".*Conan version ([0-9]+\\.[0-9]+\\.[0-9]+)" FOO "${CONAN_VERSION_OUTPUT}") + + set(${result} ${CMAKE_MATCH_1} PARENT_SCOPE) +endfunction() + macro(conan_check) # Checks conan availability in PATH # Arguments REQUIRED, DETECT_QUIET and VERSION are optional @@ -804,25 +979,16 @@ macro(conan_check) if(NOT CONAN_DETECT_QUIET) message(STATUS "Conan: Found program ${CONAN_CMD}") endif() - execute_process(COMMAND ${CONAN_CMD} --version - RESULT_VARIABLE return_code - OUTPUT_VARIABLE CONAN_VERSION_OUTPUT - ERROR_VARIABLE CONAN_VERSION_OUTPUT) - if(NOT "${return_code}" STREQUAL "0") - message(FATAL_ERROR "Conan --version failed='${return_code}'") - endif() - + conan_version(CONAN_DETECTED_VERSION) + if(NOT CONAN_DETECT_QUIET) - string(STRIP "${CONAN_VERSION_OUTPUT}" _CONAN_VERSION_OUTPUT) - message(STATUS "Conan: Version found ${_CONAN_VERSION_OUTPUT}") + message(STATUS "Conan: Version found ${CONAN_DETECTED_VERSION}") endif() if(DEFINED CONAN_VERSION) - string(REGEX MATCH ".*Conan version ([0-9]+\\.[0-9]+\\.[0-9]+)" FOO - "${CONAN_VERSION_OUTPUT}") - if(${CMAKE_MATCH_1} VERSION_LESS ${CONAN_VERSION}) - message(FATAL_ERROR "Conan outdated. Installed: ${CMAKE_MATCH_1}, \ + if(${CONAN_DETECTED_VERSION} VERSION_LESS ${CONAN_VERSION}) + message(FATAL_ERROR "Conan outdated. Installed: ${CONAN_DETECTED_VERSION}, \ required: ${CONAN_VERSION}. 
Consider updating via 'pip \ install conan==${CONAN_VERSION}'.") endif() @@ -869,9 +1035,10 @@ macro(conan_config_install) set(multiValueArgs ARGS) cmake_parse_arguments(CONAN "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - find_program(CONAN_CMD conan) - if(NOT CONAN_CMD AND CONAN_REQUIRED) - message(FATAL_ERROR "Conan executable not found!") + if(DEFINED CONAN_COMMAND) + set(CONAN_CMD ${CONAN_COMMAND}) + else() + conan_check(REQUIRED) endif() if(DEFINED CONAN_VERIFY_SSL) @@ -883,7 +1050,9 @@ macro(conan_config_install) endif() if(DEFINED CONAN_ARGS) - set(CONAN_ARGS_ARGS "--args=\"${CONAN_ARGS}\"") + # Convert ; separated multi arg list into space separated string + string(REPLACE ";" " " l_CONAN_ARGS "${CONAN_ARGS}") + set(CONAN_ARGS_ARGS "--args=${l_CONAN_ARGS}") endif() if(DEFINED CONAN_SOURCE) @@ -907,3 +1076,67 @@ macro(conan_config_install) message(FATAL_ERROR "Conan config failed='${return_code}'") endif() endmacro() + + +function(conan_cmake_profile) + set(profileOneValueArgs FILEPATH) + set(profileMultiValueArgs SETTINGS OPTIONS CONF ENV BUILDENV RUNENV TOOL_REQUIRES) + cmake_parse_arguments(ARGS "" "${profileOneValueArgs}" "${profileMultiValueArgs}" ${ARGN}) + + if(DEFINED ARGS_FILEPATH) + set(_FN "${ARGS_FILEPATH}") + else() + set(_FN "${CMAKE_CURRENT_BINARY_DIR}/profile") + endif() + message(STATUS "Conan: Creating profile ${_FN}") + file(WRITE ${_FN} "") + + if(DEFINED ARGS_SETTINGS) + file(APPEND ${_FN} "[settings]\n") + foreach(SETTING ${ARGS_SETTINGS}) + file(APPEND ${_FN} ${SETTING} "\n") + endforeach() + endif() + + if(DEFINED ARGS_OPTIONS) + file(APPEND ${_FN} "[options]\n") + foreach(OPTION ${ARGS_OPTIONS}) + file(APPEND ${_FN} ${OPTION} "\n") + endforeach() + endif() + + if(DEFINED ARGS_CONF) + file(APPEND ${_FN} "[conf]\n") + foreach(CONF ${ARGS_CONF}) + file(APPEND ${_FN} ${CONF} "\n") + endforeach() + endif() + + if(DEFINED ARGS_ENV) + file(APPEND ${_FN} "[env]\n") + foreach(ENV ${ARGS_ENV}) + file(APPEND ${_FN} ${ENV} "\n") + 
endforeach() + endif() + + if(DEFINED ARGS_BUILDENV) + file(APPEND ${_FN} "[buildenv]\n") + foreach(BUILDENV ${ARGS_BUILDENV}) + file(APPEND ${_FN} ${BUILDENV} "\n") + endforeach() + endif() + + if(DEFINED ARGS_RUNENV) + file(APPEND ${_FN} "[runenv]\n") + foreach(RUNENV ${ARGS_RUNENV}) + file(APPEND ${_FN} ${RUNENV} "\n") + endforeach() + endif() + + if(DEFINED ARGS_TOOL_REQUIRES) + file(APPEND ${_FN} "[tool_requires]\n") + foreach(TOOL_REQUIRE ${ARGS_TOOL_REQUIRES}) + file(APPEND ${_FN} ${TOOL_REQUIRE} "\n") + endforeach() + endif() +endfunction() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index fe6152980b..226089c2f5 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -9,7 +9,7 @@ if ((NOT BUILDING_RUNTIME) OR ENABLE_VULKAN_RUNTIME) endif () if (NOT BUILDING_RUNTIME) - find_package(Flatbuffers REQUIRED) + find_package(flatbuffers REQUIRED) find_package(libzip REQUIRED) if(NOT CONAN_EXPORTED) set(FLATBUFFERS_FLATC_EXECUTABLE ${flatbuffers_LIB_DIRS}/../bin/flatc) diff --git a/conanfile.py b/conanfile.py index b11adc3851..1a84490650 100644 --- a/conanfile.py +++ b/conanfile.py @@ -60,7 +60,8 @@ def requirements(self): self.requires('protobuf/3.17.1') self.requires('xtensor/0.21.5') self.requires('spdlog/1.8.2') - self.requires('libzippp/4.0') + self.requires('zlib/1.2.12') + self.requires('libzippp/5.0-1.8.0') self.requires('inja/3.2.0') self.requires('shaderc/2021.1') if self.options.tests: @@ -82,20 +83,21 @@ def configure(self): if not self.options.runtime: self.options["opencv"].contrib = False + self.options["opencv"].with_ade = False self.options["opencv"].with_webp = False self.options["opencv"].with_openexr = False self.options["opencv"].with_eigen = False self.options["opencv"].with_quirc = False + self.options["opencv"].with_ffmpeg = False + self.options["opencv"].with_tiff = False + self.options["opencv"].with_jpeg = 'libjpeg-turbo' self.options["opencv"].dnn = False - 
self.options["flatbuffers"].options_from_context = False self.options["xtensor"].xsimd = False self.options["libzip"].with_bzip2 = False self.options["libzip"].with_zstd = False self.options["libzip"].crypto = False if self.settings.os == 'Linux': self.options["opencv"].with_gtk = False - self.options["spirv-tools"].link_libcpp = False - self.options["shaderc"].link_libcpp = False if (not self.options.runtime) or self.options.vulkan_runtime: if self.settings.os == 'Linux': diff --git a/docs/USAGE_EN.md b/docs/USAGE_EN.md index b161c26fa7..b8af24687a 100644 --- a/docs/USAGE_EN.md +++ b/docs/USAGE_EN.md @@ -2,32 +2,67 @@ # Overview -nncase provides both python wheel package and ncc client to compile your neural models. +nncase provides a Python wheel package to compile your neural models. The current documentation only applies to nncase-v1. The available versions are listed below. -- nncase wheel package can be downloaded at [nncase release](https://github.com/kendryte/nncase/releases), target wheel package except for both cpu and K210 can be got from nncase sdk for your target. -- For ncc client, you should git clone nncase repository and then build it by yourself. +``` +1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322 +``` + +- nncase wheel package can be downloaded at [nncase release](https://github.com/kendryte/nncase/releases). # nncase python APIs -nncase provides Python APIs to compile neural network model and inference on your PC. +nncase provides Python APIs to compile neural network models and run inference on x86_64 and amd64 platforms. ## Installation +The nncase toolchain compiler consists of nncase and plug-in wheel packages. 
+ +- Both nncase and plug-in wheel packages are released at [nncase github](https://github.com/kendryte/nncase/releases) +- The nncase wheel package supports Python 3.6/3.7/3.8/3.9/3.10. You can download it according to your operating system and Python version. +- The plug-in wheel package does not depend on the Python version; you can install it directly. + You can make use of [nncase docker image](https://github.com/kendryte/nncase/blob/master/docs/build.md)(Ubuntu 20.04 + Python 3.8) if you do not have Ubuntu development. ```shell +$ cd /path/to/nncase_sdk $ docker pull registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest $ docker run -it --rm -v `pwd`:/mnt -w /mnt registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest /bin/bash -c "/bin/bash" ``` -Take Ubuntu 20.04 + Python 3.8 for example +### cpu/K210 + +- Download nncase wheel package and then install it. + +``` +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl + +root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl +``` + +### K510 + +- Download both nncase and nncase_k510 wheel packages and then install them. 
```shell -root@f74598de4a02:/mnt# pip3 install nncase_github/nncase-1.0.0.20211029-cp38-cp38-manylinux_2_24_x86_64.whl +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase_k510-1.8.0.20220930-py2.py3-none-manylinux_2_24_x86_64.whl + +root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl ``` -> You should get and install target wheel package from your nncase sdk if you do not take cpu/K210 as your target +### Check nncase version + +```python +root@469e6a4a9e71:/mnt# python3 +Python 3.8.10 (default, Jun 2 2021, 10:49:15) +[GCC 9.4.0] on linux +Type "help", "copyright", "credits" or "license" for more information. +>>> import _nncase +>>> print(_nncase.__version__) +1.8.0-55be52f +``` ## nncase compile model APIs @@ -76,13 +111,13 @@ The details of all attributes are following. | quant_type | string | N | Specify the quantization type for input data , such as 'uint8', 'int8', 'int16' | | w_quant_type | string | N | Specify the quantization type for weight , such as 'uint8'(by default), 'int8', 'int16' | | use_mse_quant_w | bool | N | Specify whether use mean-square error when quantizing weight | -| split_w_to_act | bool | N | Specify whether split weight into activation | +| split_w_to_act | bool | N | Specify whether split weight into activation | | preprocess | bool | N | Whether enable preprocess, False by default | | swapRB | bool | N | Whether swap red and blue channel for RGB data(from RGB to BGR or from BGR to RGB), False by default | | mean | list | N | Normalize mean value for preprocess, [0, 0, 0] by default | | std | list | N | Normalize std value for preprocess, [1, 1, 1] by default | | input_range | list | N | The float range for dequantized input data, [0,1] by default | -| output_range | list | N | The float range for quantized output data, [ ] by 
default | +| output_range | list | N | The float range for quantized output data, [ ] by default | | input_shape | list | N | Specify the shape of input data. input_shape should be consistent with input _layout. There will be letterbox operations(Such as resize/pad) if input_shape is not the same as input shape of model. | | letterbox_value | float | N | Specify the pad value of letterbox during preprocess. | | input_type | string | N | Specify the data type of input data, 'float32' by default. | @@ -727,10 +762,50 @@ if __name__ == '__main__': ## Deploy nncase runtime -### K210 +### Inference on K210 development board + +1. Download [SDK](https://github.com/kendryte/kendryte-standalone-sdk) + + ```shell + $ git clone https://github.com/kendryte/kendryte-standalone-sdk.git + $ cd kendryte-standalone-sdk + $ export KENDRYTE_WORKSPACE=`pwd` + ``` +2. Download the cross-compile toolchain and extract it + + ```shell + $ wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $KENDRYTE_WORKSPACE/kendryte-toolchain.tar.xz + $ cd $KENDRYTE_WORKSPACE + $ mkdir toolchain + $ tar -xf kendryte-toolchain.tar.xz -C ./toolchain + ``` +3. Update nncase runtime + + Download `k210-runtime.zip` from [Release](https://github.com/kendryte/nncase/releases) and extract it into [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1`. +4. Compile App + + ```shell + # 1. Copy your program into `$KENDRYTE_WORKSPACE/src` + # e.g. copy ($NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example) into PATH_TO_SDK/src. + $ cp -r $NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example $KENDRYTE_WORKSPACE/src/ + + # 2. compile + $ cd $KENDRYTE_WORKSPACE + $ mkdir build && cd build + $ cmake .. 
-DPROJ=facedetect_landmark_example -DTOOLCHAIN=$KENDRYTE_WORKSPACE/toolchain/kendryte-toolchain/bin && make + ``` + + `facedetect_landmark_example` and `facedetect_landmark_example.bin` will be generated. +5. Write the program to the K210 development board -1. Download `k210-runtime.zip` from [Release](https://github.com/kendryte/nncase/releases) page. -2. Unzip to your [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1` directory. + ```shell + # 1. Check available USB ports + $ ls /dev/ttyUSB* + # /dev/ttyUSB0 /dev/ttyUSB1 + + # 2. Write your App by kflash + $ kflash -p /dev/ttyUSB0 -t facedetect_landmark_example.bin + ``` ## nncase inference APIs @@ -1161,143 +1236,3 @@ N/A ```python sim.run() ``` - -# ncc - -## Comannd line - -```shell -DESCRIPTION -NNCASE model compiler and inference tool. - -SYNOPSIS - ncc compile -i -t - [--input-prototxt ] [--output-arrays ] - [--quant-type ] [--w-quant-type ] [--use-mse-quant-w] - [--dataset ] [--dataset-format ] [--calibrate-method ] - [--preprocess] [--swapRB] [--mean ] [--std ] - [--input-range ] [--input-shape ] [--letterbox-value ] - [--input-type ] [--output-type ] - [--input-layout ] [--output-layout ] [--tcu-num ] - [--is-fpga] [--dump-ir] [--dump-asm] [--dump-quant-error] [--dump-import-op-range] [--dump-dir ] - [--dump-range-dataset ] [--dump-range-dataset-format ] [--benchmark-only] - - ncc infer - --dataset [--dataset-format ] - [--input-layout ] - - ncc [-v] - -OPTIONS - compile - - -i, --input-format - input format, e.g. tflite|onnx|caffe - -t, --target target architecture, e.g. 
cpu|k210|k510 - input file - --input-prototxt - input prototxt - output file - --output-arrays - output arrays - --quant-type - post trainning quantize type, e.g uint8|int8|int16, default is uint8 - --w-quant-type - post trainning weights quantize type, e.g uint8|int8|int16, default is uint8 - --use-mse-quant-w use min mse algorithm to refine weights quantilization or not, default is 0 - --dataset - calibration dataset, used in post quantization - --dataset-format - datset format: e.g. image|raw, default is image - --dump-range-dataset - dump import op range dataset - --dump-range-dataset-format - datset format: e.g. image|raw, default is image - --calibrate-method - calibrate method: e.g. no_clip|l2|kld_m0|kld_m1|kld_m2|cdf, default is no_clip - --preprocess enable preprocess, default is 0 - --swapRB swap red and blue channel, default is 0 - --mean normalize mean, default is 0. 0. 0. - --std normalize std, default is 1. 1. 1. - --input-range - float range after preprocess - --input-shape - shape for input data - --letterbox-value - letter box pad value, default is 0.000000 - --input-type - input type, e.g float32|uint8|default, default is default - --output-type - output type, e.g float32|uint8, default is float32 - --input-layout - input layout, e.g NCHW|NHWC, default is NCHW - --output-layout - output layout, e.g NCHW|NHWC, default is NCHW - --tcu-num tcu number, e.g 1|2|3|4, default is 0 - --is-fpga use fpga parameters, default is 0 - --dump-ir dump ir to .dot, default is 0 - --dump-asm dump assembly, default is 0 - --dump-quant-error dump quant error, default is 0 - --dump-import-op-range dump import op range, default is 0 - --dump-dir - dump to directory - --benchmark-only compile kmodel only for benchmark use, default is 0 - - infer - - kmodel filename - output path - --dataset - dataset path - --dataset-format - dataset format, e.g. 
image|raw, default is image - --input-layout - input layout, e.g NCHW|NHWC, default is NCHW -``` - -## Description - -`ncc` is the nncase command line tool. It has two commands: `compile` and `infer`. - -`compile` command compile your trained models (`.tflite`, `.caffemodel`, `.onnx`) to `.kmodel`. - -- `-i, --input-format` option is used to specify the input model format. nncase supports `tflite`, `caffe` and `onnx` input model currently. -- `-t, --target` option is used to set your desired target device to run the model. `cpu` is the most general target that almost every platform should support. `k210` is the Kendryte K210 SoC platform. If you set this option to `k210`, this model can only run on K210 or be emulated on your PC. -- `` is your input model path. -- `--input-prototxt` is the prototxt file for caffe model. -- `` is the output model path. -- `--output-arrays` is the names of nodes to output. -- `--quant-type` is used to specify quantize type, such as `uint8` by default and `int8` and `int16`. -- `--w-quant-type` is used to specify quantize type for weight, such as `uint8` by default and `int8 `and `int16`. -- `--use-mse-quant-w ` is used to specify whether use minimize mse(mean-square error, mse) algorithm to quantize weight or not. -- `--dataset` is to provide your quantization calibration dataset to quantize your models. You should put hundreds or thousands of data in training set to this directory. -- `--dataset-format` is to set the format of the calibration dataset. Default is `image`, nncase will use `opencv` to read your images and autoscale to the desired input size of your model. If the input has 3 channels, ncc will convert images to RGB float tensors [0,1] in `NCHW` layout. If the input has only 1 channel, ncc will grayscale your images. Set to `raw` if your dataset is not image dataset for example, audio or matrices. In this scenario you should convert your dataset to raw binaries which contains float tensors. 
-- `--dump-range-dataset` is to provide your dump range dataset to dump each op data range of your models. You should put hundreds or thousands of data in training set to this directory. -- `--dump-range-dataset-format` is to set the format of the dump range dataset. Default is `image`, nncase will use `opencv` to read your images and autoscale to the desired input size of your model. If the input has 3 channels, ncc will convert images to RGB float tensors [0,1] in `NCHW` layout. If the input has only 1 channel, ncc will grayscale your images. Set to `raw` if your dataset is not image dataset for example, audio or matrices. In this scenario you should convert your dataset to raw binaries which contains float tensors. -- `--calibrate-method` is to set your desired calibration method, which is used to select the optimal activation ranges. The default is `no_clip` in that ncc will use the full range of activations. If you want a better quantization result, you can use `l2` but it will take a longer time to find the optimal ranges. -- `--preprocess ` is used specify whether enable preprocessing or not. -- `--swapRB ` is used specify whether swap red and blue channel or not. You can use this flag to implement RGB2BGR or BGR2RGB feature. -- `--mean` is the mean values to be subtracted during preprocessing. -- `--std` is the std values to be divided during preprocessing. -- `--input-range` is the input range in float after dequantization. -- `--input-shape` is used to specify the shape of input data. If the input shape is different from the input shape of your model, the preprocess will add resize/pad ops automatically for the transformation. -- `--letterbox-value` is used to specify the pad values when pad is added during preprocessing. -- `--input-type` is to set your desired input data type when do inference. If `--input-type` is `uint8`, for example you should provide RGB888 uint8 tensors when you do inference. 
If `--input-type` is `float`, you should provide RGB float tensors instead. -- `--output-type` is the type of output data. -- `--input-layout` is the layout of input data. -- `--output-layout` is the layout of output data. -- `--tcu-num` is used to configure the number of TCU. 0 means do not configure the number of TCU. -- `--is-fpga` is a debug option. It is used to specify whether the kmodel run on fpga or not. -- `--dump-ir` is a debug option. It is used to specify whether dump IR or not. -- `--dump-asm` is a debug option. It is used to specify whether dump asm file or not. -- `--dump-quant-error` is a debug option. It is used to specify whether dump quantization error information or not. -- `--dump-import-op-range` is a debug option. It is used to specify whether dump imported op data range or not, need to also specify dump-range-dataset if enabled. -- `--dump-dir` is used to specify dump directory. -- `--benchmark-only` is used to specify whether the kmodel is used for benchmark or not. - -`infer` command can run your kmodel, and it's often used as debug purpose. ncc will save the model's output tensors to `.bin` files in `NCHW` layout. - -- `` is your kmodel path. -- `` is the output directory ncc will produce to. -- `--dataset` is the test set directory. -- `--dataset-format` and `--input-layout` have the same meaning as in `compile` command. diff --git a/docs/USAGE_ZH.md b/docs/USAGE_ZH.md index 81adc69cea..0293f3eb5c 100644 --- a/docs/USAGE_ZH.md +++ b/docs/USAGE_ZH.md @@ -1,31 +1,66 @@ # 概述 -nncase目前提供了python wheel包和ncc客户端两种方法编译模型. 
+nncase目前提供了python wheel包编译模型。当前文档仅适用于nncase-v1,适用于以下版本号: -- nncase wheel包需要去[nncase release](https://github.com/kendryte/nncase/releases)获取, target wheel包除cpu/K210不需要安装外, 其它target需要从nncase sdk离线获取 -- ncc客户端需要用户下载并编译nncase +``` +1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322 +``` + +- nncase wheel包需要去[nncase release](https://github.com/kendryte/nncase/releases)获取 # nncase python APIs -nncase提供了Python APIs, 用于在PC上编译/推理深度学习模型. +nncase提供了Python APIs, 用于在x86_64和amd64平台上编译/推理深度学习模型. ## 安装 -用户若没有Ubuntu环境, 可使用[nncase docker image](https://github.com/kendryte/nncase/blob/master/docs/build.md)(Ubuntu 20.04 + Python 3.8) +nncase工具链compiler部分包括nncase和插件包 + +- nncase 和插件包均在[nncase github](https://github.com/kendryte/nncase/releases)发布 +- nncase wheel包支持Python 3.6/3.7/3.8/3.9/3.10, 用户可根据操作系统和Python选择相应版本下载 . +- 插件包不依赖Python版本, 可直接安装 + +用户若没有Ubuntu环境, 可使用[nncase docker](https://github.com/kendryte/nncase/blob/master/docs/build.md#docker)(Ubuntu 20.04 + Python 3.8) ```shell +$ cd /path/to/nncase_sdk $ docker pull registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest $ docker run -it --rm -v `pwd`:/mnt -w /mnt registry.cn-hangzhou.aliyuncs.com/kendryte/nncase:latest /bin/bash -c "/bin/bash" ``` -下面以Ubuntu 20.04 + Python 3.8平台安装nncase为例 +### cpu/K210 + +- 下载nncase wheel包, 直接安装即可. 
+ +``` +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl + +root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl +``` + +### K510 + +- 分别下载nncase和nncase_k510插件包,再一起安装 ```shell -root@f74598de4a02:/mnt# pip3 install nncase_github/nncase-1.0.0.20211029-cp38-cp38-manylinux_2_24_x86_64.whl +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase-1.8.0.20220929-cp38-cp38-manylinux_2_24_x86_64.whl +root@2b11cc15c7f8:/mnt# wget -P x86_64 https://github.com/kendryte/nncase/releases/download/v1.8.0/nncase_k510-1.8.0.20220930-py2.py3-none-manylinux_2_24_x86_64.whl + +root@2b11cc15c7f8:/mnt# pip3 install x86_64/*.whl ``` -> 若不使用cpu/K210作为target, 需要从相应target的nncase sdk中获取wheel包并进行安装 +### 查看版本信息 + +```python +root@469e6a4a9e71:/mnt# python3 +Python 3.8.10 (default, Jun 2 2021, 10:49:15) +[GCC 9.4.0] on linux +Type "help", "copyright", "credits" or "license" for more information. 
+>>> import _nncase +>>> print(_nncase.__version__) +1.8.0-55be52f +``` ## nncase 编译模型APIs @@ -68,32 +103,32 @@ py::class_(m, "CompileOptions") 各属性说明如下 -| 属性名称 | 类型 | 是否必须 | 描述 | -| ---------------- | ------ | -------- | ------------------------------------------------------------ | -| target | string | 是 | 指定编译目标, 如'k210', 'k510' | -| quant_type | string | 否 | 指定数据量化类型, 如'uint8', 'int8', 'int16' | -| w_quant_type | string | 否 | 指定权重量化类型, 如'uint8', 'int8', 'int16', 默认为'uint8' | -| use_mse_quant_w | bool | 否 | 指定权重量化时是否使用最小化均方误差(mean-square error, MSE)算法优化量化参数 | -| split_w_to_act | bool | 否 | 指定是否将权重数据平衡到激活数据中 | -| preprocess | bool | 否 | 是否开启前处理,默认为False | -| swapRB | bool | 否 | 是否交换RGB输入数据的红和蓝两个通道(RGB-->BGR或者BGR-->RGB),默认为False | -| mean | list | 否 | 前处理标准化参数均值,默认为[0, 0, 0] | -| std | list | 否 | 前处理标准化参数方差,默认为[1, 1, 1] | -| input_range | list | 否 | 输入数据反量化后对应浮点数的范围,默认为[0,1] | -| output_range | list | 否 | 输出定点数据前对应浮点数的范围,默认为空,使用模型实际浮点输出范围 | +| 属性名称 | 类型 | 是否必须 | 描述 | +| ---------------- | ------ | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| target | string | 是 | 指定编译目标, 如'k210', 'k510' | +| quant_type | string | 否 | 指定数据量化类型, 如'uint8', 'int8', 'int16' | +| w_quant_type | string | 否 | 指定权重量化类型, 如'uint8', 'int8', 'int16', 默认为'uint8' | +| use_mse_quant_w | bool | 否 | 指定权重量化时是否使用最小化均方误差(mean-square error, MSE)算法优化量化参数 | +| split_w_to_act | bool | 否 | 指定是否将权重数据平衡到激活数据中 | +| preprocess | bool | 否 | 是否开启前处理,默认为False | +| swapRB | bool | 否 | 是否交换RGB输入数据的红和蓝两个通道(RGB-->BGR或者BGR-->RGB),默认为False | +| mean | list | 否 | 前处理标准化参数均值,默认为[0, 0, 0] | +| std | list | 否 | 前处理标准化参数方差,默认为[1, 1, 1] | +| input_range | list | 否 | 输入数据反量化后对应浮点数的范围,默认为[0,1] | +| output_range | list | 否 | 输出定点数据前对应浮点数的范围,默认为空,使用模型实际浮点输出范围 | | input_shape | list | 否 | 指定输入数据的shape,input_shape的layout需要与input layout保持一致,输入数据的input_shape与模型的input shape不一致时会进行letterbox操作(resize/pad等) | -| 
letterbox_value | float | 否 | 指定前处理letterbox的填充值 | -| input_type | string | 否 | 指定输入数据的类型, 默认为'float32' | -| output_type | string | 否 | 指定输出数据的类型, 如'float32', 'uint8'(仅用于指定量化情况下), 默认为'float32' | -| input_layout | string | 否 | 指定输入数据的layout, 如'NCHW', 'NHWC'. 若输入数据layout与模型本身layout不同, nncase会插入transpose进行转换 | -| output_layout | string | 否 | 指定输出数据的layout, 如'NCHW', 'NHWC'. 若输出数据layout与模型本身layout不同, nncase会插入transpose进行转换 | -| model_layout | string | 否 | 指定模型的layout,默认为空,当tflite模型layout为‘NCHW’,Onnx和Caffe模型layout为‘NHWC’时需指定 | -| is_fpga | bool | 否 | 指定kmodel是否用于fpga, 默认为False | -| dump_ir | bool | 否 | 指定是否dump IR, 默认为False | -| dump_asm | bool | 否 | 指定是否dump asm汇编文件, 默认为False | -| dump_quant_error | bool | 否 | 指定是否dump量化前后的模型误差 | -| dump_dir | string | 否 | 前面指定dump_ir等开关后, 这里指定dump的目录, 默认为空字符串 | -| benchmark_only | bool | 否 | 指定kmodel是否只用于benchmark, 默认为False | +| letterbox_value | float | 否 | 指定前处理letterbox的填充值 | +| input_type | string | 否 | 指定输入数据的类型, 默认为'float32' | +| output_type | string | 否 | 指定输出数据的类型, 如'float32', 'uint8'(仅用于指定量化情况下), 默认为'float32' | +| input_layout | string | 否 | 指定输入数据的layout, 如'NCHW', 'NHWC'. 若输入数据layout与模型本身layout不同, nncase会插入transpose进行转换 | +| output_layout | string | 否 | 指定输出数据的layout, 如'NCHW', 'NHWC'. 若输出数据layout与模型本身layout不同, nncase会插入transpose进行转换 | +| model_layout | string | 否 | 指定模型的layout,默认为空,当tflite模型layout为‘NCHW’,Onnx和Caffe模型layout为‘NHWC’时需指定 | +| is_fpga | bool | 否 | 指定kmodel是否用于fpga, 默认为False | +| dump_ir | bool | 否 | 指定是否dump IR, 默认为False | +| dump_asm | bool | 否 | 指定是否dump asm汇编文件, 默认为False | +| dump_quant_error | bool | 否 | 指定是否dump量化前后的模型误差 | +| dump_dir | string | 否 | 前面指定dump_ir等开关后, 这里指定dump的目录, 默认为空字符串 | +| benchmark_only | bool | 否 | 指定kmodel是否只用于benchmark, 默认为False | > 1. mean和std为浮点数进行normalize的参数,用户可以自由指定. > 2. input range为浮点数的范围,即如果输入数据类型为uint8,则input range为反量化到浮点之后的范围(可以不为0~1),可以自由指定. @@ -729,10 +764,51 @@ if __name__ == '__main__': ## 部署 nncase runtime -### K210 +### K210上板推理流程 -1. 
从 [Release](https://github.com/kendryte/nncase/releases) 页面下载 `k210-runtime.zip`。 -2. 解压到 [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1` 目录。 +1. 下载官方[SDK](https://github.com/kendryte/kendryte-standalone-sdk) + + ```shell + git clone https://github.com/kendryte/kendryte-standalone-sdk.git + cd kendryte-standalone-sdk + export KENDRYTE_WORKSPACE=`pwd` + ``` +2. 下载交叉编译工具链,并解压 + + ```shell + wget https://github.com/kendryte/kendryte-gnu-toolchain/releases/download/v8.2.0-20190409/kendryte-toolchain-ubuntu-amd64-8.2.0-20190409.tar.xz -O $KENDRYTE_WORKSPACE/kendryte-toolchain.tar.xz + cd $KENDRYTE_WORKSPACE + mkdir toolchain + tar -xf kendryte-toolchain.tar.xz -C ./toolchain + ``` +3. 更新runtime + + 从 [Release](https://github.com/kendryte/nncase/releases) 页面下载 `k210-runtime.zip`。解压到 [kendryte-standalone-sdk](https://github.com/kendryte/kendryte-standalone-sdk) 's `lib/nncase/v1` 目录。 +4. 编译App + + ````shell + # 1.将自己的App工程放在`$KENDRYTE_WORKSPACE/src`目录下 + # 例如,将[example的示例程序]($NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example)目录,拷贝到SDK的src目录下。 + cp -r $NNCASE_WORK_DIR/examples/facedetect_landmark/k210/facedetect_landmark_example $KENDRYTE_WORKSPACE/src/ + + # 2.cmake 编译App + cd $KENDRYTE_WORKSPACE + mkdir build + cmake .. -DPROJ=facedetect_landmark_example -DTOOLCHAIN=$KENDRYTE_WORKSPACE/toolchain/kendryte-toolchain/bin && make + ```` + + 之后会在当前目录下生成 `facedetect_landmark_example`和 `facedetect_landmark_example.bin` +5. 烧写App + + ```shell + # 1. 检查可用的USB端口 + ls /dev/ttyUSB* + # > /dev/ttyUSB0 /dev/ttyUSB1 + # 2. 使用kflash进行烧录 + kflash -p /dev/ttyUSB0 -t facedetect_landmark_example.bin + ``` + + 烧写过程缓慢,需要耐心等待。 ## nncase 推理模型APIs @@ -1166,144 +1242,4 @@ N/A ```python sim.run() -``` - -# ncc - -## 命令行 - -```shell -DESCRIPTION -NNCASE model compiler and inference tool. 
- -SYNOPSIS - ncc compile -i -t - [--input-prototxt ] [--output-arrays ] - [--quant-type ] [--w-quant-type ] [--use-mse-quant-w] - [--dataset ] [--dataset-format ] [--calibrate-method ] - [--preprocess] [--swapRB] [--mean ] [--std ] - [--input-range ] [--input-shape ] [--letterbox-value ] - [--input-type ] [--output-type ] - [--input-layout ] [--output-layout ] [--tcu-num ] - [--is-fpga] [--dump-ir] [--dump-asm] [--dump-quant-error] [--dump-import-op-range] [--dump-dir ] - [--dump-range-dataset ] [--dump-range-dataset-format ] [--benchmark-only] - - ncc infer - --dataset [--dataset-format ] - [--input-layout ] - - ncc [-v] - -OPTIONS - compile - - -i, --input-format - input format, e.g. tflite|onnx|caffe - -t, --target target architecture, e.g. cpu|k210|k510 - input file - --input-prototxt - input prototxt - output file - --output-arrays - output arrays - --quant-type - post trainning quantize type, e.g uint8|int8|int16, default is uint8 - --w-quant-type - post trainning weights quantize type, e.g uint8|int8|int16, default is uint8 - --use-mse-quant-w use min mse algorithm to refine weights quantilization or not, default is 0 - --dataset - calibration dataset, used in post quantization - --dataset-format - datset format: e.g. image|raw, default is image - --dump-range-dataset - dump import op range dataset - --dump-range-dataset-format - datset format: e.g. image|raw, default is image - --calibrate-method - calibrate method: e.g. no_clip|l2|kld_m0|kld_m1|kld_m2|cdf, default is no_clip - --preprocess enable preprocess, default is 0 - --swapRB swap red and blue channel, default is 0 - --mean normalize mean, default is 0. 0. 0. - --std normalize std, default is 1. 1. 1. 
- --input-range - float range after preprocess - --input-shape - shape for input data - --letterbox-value - letter box pad value, default is 0.000000 - --input-type - input type, e.g float32|uint8|default, default is default - --output-type - output type, e.g float32|uint8, default is float32 - --input-layout - input layout, e.g NCHW|NHWC, default is NCHW - --output-layout - output layout, e.g NCHW|NHWC, default is NCHW - --tcu-num tcu number, e.g 1|2|3|4, default is 0 - --is-fpga use fpga parameters, default is 0 - --dump-ir dump ir to .dot, default is 0 - --dump-asm dump assembly, default is 0 - --dump-quant-error dump quant error, default is 0 - --dump-import-op-range dump import op range, default is 0 - --dump-dir - dump to directory - --benchmark-only compile kmodel only for benchmark use, default is 0 - - infer - - kmodel filename - output path - --dataset - dataset path - --dataset-format - dataset format, e.g. image|raw, default is image - --input-layout - input layout, e.g NCHW|NHWC, default is NCHW -``` - -## 描述 - -`ncc` 是 nncase 的命令行工具。它有两个命令: `compile` 和 `infer`。 - -`compile` 命令将你训练好的模型 (`.tflite`, `.caffemodel`, `.onnx`) 编译到 `.kmodel`。 - -- `-i, --input-format` 用来指定输入模型的格式。nncase 现在支持 `tflite`、`caffe` 和 `onnx` 输入格式。 -- `-t, --target` 用来指定你想要你的模型在哪种目标设备上运行。`cpu` 几乎所有平台都支持的通用目标。`k210` 是 Kendryte K210 SoC 平台。如果你指定了 `k210`,这个模型就只能在 K210 运行或在你的 PC 上模拟运行。 -- `` 用于指定输入模型文件 -- `--input-prototxt`用于指定caffe模型的prototxt文件 -- `` 用于指定输出模型文件 -- `--output-arrays `用于指定输出结点的名称 -- `--quant-type` 用于指定数据的量化类型, 如 `uint8`/`int8`/`int16, 默认是`uint8 -- `--w-quant-type` 用于指定权重的量化类型, 如 `uint8`/`int8`/`int16, 默认是`uint8 -- `--use-mse-quant-w`指定是否使用最小化mse(mean-square error, 均方误差)算法来量化权重. 
-- `--dataset` 用于提供量化校准集来量化你的模型。你需要从训练集中选择几百到上千个数据放到这个目录里。 -- `--dataset-format` 用于指定量化校准集的格式。默认是 `image`,nncase 将使用 `opencv` 读取你的图片,并自动缩放到你的模型输入需要的尺寸。如果你的输入有 3 个通道,ncc 会将你的图片转换为值域是 [0,1] 布局是 `NCHW` 的张量。如果你的输入只有 1 个通道,ncc 会灰度化你的图片。如果你的数据集不是图片(例如音频或者矩阵),把它设置为 `raw`。这种场景下你需要把你的数据集转换为 float 张量的二进制文件。 -- `--dump-range-dataset` 用于提供统计范围数据集来统计原始模型每个节点输出数据范围。你需要从训练集中选择几百到上千个数据放到这个目录里。 -- `--dump-range-dataset-format` 用于指定统计范围数据集的格式。默认是 `image`,nncase 将使用 `opencv` 读取你的图片,并自动缩放到你的模型输入需要的尺寸。如果你的输入有 3 个通道,ncc 会将你的图片转换为值域是 [0,1] 布局是 `NCHW` 的张量。如果你的输入只有 1 个通道,ncc 会灰度化你的图片。如果你的数据集不是图片(例如音频或者矩阵),把它设置为 `raw`。这种场景下你需要把你的数据集转换为 float 张量的二进制文件。 -- `--calibrate-method` 用于设置量化校准方法,它被用来选择最优的激活函数值域。默认值是 `no_clip`,ncc 会使用整个激活函数值域。如果你需要更好的量化结果,你可以使用 `l2`,但它需要花更长的时间寻找最优值域。 -- `--preprocess`指定是否预处理, 添加后表示开启预处理 -- `--swapRB`指定**预处理时**是否交换红和蓝两个通道数据, 用于实现RGB2BGR或BGR2RGB功能 -- `--mean`指定**预处理时**标准化参数均值,例如添加 `--mean "0.1 2.3 33.1f"`用于设置三个通道的均值. -- `--std`指定**预处理时**标准化参数方差,例如添加 `--std "1. 2. 3."`用于设置三个通道的方差. -- `--input-range`指定输入数据反量化后的数据范围,例如添加 `--input-range "0.1 2."`设置反量化的范围为 `[0.1~2]`. -- `--input-shape`指定输入数据的形状. 若与模型的输入形状不同, 则预处理时会做resize/pad等处理, 例如添加 `--input-shape "1 1 28 28"`指明当前输入图像尺寸. -- `--letterbox-value`用于指定预处理时pad填充的值. -- `--input-type` 用于指定推理时输入的数据类型。如果 `--input-type` 是 `uint8`,推理时你需要提供 RGB888 uint8 张量。如果 `--input-type` 是 `float`,你则需要提供 RGB float 张量. -- `--output-type` 用于指定推理时输出的数据类型。如 `float`/`uint8`, `uint8`仅在量化模型时才有效. 默认是 `float` -- `--input-layout`用于指定输入数据的layout. 若输入数据的layout与模型的layout不同, 预处理会添加transpose进行转换. -- `--output-layout`用于指定输出数据的layout -- `--tcu-num`用于指定tcu个数, 默认值为0, 表示不配置tcu个数. 
-- `--is-fpga`指定编译后的kmodel是否运行在fpga上 -- `--dump-ir` 是一个调试选项。当它打开时 ncc 会在工作目录产生一些 `.dot` 文件。你可以使用 `Graphviz` 或 [Graphviz Online](https://dreampuf.github.io/GraphvizOnline) 来查看这些文件。 -- `--dump-asm` 是一个调试选项。当它打开时 ncc 会生成硬件指令文件compile.text.asm -- `--dump-quant-error`是一个调试选项, 用于dump量化错误信息 -- `--dump-import-op-range`是一个调试选项, 用于dump import之后节点的数据范围,需要同时指定dump-range-dataset -- `--dump-dir`是一个调试选项, 用于指定dump目录. -- `--benchmark-only`是一个调试选项, 用于指定编译后的kmodel用于benchmark. - -`infer` 命令可以运行你的 kmodel,通常它被用来调试。ncc 会将你模型的输出张量按 `NCHW` 布局保存到 `.bin` 文件。 - -- `` kmodel 的路径。 -- `` ncc 输出目录。 -- `--dataset` 测试集路径。 -- `--dataset-format`和 `--input-layout`同 `compile` 命令中的含义。 +``` \ No newline at end of file diff --git a/docs/onnx_ops.md b/docs/onnx_ops.md index 2ea9159538..7c1cfb900f 100644 --- a/docs/onnx_ops.md +++ b/docs/onnx_ops.md @@ -19,6 +19,7 @@ | Ceil | ✅ | | Celu | ✅ | | Clip | ✅ | +| Compress | ✅ | | Concat | ✅ | | Constant | ✅ | | ConstantOfShape | ✅ | @@ -35,20 +36,24 @@ | Exp | ✅ | | Expand | ✅ | | Equal | ✅ | +| Erf | ✅ | | Flatten | ✅ | | Floor | ✅ | | Gather | ✅ | +| GatherElements | ✅ | | GatherND | ✅ | | Gemm | ✅ | | GlobalAveragePool | ✅ | | GlobalMaxPool | ✅ | | Greater | ✅ | | GreaterOrEqual | ✅ | +| GRU | ✅ | | Hardmax | ✅ | | HardSigmoid | ✅ | | HardSwish | ✅ | | Identity | ✅ | | InstanceNormalization | ✅ | +| LayerNormalization | ✅ | | LpNormalization | ✅ | | LeakyRelu | ✅ | | Less | ✅ | @@ -89,6 +94,7 @@ | ReverseSequence | ✅ | | RoiAlign | ✅ | | Round | ✅ | +| Rsqrt | ✅ | | Selu | ✅ | | Shape | ✅ | | Sign | ✅ | @@ -111,6 +117,7 @@ | TopK | ✅ | | Transpose | ✅ | | Trilu | ✅ | +| ThresholdedRelu | ✅ | | Upsample | ✅ | | Unsqueeze | ✅ | | Where | ✅ | diff --git a/examples/user_guide/README.md b/examples/user_guide/README.md new file mode 100644 index 0000000000..34aaf7744a --- /dev/null +++ b/examples/user_guide/README.md @@ -0,0 +1,15 @@ +模型编译推理参考Jupyter脚本:[User_guide](./simulate.ipynb),脚本中包含了单输入和多输入的示例。也可以使用单独的编译脚本 [Single 
build](../../docs/USAGE_ZH.md#编译模型示例)完成kmodel的编译。 + +如果在Docker中运行Jupyter脚本,可以参考[配置Jupyter lab](https://github.com/kunjing96/docker-jupyterlab#32-%E9%85%8D%E7%BD%AEjupyter-lab)进行配置。 + +在执行脚本之前需要根据自身需求修改以下内容: + +1. `compile_kmodel`函数中 `compile_options`,`ptq_options`相关信息 + `compile_options`详细信息见[CompileOptions](../../docs/USAGE_ZH.md#CompileOptions) + `ptq_options`详细信息见[PTQTensorOptions](../../docs/USAGE_ZH.md#PTQTensorOptions) +2. `compile kmodel single input(multiple inputs)`部分 + 修改 `model_path`和 `dump_path`,用于指定模型路径和编译期间文件生成路径。 + 修改 `calib_data`的实现,数据格式见注释。 +3. `run kmodel(simulate)`部分,修改 `input_data`的实现,数据格式见注释。 + +推理结束后,会在 `dump_path`路径下生成 `kmodel`、输出结果和编译期间的文件。 \ No newline at end of file diff --git a/examples/user_guide/nncase_base_func.py b/examples/user_guide/nncase_base_func.py new file mode 100644 index 0000000000..a624e9c0a7 --- /dev/null +++ b/examples/user_guide/nncase_base_func.py @@ -0,0 +1,95 @@ +import os + +import numpy as np +import onnx +import onnxsim +from sklearn.metrics.pairwise import cosine_similarity + +import nncase + + +def get_cosine(vec1, vec2): + """ + result compare + """ + return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1)) + + + +def read_model_file(model_file): + """ + read model + """ + with open(model_file, 'rb') as f: + model_content = f.read() + return model_content + + +def parse_model_input_output(model_file): + """ + parse onnx model + """ + onnx_model = onnx.load(model_file) + input_all = [node.name for node in onnx_model.graph.input] + input_initializer = [node.name for node in onnx_model.graph.initializer] + input_names = list(set(input_all) - set(input_initializer)) + input_tensors = [ + node for node in onnx_model.graph.input if node.name in input_names] + + # input + inputs = [] + for _, e in enumerate(input_tensors): + onnx_type = e.type.tensor_type + input_dict = {} + input_dict['name'] = e.name + input_dict['dtype'] = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[onnx_type.elem_type] + input_dict['shape'] = 
[i.dim_value for i in onnx_type.shape.dim] + inputs.append(input_dict) + + return onnx_model, inputs + +def model_simplify(model_file): + """ + simplify onnx model + """ + if model_file.split('.')[-1] == "onnx": + onnx_model, inputs = parse_model_input_output(model_file) + onnx_model = onnx.shape_inference.infer_shapes(onnx_model) + input_shapes = {} + for input in inputs: + input_shapes[input['name']] = input['shape'] + + onnx_model, check = onnxsim.simplify(onnx_model, overwrite_input_shapes=input_shapes) + assert check, "Simplified ONNX model could not be validated" + + model_file = os.path.join(os.path.dirname(model_file), 'simplified.onnx') + onnx.save_model(onnx_model, model_file) + print("[ onnx done ]") + elif model_file.split('.')[-1] == "tflite": + print("[ tflite pass ]") + else: + raise Exception(f"Unsupport type {model_file.split('.')[-1]}") + + return model_file + +def run_kmodel(kmodel_path, input_data): + print("\n---------start run kmodel---------") + print("Load kmodel...") + model_sim = nncase.Simulator() + with open(kmodel_path, 'rb') as f: + model_sim.load_model(f.read()) + + print("Set input data...") + for i, p_d in enumerate(input_data): + model_sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(p_d)) + + print("Run...") + model_sim.run() + + print("Get output result...") + all_result = [] + for i in range(model_sim.outputs_size): + result = model_sim.get_output_tensor(i).to_numpy() + all_result.append(result) + print("----------------end-----------------") + return all_result \ No newline at end of file diff --git a/examples/user_guide/simulate.ipynb b/examples/user_guide/simulate.ipynb new file mode 100644 index 0000000000..6a9a041eaa --- /dev/null +++ b/examples/user_guide/simulate.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "82a8f9c1-c2bf-4270-9f1f-ac25c9fdd898", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "#!pip uninstall -y 
nncase\n", + "!pip install nncase==1.9.0.20230322 --timeout=1000\n", + "#from versions: 1.0.0.20211029, 1.1.0.20211203, 1.3.0.20220127, 1.4.0.20220303, 1.5.0.20220331, \n", + "# 1.6.0.20220505, 1.7.0.20220530, 1.7.1.20220701, 1.8.0.20220929, 1.9.0.20230322, 2.0.0.20230602, 2.1.0.20230703)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7eff82e-295c-4cce-afbc-ce64c84dc40a", + "metadata": {}, + "outputs": [], + "source": [ + "import nncase\n", + "from nncase_base_func import *\n", + "# from parse_model import *\n", + "\n", + "\n", + "def compile_kmodel(model_path, dump_path, calib_data):\n", + " \"\"\"\n", + " Set compile options and ptq options.\n", + " Compile kmodel.\n", + " Dump the compile-time result to 'compile_options.dump_dir'\n", + " \"\"\"\n", + " print(\"----------model simplify----------\")\n", + " model_file = model_simplify(model_path)\n", + "\n", + " print(\"---------- set options ----------\")\n", + " # import_options\n", + " import_options = nncase.ImportOptions()\n", + " \n", + " # compile_options\n", + " compile_options = nncase.CompileOptions()\n", + " compile_options.target = \"k210\" # \"cpu\" \"k510\"\n", + " compile_options.dump_ir = True # if False, will not dump the compile-time result.\n", + " compile_options.dump_asm = True\n", + " compile_options.dump_dir = dump_path\n", + "\n", + " # preprocess args\n", + " compile_options.preprocess = True\n", + " if compile_options.preprocess:\n", + " compile_options.input_type = \"uint8\" # \"uint8\"\n", + " compile_options.swapRB = False\n", + " compile_options.input_shape = [1,224,320,3]\n", + " compile_options.input_range = [0,1]\n", + " compile_options.mean = [0,0,0]\n", + " compile_options.std = [1,1,1]\n", + " compile_options.input_layout = \"NHWC\" # \"NHWC\"\n", + " compile_options.output_layout = \"NHWC\" # \"NHWC\"\n", + " compile_options.letterbox_value = 0\n", + " \n", + " # quant args\n", + " compile_options.quant_type = \"uint8\" \n", + " 
compile_options.w_quant_type = \"uint8\"\n", + " compile_options.use_mse_quant_w = True\n", + " compile_options.split_w_to_act = False\n", + "\n", + " # quant options\n", + " ptq_options = nncase.PTQTensorOptions()\n", + " ptq_options.calibrate_method = \"no_clip\" # \"kld_m2\" \"l2\" \"cdf\"\n", + " ptq_options.samples_count = len(calib_data[0])\n", + " ptq_options.set_tensor_data(np.array(calib_data).tobytes())\n", + "\n", + " \n", + " # set options\n", + " compiler = nncase.Compiler(compile_options)\n", + " compiler.use_ptq(ptq_options)\n", + " \n", + " print(\"---------- compile ----------\")\n", + " # import\n", + " model_content = read_model_file(model_file)\n", + " if model_path.split(\".\")[-1] == \"onnx\":\n", + " compiler.import_onnx(model_content, import_options)\n", + " elif model_path.split(\".\")[-1] == \"tflite\":\n", + " compiler.import_tflite(model_content, import_options)\n", + "\n", + " # compile\n", + " compiler.compile()\n", + " kmodel = compiler.gencode_tobytes()\n", + " \n", + " kmodel_path = os.path.join(dump_path, \"test.kmodel\")\n", + " with open(kmodel_path, 'wb') as f:\n", + " f.write(kmodel)\n", + " print(\"---------- compile end ----------\")\n", + " return kmodel_path\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c957fe20-99c9-4a54-bae8-38361a8f8830", + "metadata": {}, + "outputs": [], + "source": [ + "# compile kmodel single input\n", + "model_path = \"./model_f32.tflite\"\n", + "dump_path = \"./tmp\"\n", + "\n", + "# If model has multi inputs, calib_data format is \"[[x1, x2,...], [y1, y2,...], ...]\"\n", + "# e.g. 
Model has three inputs (x, y, z), the calib_data is '[[x1, x2, x3],[y1, y2, y3],[z1, z2, z3]]'\n", + "\n", + "calib_data = [[np.random.rand(1,224,320,3).astype(np.float32), np.random.rand(1,224,320,3).astype(np.float32)]]\n", + "kmodel_path = compile_kmodel(model_path, dump_path, calib_data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f617edc-781c-4b8b-b45d-fef2f0b36a46", + "metadata": {}, + "outputs": [], + "source": [ + "# run kmodel(simulate)\n", + "kmodel_path = \"./tmp/test.kmodel\"\n", + "input_data = [np.random.rand(1,224,320,3).astype(np.float32)]\n", + "\n", + "result = run_kmodel(kmodel_path, input_data)\n", + "for idx, i in enumerate(result):\n", + " print(i.shape)\n", + " i.tofile(os.path.join(dump_path, \"nncase_result_{}.bin\".format(idx)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89280d3a", + "metadata": {}, + "outputs": [], + "source": [ + "# compile kmodel multiple inputs\n", + "model_path = \"./decoder_100.onnx\"\n", + "dump_path = \"./tmp_dec\"\n", + "\n", + "# If model has multiple inputs, calib_data format is \"[[x1, x2,...], [y1, y2,...], ...]\"\n", + "# e.g. 
Model has three inputs (x, y, z), the calib_data is '[[x1, x2, x3],[y1, y2, y3],[z1, z2, z3]]'\n", + "\n", + "calib_data = [[np.random.randint(1, 5, size=[3, 100], dtype='int64'), np.random.randint(1, 5, size=[3, 100], dtype='int64')],\n", + " [np.random.rand(100, 3, 192).astype(np.float32), np.random.rand(100, 3, 192).astype(np.float32)],\n", + " [np.random.rand(3, 100).astype(np.float32) > 0.5, np.random.rand(3, 100).astype(np.float32) > 0.5], ] # bool\n", + "\n", + "kmodel_path = compile_kmodel(model_path, dump_path, calib_data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22a25a7f", + "metadata": {}, + "outputs": [], + "source": [ + "# run kmodel(simulate)\n", + "import os\n", + "\n", + "kmodel_path = \"./tmp_dec/test.kmodel\"\n", + "input_data = [np.random.randint(1, 5, size=[3, 100], dtype='int64'),\n", + " np.random.rand(100, 3, 192).astype(np.float32),\n", + " np.random.rand(3, 100).astype(np.float32) > 0.5, ]\n", + "\n", + "result = run_kmodel(kmodel_path, input_data)\n", + "\n", + "for idx, i in enumerate(result):\n", + " print(i.shape)\n", + " i.tofile(os.path.join(dump_path, \"nncase_result_{}.bin\".format(idx)))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/yolox/README.md b/examples/yolox/README.md index 3dda17b474..893c7e7c4f 100644 --- a/examples/yolox/README.md +++ b/examples/yolox/README.md @@ -78,7 +78,11 @@ mv xxx.bin k210/yolox_detect_example/input.bin ## 定点模型推理测试 
-使用最新的[裸机sdk](https://github.com/kendryte/kendryte-standalone-sdk/tree/develop),将`yolox_detect_example`拷贝到`src`目录下,然后进行编译(请参考裸机sdk使用指南,首先配置好工具链等相关环境) +使用git clone的develop分支的[裸机sdk](https://github.com/kendryte/kendryte-standalone-sdk/tree/develop),将`yolox_detect_example`拷贝到`src`目录下. + +如果您目前使用的nncase版本大于1.0.0, 请参考[这里](https://github.com/kendryte/nncase/blob/master/docs/USAGE_ZH.md#部署-nncase-runtime)更新sdk中对应的nncase runtime版本(runtime版本需要与自身所使用的nncase版本相匹配). + +按照如下命令编译与烧录(请参考裸机sdk使用指南,首先配置好工具链等相关环境) ```bash mkdir build && cd build cmake .. -DPROJ=yolox_detect_example -DTOOLCHAIN=/usr/local/opt/kendryte-toolchain/bin @@ -88,8 +92,10 @@ kflash yolox_detect_example.bin -B kd233 -p /dev/cu.usbserial-1130 -b 2000000 -t ⚠️不同的电脑上usb端口号并不一致. +⚠️使用example中提供的kmodel时无需更新runtime. + 可能的结果: ![demo](demo.jpg) # 致谢 -[YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) \ No newline at end of file +[YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) diff --git a/include/nncase/codegen/module_builder.h b/include/nncase/codegen/module_builder.h index 55416204f7..e91594c398 100644 --- a/include/nncase/codegen/module_builder.h +++ b/include/nncase/codegen/module_builder.h @@ -93,6 +93,7 @@ class NNCASE_API module_builder section *find_section(std::string_view section_name); void merge_to_rdata_section(std::string_view from); function_call_id function_id(ir::graph *graph); + std::streampos get_current_entry_point(); void set_current_entry_point(std::streampos pos); void set_current_function_text_end(std::streampos pos); diff --git a/include/nncase/codegen/nnil_builder.h b/include/nncase/codegen/nnil_builder.h index a453449a69..3faa2f7789 100644 --- a/include/nncase/codegen/nnil_builder.h +++ b/include/nncase/codegen/nnil_builder.h @@ -65,6 +65,8 @@ class NNCASE_API nnil_builder void emit_pow() { emit_opcode(runtime::nnil_pow); } void emit_clamp() { emit_opcode(runtime::nnil_clamp); } + // emit_erf + void emit_erf() { emit_opcode(runtime::nnil_erf); } void emit_ret() { 
emit_opcode(runtime::nnil_ret); } private: diff --git a/include/nncase/codegen/stackvm/op_writer.h b/include/nncase/codegen/stackvm/op_writer.h index 668aa9f314..dcff068765 100644 --- a/include/nncase/codegen/stackvm/op_writer.h +++ b/include/nncase/codegen/stackvm/op_writer.h @@ -1,4 +1,4 @@ -/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00. +/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00. * * Copyright 2019-2021 Canaan Inc. * @@ -1366,6 +1366,22 @@ struct op_writer } }; +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_space_to_batch_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(static_cast(op.datatype)); + writer.write(op.rshape_src); + writer.write(op.rstride_src); + writer.write(op.rstride_dest); + writer.write(op.rshape_block); + writer.write(op.rpad_crops); + } +}; + template <> struct op_writer { @@ -1449,6 +1465,97 @@ struct op_writer } }; +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_gru_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(op.input_shape_src); + writer.write(op.w_shape_src); + writer.write(op.direction); + writer.write(op.linear_before_reset); + } +}; + +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_tflite_detection_postprocess_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(op.box_shape_src); + writer.write(op.score_shape_src); + writer.write(op.anchor_shape_src); + writer.write(op.max_detections); + writer.write(op.max_classes_per_detection); + writer.write(op.detections_per_class); + writer.write(op.use_regular_non_max_suppression); + writer.write(op.nms_score_threshold); + 
writer.write(op.nms_iou_threshold); + writer.write(op.num_classes); + writer.write(op.y_scale); + writer.write(op.x_scale); + writer.write(op.h_scale); + writer.write(op.w_scale); + } +}; + +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_layer_normalization_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(static_cast(op.datatype)); + writer.write(op.input_shape); + writer.write(op.axis); + writer.write(op.epsilon); + } +}; + +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_compress_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(op.input_shape_src); + writer.write(op.condition_shape_src); + writer.write(op.axis); + } +}; + +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_gather_elements_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(op.input_shape_src); + writer.write(op.indices_shape_src); + writer.write(op.axis); + } +}; + +template <> +struct op_writer +{ + void operator()(const nncase::runtime::stackvm::tensor_instance_normalization_op_t &op, binary_writer &writer) const + { + writer.write(static_cast(op.opcode)); + writer.write(static_cast(op.funct)); + writer.write(static_cast(op.datatype)); + writer.write(op.input_shape); + writer.write(op.epsilon); + } +}; + class NNCASE_API op_builder { public: @@ -1579,11 +1686,18 @@ class NNCASE_API op_builder void tensor_sigmoid_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest); void tensor_slice_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rbegins, uint8_t rends, uint8_t rstrides); void tensor_softmax_(datatype_t datatype, uint8_t rshape_src, uint8_t 
rstride_src, uint8_t rstride_dest, int32_t axis, float beta); + void tensor_space_to_batch_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops); void tensor_ternary_(datatype_t datatype, uint8_t rshape_src1, uint8_t rstride_src1, uint8_t rshape_src2, uint8_t rstride_src2, uint8_t rshape_src3, uint8_t rstride_src3, uint8_t rstride_dest); void tensor_topk_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rshape_dest1, uint8_t rstride_dest1, uint8_t rshape_dest2, uint8_t rstride_dest2, int64_t k, int32_t axis, bool largest, bool sorted); void tensor_trilu_(datatype_t datatype, uint8_t rshape_src, bool upper, int64_t k); void tensor_unary_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, unary_op_t unary_op); void tensor_transpose_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_perm); + void tensor_gru_(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset); + void tensor_tflite_detection_postprocess_(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale); + void tensor_layer_normalization_(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon); + void tensor_compress_(uint8_t input_shape_src, uint8_t condition_shape_src, float axis); + void tensor_gather_elements_(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis); + void tensor_instance_normalization_(datatype_t datatype, uint8_t input_shape, float epsilon); private: section_writer &writer_; diff --git a/include/nncase/ir/graph.h b/include/nncase/ir/graph.h index 800833ca9c..5486b33b8c 100644 --- 
a/include/nncase/ir/graph.h +++ b/include/nncase/ir/graph.h @@ -72,7 +72,7 @@ class NNCASE_API graph void dce(); void cse(); void merge_module_regions(); - split_graph_result split_subgraph(std::span nodes); + split_graph_result split_subgraph(std::span nodes, bool reorder_input = false); graph &add_subgraph(std::unique_ptr subgraph); private: diff --git a/include/nncase/ir/ir_types.h b/include/nncase/ir/ir_types.h index 149e6d4b14..77582cb97c 100644 --- a/include/nncase/ir/ir_types.h +++ b/include/nncase/ir/ir_types.h @@ -30,7 +30,8 @@ enum node_attributes node_attr_need_quantize = 2, node_attr_fuse_input_slice = 4, node_attr_fuse_output_concat = 8, - node_attr_skip_constant_folding = 16 + node_attr_skip_constant_folding = 16, + node_attr_skip_quantize = 32, }; enum connector_attributes diff --git a/include/nncase/ir/op_utils.h b/include/nncase/ir/op_utils.h index 07aa5abc0b..0f379661d5 100644 --- a/include/nncase/ir/op_utils.h +++ b/include/nncase/ir/op_utils.h @@ -75,10 +75,22 @@ inline size_t get_bytes(datatype_t type, const shape_t &shape) return xt::compute_size(shape) * get_bytes(type); } +template +inline void compute_strides(const shape_type &shape, strides_type &strides) +{ + using strides_value_type = typename std::decay_t::value_type; + strides_value_type data_size = 1; + for (std::size_t i = shape.size(); i != 0; --i) + { + strides[i - 1] = data_size; + data_size = strides[i - 1] * static_cast(shape[i - 1]); + } +} + inline nncase::ir::shape_t to_strides(const nncase::ir::shape_t &shape) { nncase::ir::shape_t strides(shape.size()); - xt::compute_strides(shape, xt::layout_type::row_major, strides); + compute_strides(shape, strides); return strides; } @@ -373,6 +385,13 @@ inline bool is_simple_slice(const axis_t &begin, const axis_t &end, const axis_t return is_simple_slice; } +inline shape_t get_instancenorm_const_shape(const shape_t &in_shape) +{ + shape_t const_shape(in_shape.size() - 1, 1); + const_shape[0] = in_shape[1]; + return const_shape; +} + 
inline bool is_axis0_squeeze_or_expand_dim_bitcast(const shape_t &in_shape, const shape_t &out_shape) { auto in_begin = std::find_if_not(in_shape.begin(), in_shape.end(), [](size_t dim) { return dim == 1; }); diff --git a/include/nncase/ir/opcode.def b/include/nncase/ir/opcode.def index 9d656e5008..0fd5ccfb26 100644 --- a/include/nncase/ir/opcode.def +++ b/include/nncase/ir/opcode.def @@ -46,4 +46,10 @@ DEFINE_NEUTRAL_OPCODE(trilu, Trilu, 0x124) DEFINE_NEUTRAL_OPCODE(sigmoid, Sigmoid, 0x125) DEFINE_NEUTRAL_OPCODE(roi_align, RoiAlign, 0x126) DEFINE_NEUTRAL_OPCODE(compare, Compare, 0x127) -DEFINE_NEUTRAL_OPCODE(softmax, Softmax, 0x128) \ No newline at end of file +DEFINE_NEUTRAL_OPCODE(softmax, Softmax, 0x128) +DEFINE_NEUTRAL_OPCODE(gru, GRU, 0x129) +DEFINE_NEUTRAL_OPCODE(tflite_detection_postprocess, TfliteDetectionPostprocess, 0x12A) +DEFINE_NEUTRAL_OPCODE(layernorm, LayerNormalization, 0x12B) +DEFINE_NEUTRAL_OPCODE(compress, Compress, 0x12C) +DEFINE_NEUTRAL_OPCODE(gather_elements, GatherElements, 0x12D) +DEFINE_NEUTRAL_OPCODE(instancenorm, InstanceNormliaztion, 0x12E) diff --git a/include/nncase/ir/ops/compress.h b/include/nncase/ir/ops/compress.h new file mode 100644 index 0000000000..1ee9282a91 --- /dev/null +++ b/include/nncase/ir/ops/compress.h @@ -0,0 +1,40 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../node.h" +#include + +namespace nncase::ir +{ +class NNCASE_API compress : public node +{ +public: + DEFINE_NODE_OPCODE(op_compress); + + input_connector &input() { return input_at(0); } + input_connector &condition() { return input_at(1); } + output_connector &output() { return output_at(0); } + + int32_t axis() const noexcept { return axis_; } + + compress(datatype_t type, shape_t input_shape, shape_t condition_shape, shape_t output_shape, int32_t axis); + +protected: + bool properties_equal(node &other) const override; + +private: + int32_t axis_; +}; +} diff --git a/include/nncase/ir/ops/gather_elements.h b/include/nncase/ir/ops/gather_elements.h new file mode 100644 index 0000000000..fd9bf44c95 --- /dev/null +++ b/include/nncase/ir/ops/gather_elements.h @@ -0,0 +1,40 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../node.h" +#include + +namespace nncase::ir +{ +class NNCASE_API gather_elements : public node +{ +public: + DEFINE_NODE_OPCODE(op_gather_elements); + + input_connector &input() { return input_at(0); } + input_connector &indices() { return input_at(1); } + output_connector &output() { return output_at(0); } + + int32_t axis() const noexcept { return axis_; } + + gather_elements(datatype_t in_type, datatype_t indices_type, shape_t input_shape, shape_t indices_shape, shape_t output_shape, int32_t axis); + +protected: + bool properties_equal(node &other) const override; + +private: + int32_t axis_; +}; +} diff --git a/include/nncase/ir/ops/gru.h b/include/nncase/ir/ops/gru.h new file mode 100644 index 0000000000..6c27e9e87a --- /dev/null +++ b/include/nncase/ir/ops/gru.h @@ -0,0 +1,50 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../node.h" +#include + +namespace nncase::ir +{ +class NNCASE_API gru : public node +{ +public: + DEFINE_NODE_OPCODE(op_gru); + + input_connector &input() { return input_at(0); } + input_connector &w() { return input_at(1); } + input_connector &r() { return input_at(2); } + input_connector &b() { return input_at(3); } + input_connector &initial_h() { return input_at(4); } + input_connector &initial_c() { return input_at(5); } + output_connector &output() { return output_at(0); } + output_connector &output_h() { return output_at(1); } + + lstm_direction direction() const noexcept { return direction_; } + std::string framework() const noexcept { return framework_; } + bool linear_before_reset() const noexcept { return linear_before_reset_; } + + gru(shape_t input_shape, shape_t w_shape, shape_t r_shape, shape_t b_shape, shape_t output_shape, + shape_t output_h_shape, lstm_direction direction, std::string framework, bool linear_before_reset); + +protected: + bool properties_equal(node &other) const override; + +private: + lstm_direction direction_; + std::string framework_; + bool linear_before_reset_; +}; +} diff --git a/include/nncase/ir/ops/instancenorm.h b/include/nncase/ir/ops/instancenorm.h new file mode 100644 index 0000000000..9902f99bc0 --- /dev/null +++ b/include/nncase/ir/ops/instancenorm.h @@ -0,0 +1,39 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+#include "../node.h"
+#include "nncase/ir/connectors.h"
+
+namespace nncase::ir
+{
+class NNCASE_API instancenorm : public node // IR node with three inputs (input, scale, bias), one output and a float epsilon attribute
+{
+public:
+    DEFINE_NODE_OPCODE(op_instancenorm);
+    // Named views onto the node's connector slots; the slot numbers are the contract.
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &scale() { return input_at(1); } // input slot 1
+    input_connector &bias() { return input_at(2); } // input slot 2
+    output_connector &output() { return output_at(0); } // output slot 0
+    float epsilon() const noexcept { return epsilon_; } // returns epsilon_ unchanged
+    instancenorm(datatype_t input_type, shape_t input_shape, float epsilon); // declared only; no scale/bias shapes are passed, so presumably they are derived from input_shape - confirm in the .cpp
+
+protected:
+    bool properties_equal(node &other) const override; // overrides a base-class virtual; defined outside this header
+
+private:
+    float epsilon_; // value handed back by epsilon()
+};
+}
diff --git a/include/nncase/ir/ops/layernorm.h b/include/nncase/ir/ops/layernorm.h new file mode 100644 index 0000000000..79581c6f92 --- /dev/null +++ b/include/nncase/ir/ops/layernorm.h @@ -0,0 +1,42 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include "nncase/ir/connectors.h"
+
+namespace nncase::ir
+{
+class NNCASE_API layernorm : public node // IR node with three inputs (input, scale, bias), one output, and axis + epsilon attributes
+{
+public:
+    DEFINE_NODE_OPCODE(op_layernorm);
+    // Named views onto the node's connector slots; the slot numbers are the contract.
+    input_connector &input() { return input_at(0); } // input slot 0
+    input_connector &scale() { return input_at(1); } // input slot 1
+    input_connector &bias() { return input_at(2); } // input slot 2
+    output_connector &output() { return output_at(0); } // output slot 0
+    int32_t axis() const noexcept { return axis_; } // returns axis_ unchanged
+    float epsilon() const noexcept { return epsilon_; } // returns epsilon_ unchanged
+    layernorm(datatype_t input_type, shape_t input_shape, int32_t axis, float epsilon); // declared only; no scale/bias shapes are passed, so presumably they are derived from input_shape and axis - confirm in the .cpp
+
+protected:
+    bool properties_equal(node &other) const override; // overrides a base-class virtual; defined outside this header
+
+private:
+    int32_t axis_; // value handed back by axis()
+    float epsilon_; // value handed back by epsilon()
+    // TODO: support stash_type (no member stores it yet)
+};
+}
diff --git a/include/nncase/ir/ops/tflite_detection_postprocess.h b/include/nncase/ir/ops/tflite_detection_postprocess.h new file mode 100644 index 0000000000..1d88c3abfa --- /dev/null +++ b/include/nncase/ir/ops/tflite_detection_postprocess.h @@ -0,0 +1,74 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+#pragma once
+#include "../node.h"
+#include
+
+namespace nncase::ir
+{
+class NNCASE_API tflite_detection_postprocess : public node // IR node with three inputs, four outputs and eleven scalar attributes
+{
+public:
+    DEFINE_NODE_OPCODE(op_tflite_detection_postprocess);
+    // Named views onto the node's connector slots; the slot numbers are the contract.
+    input_connector &boxes() { return input_at(0); } // input slot 0
+    input_connector &scores() { return input_at(1); } // input slot 1
+    input_connector &anchors() { return input_at(2); } // input slot 2
+    output_connector &output_locations() { return output_at(0); } // output slot 0
+    output_connector &output_classes() { return output_at(1); } // output slot 1
+    output_connector &output_scores() { return output_at(2); } // output slot 2
+    output_connector &output_num_detections() { return output_at(3); } // output slot 3
+    // Read-only attribute accessors; each returns the private field of the same name.
+    int32_t max_detections() const noexcept { return max_detections_; }
+    int32_t max_classes_per_detection() const noexcept { return max_classes_per_detection_; }
+    int32_t detections_per_class() const noexcept { return detections_per_class_; }
+    bool use_regular_non_max_suppression() const noexcept { return use_regular_non_max_suppression_; }
+    float nms_score_threshold() const noexcept { return nms_score_threshold_; }
+    float nms_iou_threshold() const noexcept { return nms_iou_threshold_; }
+    int32_t num_classes() const noexcept { return num_classes_; }
+    float y_scale() const noexcept { return y_scale_; }
+    float x_scale() const noexcept { return x_scale_; }
+    float h_scale() const noexcept { return h_scale_; }
+    float w_scale() const noexcept { return w_scale_; }
+    // Declared only: three input shapes, four output shapes (in slot order), then the attributes in accessor order.
+    tflite_detection_postprocess(
+        shape_t boxes_shape, shape_t scores_shape, shape_t anchors_shape,
+        shape_t output_shape_0, shape_t output_shape_1, shape_t output_shape_2, shape_t output_shape_3,
+        int32_t max_detections,
+        int32_t max_classes_per_detection,
+        int32_t detections_per_class,
+        bool use_regular_non_max_suppression,
+        float nms_score_threshold,
+        float nms_iou_threshold,
+        int32_t num_classes,
+        float y_scale, float x_scale, float h_scale, float w_scale);
+
+protected:
+    bool properties_equal(node &other) const override; // overrides a base-class virtual; defined outside this header
+
+private:
+    int32_t max_detections_;
+    int32_t max_classes_per_detection_;
+    int32_t detections_per_class_;
+    bool use_regular_non_max_suppression_;
+    float nms_score_threshold_;
+    float nms_iou_threshold_;
+    int32_t num_classes_;
+    float y_scale_; // the four *_scale_ fields are presumably the box-decoding divisors for y, x, h, w - TODO confirm against the kernel
+    float x_scale_;
+    float h_scale_;
+    float w_scale_;
+};
+}
diff --git a/include/nncase/ir/quantizer.h b/include/nncase/ir/quantizer.h index dcf8147619..39a83243b1 100644 --- a/include/nncase/ir/quantizer.h +++ b/include/nncase/ir/quantizer.h @@ -100,8 +100,8 @@ class NNCASE_API quantizer auto r = range.max - range.min; if (r == 0) r = 0.1f; - else if (r < 0.01f) - r = 0.01f; + // else if (r < 0.01f) + // r = 0.01f; range.max = range.min + r; } diff --git a/include/nncase/kernels/cpu/optimized/tensor_compute.h b/include/nncase/kernels/cpu/optimized/tensor_compute.h index 2b19afea12..89b81f34f1 100644 --- a/include/nncase/kernels/cpu/optimized/tensor_compute.h +++ b/include/nncase/kernels/cpu/optimized/tensor_compute.h @@ -85,4 +85,19 @@ template NNCASE_API result sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides) noexcept; +template +NNCASE_API result instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept; + +template +NNCASE_API result layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept; + +template +NNCASE_API result ternary(const float *input_a, const T *input_b, const T *input_c, T *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + +template +result reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const
runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; END_NS_NNCASE_KERNELS_CPU_OPT diff --git a/include/nncase/kernels/cpu/reference/tensor_compute.h b/include/nncase/kernels/cpu/reference/tensor_compute.h index aba8774ec9..bc5c3c14db 100644 --- a/include/nncase/kernels/cpu/reference/tensor_compute.h +++ b/include/nncase/kernels/cpu/reference/tensor_compute.h @@ -13,140 +13,255 @@ * limitations under the License. */ #pragma once + #include "runtime_types.h" #include BEGIN_NS_NNCASE_KERNELS_CPU_REF -NNCASE_API result batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, +NNCASE_API result batch_to_space(datatype_t type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, + const runtime_shape_t &block_shape, + const runtime_paddings_t &crops, + const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, kernel_context &context) noexcept; -NNCASE_API result broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, kernel_context &context) noexcept; +NNCASE_API result +broadcast(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_shape, + const runtime_shape_t &out_strides, kernel_context &context) noexcept; -NNCASE_API result concat(datatype_t type, gsl::span inputs, gsl::byte *output, const runtime_shape_t &out_shape, - gsl::span in_strides, const runtime_shape_t &out_strides, size_t axis, const runtime_shape_t &concat_dims, +NNCASE_API result +concat(datatype_t type, gsl::span inputs, gsl::byte *output, + const runtime_shape_t &out_shape, + gsl::span 
in_strides, const runtime_shape_t &out_strides, size_t axis, + const runtime_shape_t &concat_dims, kernel_context &context) noexcept; -NNCASE_API result convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, - const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept; +NNCASE_API result +convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, kernel_context &context) noexcept; NNCASE_API result copy(datatype_t type, const gsl::byte *src, gsl::byte *dest, - const runtime_shape_t &shape, const runtime_shape_t &src_strides, const runtime_shape_t &dest_strides, kernel_context &context) noexcept; + const runtime_shape_t &shape, const runtime_shape_t &src_strides, + const runtime_shape_t &dest_strides, kernel_context &context) noexcept; -NNCASE_API result transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &perm, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept; +NNCASE_API result +transpose(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &perm, const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, kernel_context &context) noexcept; template NNCASE_API result binary(binary_op_t op, const T *input_a, const T *input_b, T *output, - const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, - const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, value_range fused_activation, kernel_context &context) noexcept; + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, + const 
runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, + const runtime_shape_t &out_strides, value_range fused_activation, + kernel_context &context) noexcept; -NNCASE_API result dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, - const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias, +NNCASE_API result +dequantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, float scale, float bias, kernel_context &context) noexcept; template NNCASE_API result compare(compare_op_t op, const T *input_a, const T *input_b, bool *output, const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, const runtime_shape_t &in_b_strides, - const runtime_shape_t &out_shape, const runtime_shape_t &out_strides) noexcept; + const runtime_shape_t &out_shape, + const runtime_shape_t &out_strides) noexcept; -NNCASE_API result lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, const runtime_shape_t &shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, const scalar &max) noexcept; +NNCASE_API result +lut1d(datatype_t type, const gsl::byte *input, const gsl::byte *table, gsl::byte *output, + const runtime_shape_t &shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const scalar &min, + const scalar &max) noexcept; template NNCASE_API result matmul(const T *input_a, const T *input_b, const T *bias, T *output, - const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, - const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, const runtime_shape_t &out_strides, + const 
runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, + const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &out_shape, + const runtime_shape_t &out_strides, value_range fused_activation) noexcept; -NNCASE_API result onehot(datatype_t type, const int32_t *indices, gsl::byte *output, const runtime_shape_t &indices_shape, const runtime_shape_t &out_shape, - const runtime_shape_t &out_strides, gsl::byte *depth, gsl::byte *off_value, gsl::byte *on_value, size_t axis, onehot_mode_t mode, kernel_context &context) noexcept; +NNCASE_API result +onehot(datatype_t type, const int32_t *indices, gsl::byte *output, const runtime_shape_t &indices_shape, + const runtime_shape_t &out_shape, + const runtime_shape_t &out_strides, gsl::byte *depth, gsl::byte *off_value, gsl::byte *on_value, + size_t axis, onehot_mode_t mode, kernel_context &context) noexcept; -NNCASE_API result pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value, +NNCASE_API result +pad(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, + const runtime_paddings_t &paddings, pad_mode_t mode, const scalar &pad_value, kernel_context &context) noexcept; -NNCASE_API result quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, - const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, float scale, float bias, +NNCASE_API result +quantize(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, float scale, float bias, kernel_context &context) 
noexcept; -NNCASE_API result unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept; +NNCASE_API result +unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, + kernel_context &context) noexcept; template -NNCASE_API result reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; +NNCASE_API result +reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, + const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, + kernel_context &context) noexcept; template -NNCASE_API result reduce_arg(reduce_arg_op_t op, const float *input, T *output, const runtime_shape_t &in_shape, +NNCASE_API result +reduce_arg(reduce_arg_op_t op, const float *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, - const runtime_shape_t &axis, bool keep_dims, bool select_last_idx, kernel_context &context) noexcept; + const runtime_shape_t &axis, bool keep_dims, bool select_last_idx, + kernel_context &context) noexcept; template result reduce_prod(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &axes, bool keep_dims) noexcept; -NNCASE_API result resize_bilinear(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers, 
+NNCASE_API result resize_bilinear(datatype_t type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, + const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, int32_t out_h, + int32_t out_w, bool align_corners, bool half_pixel_centers, kernel_context &context) noexcept; -NNCASE_API result resize_nearest_neighbor(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers, +NNCASE_API result +resize_nearest_neighbor(datatype_t type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, + int32_t out_h, int32_t out_w, bool align_corners, bool half_pixel_centers, kernel_context &context) noexcept; -NNCASE_API result slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const runtime_shape_t &begins, const runtime_axis_t &ends, const runtime_axis_t &strides, +NNCASE_API result +slice(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, + const runtime_shape_t &begins, const runtime_axis_t &ends, const runtime_axis_t &strides, kernel_context &context) noexcept; -NNCASE_API result gather(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, const runtime_shape_t &out_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, const runtime_shape_t &indices_shape, size_t axis, kernel_context &context) noexcept; +NNCASE_API result +gather(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const 
runtime_shape_t &out_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, + const runtime_shape_t &indices_shape, size_t axis, kernel_context &context) noexcept; -NNCASE_API result gather_nd(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, const runtime_shape_t &out_shape, - const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, const runtime_shape_t &indices_shape, size_t batch_dims, kernel_context &context) noexcept; +NNCASE_API result +gather_nd(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &out_shape, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, const int32_t *indices, + const runtime_shape_t &indices_shape, size_t batch_dims, kernel_context &context) noexcept; template NNCASE_API result cumsum(const T *input, T *output, const runtime_shape_t &in_shape, int32_t axis, bool exclusive, bool reverse) noexcept; template -NNCASE_API result hardmax(const T *input, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, +NNCASE_API result +hardmax(const T *input, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, T *output, int32_t axis) noexcept; template -NNCASE_API result random_normal(T *output, const runtime_shape_t &out_shape, float mean, float std, float seed) noexcept; +NNCASE_API result +random_normal(T *output, const runtime_shape_t &out_shape, float mean, float std, float seed) noexcept; template -NNCASE_API result random_uniform(T *output, const runtime_shape_t &out_shape, float low, float high, float seed) noexcept; +NNCASE_API result +random_uniform(T *output, const runtime_shape_t &out_shape, float low, float high, float seed) noexcept; template -NNCASE_API result roi_align(const T *input, const T *rois, int64_t *batch_indices, T *output, const runtime_shape_t &in_shape, - const runtime_shape_t 
&out_shape, roi_align_mode_t mode, float spatial_scale, int64_t sampling_ratio) noexcept; +NNCASE_API result roi_align(const T *input, const T *rois, int64_t *batch_indices, T *output, + const runtime_shape_t &in_shape, + const runtime_shape_t &out_shape, roi_align_mode_t mode, + float spatial_scale, int64_t sampling_ratio) noexcept; template -NNCASE_API result sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides) noexcept; +NNCASE_API result +sigmoid(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides) noexcept; template -NNCASE_API result softmax(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, +NNCASE_API result +softmax(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, int32_t axis, float beta) noexcept; template NNCASE_API result ternary(const float *input_a, const T *input_b, const T *input_c, T *output, - const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, - const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, + const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, + const runtime_shape_t &in_c_strides, const runtime_shape_t &out_strides) noexcept; template NNCASE_API result topk(const T *input, T *output_values, int64_t *output_indices, const runtime_shape_t &in_shape, const runtime_shape_t &in_strides, - const runtime_shape_t &output_values_shape, const runtime_shape_t &output_values_strides, - const runtime_shape_t &output_indices_shape, const runtime_shape_t &output_indices_strides, - const int64_t k, const int32_t axis, const bool 
largest, const bool sorted) noexcept; + const runtime_shape_t &output_values_shape, + const runtime_shape_t &output_values_strides, + const runtime_shape_t &output_indices_shape, + const runtime_shape_t &output_indices_strides, + const int64_t k, const int32_t axis, const bool largest, + const bool sorted) noexcept; + +template +NNCASE_API result +trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper, + const int64_t k) noexcept; + +template +NNCASE_API result +gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, + const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept; + +template +NNCASE_API result +tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, + T *output_classes, T *output_scores, T *output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, + const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, + const int32_t detections_per_class, + const bool use_regular_non_max_suppression, + const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, + const float h_scale, const float w_scale) noexcept; + +NNCASE_API result space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output, + const runtime_shape_t &in_shape, + const runtime_shape_t &block_shape, + const runtime_paddings_t &paddings, + const runtime_shape_t &in_strides, + const runtime_shape_t &out_strides, + kernel_context &context) noexcept; + +template +NNCASE_API result +gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const int axis) noexcept; + +template +NNCASE_API result +instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t 
&in_shape, + float epsilon) noexcept; + +template +NNCASE_API result +layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, + float epsilon) noexcept; template -NNCASE_API result trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper, const int64_t k) noexcept; +NNCASE_API result +compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, + const runtime_shape_t &condition_shape, const int axis) noexcept; END_NS_NNCASE_KERNELS_CPU_REF diff --git a/include/nncase/kernels/neutral/neutral_kernels.h b/include/nncase/kernels/neutral/neutral_kernels.h index 2a1ee447d5..28add99dbb 100644 --- a/include/nncase/kernels/neutral/neutral_kernels.h +++ b/include/nncase/kernels/neutral/neutral_kernels.h @@ -816,4 +816,575 @@ inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTR for (size_t i = 0; i < size; i++) output[i] = table[input[i]]; }
+/* GRU over a whole sequence, written against raw pointers.
+ *
+ * Sizes come from the shapes: input_shape = {seq_length, batch_size, input_size},
+ * num_direction = w_shape[0], hidden_size = w_shape[1] / 3.
+ * Per direction, w and r each hold three gate blocks in the order z, r, h, and b
+ * holds six hidden_size-long vectors: the input-side biases for z, r, h followed
+ * by the recurrent-side biases for z, r, h.
+ * output is indexed as [step][direction][batch][hidden]; output_h as
+ * [direction][batch][hidden].
+ * initial_h is read AND overwritten: the running state h_t points into it.
+ * mode is compared with lstm_direction::kReverse to choose the time order of the
+ * first direction; a second direction (d == 1) walks the opposite order.
+ * Only the "not linear" candidate formula is active; the linear variant is
+ * commented out below.
+ */
+template
+void gru(const T *CXX_RESTRICT input, const T *CXX_RESTRICT w, const T *CXX_RESTRICT r, const T *CXX_RESTRICT b, T *CXX_RESTRICT initial_h, T *CXX_RESTRICT output, T *CXX_RESTRICT output_h,
+    const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode)
+{
+    const int seq_length = input_shape[0];
+    const int batch_size = input_shape[1];
+    const int input_size = input_shape[2];
+    const int num_direction = w_shape[0];
+    const int hidden_size = w_shape[1] / 3; // w stacks the three gates along this dimension
+
+    auto sigmoid = [&](float x) {
+        return 1 / (1 + std::exp(-x));
+    };
+    auto tanh = [&](float x) {
+        return std::tanh(x);
+    };
+    runtime_shape_t out_shape { (size_t)seq_length, (size_t)num_direction, (size_t)batch_size, (size_t)hidden_size }; // not referenced again in this function
+
+    auto x_gate_size = batch_size * input_size; // elements of input consumed per time step
+    auto w_gate_size = 3 * hidden_size * input_size; // elements of w per direction
+    auto h_t_size = batch_size * hidden_size; // elements of hidden state per direction
+    auto r_gate_size = 3 * hidden_size * hidden_size; // elements of r per direction
+
+    auto tmp_a = std::vector(batch_size * hidden_size, 0.f); // accumulator for the input-side term of a gate
+    auto tmp_b = std::vector(batch_size * hidden_size, 0.f); // accumulator for the recurrent-side term of a gate
+    auto gate_z = std::vector(batch_size * hidden_size, 0.f);
+    auto gate_r = std::vector(batch_size * hidden_size, 0.f);
+    auto gate_h = std::vector(batch_size * hidden_size, 0.f);
+
+    std::vector seq_len_loop; // time-step indices in the order they will be visited
+    for (int l = 0; l < seq_length; l++)
+        seq_len_loop.push_back(l);
+    if (mode == lstm_direction::kReverse)
+        std::reverse(seq_len_loop.begin(), seq_len_loop.end());
+    auto x_i = input;
+    auto h_t = initial_h;
+    auto w_i = w;
+    auto r_i = r;
+    auto b_i = b;
+    for (int d = 0; d < num_direction; d++)
+    {
+        h_t = initial_h + d * h_t_size; // state is updated in place inside the caller's initial_h buffer
+        w_i = w + d * w_gate_size;
+        r_i = r + d * r_gate_size;
+        b_i = b + d * 6 * hidden_size; // six bias vectors per direction
+        if (d == 1)
+            std::reverse(seq_len_loop.begin(), seq_len_loop.end()); // second direction walks the opposite time order
+        for (auto i : seq_len_loop)
+        {
+            x_i = input + i * x_gate_size;
+            // clean gate_z gate_r gate_h
+            std::fill(gate_z.begin(), gate_z.end(), 0.f);
+            std::fill(gate_r.begin(), gate_r.end(), 0.f);
+            std::fill(gate_h.begin(), gate_h.end(), 0.f);
+
+            // clean tmp_a tmp_b
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_z = x_i * w_i_z + b_w_z + h_t *r_i_z + b_r_z
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hs * input_size + is]; // z block of w starts at offset 0
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[hs];
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hs * hidden_size + rs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[3 * hidden_size + hs];
+                    gate_z[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                }
+            }
+            // gate_z = sigmoid(gate_z);
+            std::transform(gate_z.begin(), gate_z.end(), gate_z.begin(), sigmoid);
+
+            // clear tmp_a tmp_b
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_r = x_i * w_i_r + b_w_r + h_t *r_i_r + b_r_r
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hidden_size * input_size + hs * input_size + is]; // r block of w is the second block
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[hidden_size + hs];
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hidden_size * hidden_size + hs * hidden_size + rs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[4 * hidden_size + hs];
+                    gate_r[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                }
+            }
+            // gate_r = sigmoid(gate_r);
+            std::transform(gate_r.begin(), gate_r.end(), gate_r.begin(), sigmoid);
+
+            // clear tmp_a tmp_b
+            std::fill(tmp_a.begin(), tmp_a.end(), 0.f);
+            std::fill(tmp_b.begin(), tmp_b.end(), 0.f);
+            // gate_h = x_i * w_i_h + b_w_h + gate_r·h_t *r_i_h + b_r_h
+            for (int bs = 0; bs < batch_size; bs++)
+            {
+                for (int hs = 0; hs < hidden_size; hs++)
+                {
+                    for (int is = 0; is < input_size; is++)
+                    {
+                        tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[2 * hidden_size * input_size + hs * input_size + is]; // h block of w is the third block
+                    }
+                    tmp_a[bs * hidden_size + hs] += b_i[2 * hidden_size + hs];
+                    for (int rs = 0; rs < hidden_size; rs++)
+                    {
+                        // if not linear
+                        tmp_b[bs * hidden_size + hs] += gate_r[bs * hidden_size + rs] * h_t[bs * hidden_size + rs] * r_i[2 * hidden_size * hidden_size + hs * hidden_size + rs];
+                        // if linear
+                        // NOTE(review): this disabled variant strides by batch_size and reads the r block at offset hidden_size * hidden_size - re-check it before enabling.
+                        // tmp_b[bs * batch_size + hs] += h_t[bs * batch_size + rs] * r_i[hidden_size * hidden_size + hs * hidden_size + rs] + b_i[5 * hidden_size + hs];
+                    }
+                    tmp_b[bs * hidden_size + hs] += b_i[5 * hidden_size + hs];
+
+                    // if not linear
+                    gate_h[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs];
+                    // if linear
+                    // gate_h[bs * batch_size + hs] = tmp_a[bs * batch_size + hs] + gate_r[bs * batch_size + rs] * tmp_b[bs * batch_size + hs];
+                }
+            }
+            // gate_h = tanh(gate_h);
+            std::transform(gate_h.begin(), gate_h.end(), gate_h.begin(), tanh);
+
+            for (int k = 0; k < batch_size * hidden_size; k++)
+            {
+                h_t[k] = (1 - gate_z[k]) * gate_h[k] + gate_z[k] * h_t[k]; // blend candidate and previous state; overwrites the state in place
+                // *output++ = h_t[k];
+                output[i * (num_direction * batch_size * hidden_size) + d * (batch_size * hidden_size) + k] = h_t[k]; // stored at the step's own index i, whatever order the steps are visited in
+            }
+        }
+        // if (mode == lstm_direction::kReverse || d == 1)
+        // h_t.reverse();
+        for (int k = 0; k < batch_size * hidden_size; k++)
+        {
+            output_h[d * (batch_size * hidden_size) + k] = h_t[k]; // state after the last visited step of this direction
+        }
+    }
+}
+ +template +void tflite_detection_postprocess(const T *CXX_RESTRICT boxes, const T *CXX_RESTRICT scores, const T *CXX_RESTRICT anchors, T *CXX_RESTRICT output_locations, T *CXX_RESTRICT output_classes, T *CXX_RESTRICT output_scores, T *CXX_RESTRICT output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, NNCASE_UNUSED const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) +{ + struct CenterSizeEncoding + { + float y; + float x; + float h; + float w; + }; + struct BoxCornerEncoding + { + float ymin; + float xmin; + float ymax; + float xmax; + }; + struct BoxInfo + { + int index; + float score; + }; + + auto compute_iou = [&](const std::vector &box, const int &i, const int &j) { + auto &box_i = box[i]; + auto &box_j = box[j]; + const float area_i = (box_i.ymax - box_i.ymin) * (box_i.xmax - box_i.xmin); + const float area_j = (box_j.ymax - box_j.ymin) * (box_j.xmax - box_j.xmin); + if (area_i <= 0 || area_j <= 0) + return 0.f; + const float intersection_y_min = std::max(box_i.ymin, box_j.ymin); + const float intersection_x_min = std::max(box_i.xmin, box_j.xmin); + const float
intersection_y_max = std::min(box_i.ymax, box_j.ymax); + const float intersection_x_max = std::min(box_i.xmax, box_j.xmax); + const float intersection_area = std::max(intersection_y_max - intersection_y_min, 0.0) * std::max(intersection_x_max - intersection_x_min, 0.0); + return intersection_area / (area_i + area_j - intersection_area); + }; + + const auto num_boxes = (int)anchors_shape[0]; + const auto num_classes_with_background = (int)scores_shape[2]; // num_classes + background + const auto num_detections_per_class = std::min(detections_per_class, max_detections); + int label_offset = num_classes_with_background - num_classes; + // DecodeCenterSizeBoxes: get decoded_boxes + std::vector decoded_boxes(boxes_shape[1]); + { + + CenterSizeEncoding box_center_size; + CenterSizeEncoding scale_values { y_scale, x_scale, h_scale, w_scale }; + CenterSizeEncoding anchor; + + for (int index = 0; index < num_boxes; index++) + { + const auto box_encoding_index = index * boxes_shape[2]; + box_center_size = *reinterpret_cast(boxes + box_encoding_index); + anchor = *reinterpret_cast(anchors + box_encoding_index); + + auto y_center = static_cast(static_cast(box_center_size.y) / static_cast(scale_values.y) * static_cast(anchor.h) + static_cast(anchor.y)); + auto x_center = static_cast(static_cast(box_center_size.x) / static_cast(scale_values.x) * static_cast(anchor.w) + static_cast(anchor.x)); + auto half_h = static_cast(0.5 * (std::exp(static_cast(box_center_size.h) / static_cast(scale_values.h))) * static_cast(anchor.h)); + auto half_w = static_cast(0.5 * (std::exp(static_cast(box_center_size.w) / static_cast(scale_values.w))) * static_cast(anchor.w)); + decoded_boxes[index].ymin = y_center - half_h; + decoded_boxes[index].xmin = x_center - half_w; + decoded_boxes[index].ymax = y_center + half_h; + decoded_boxes[index].xmax = x_center + half_w; + } + } + // NMS MultiClass + { + if (use_regular_non_max_suppression) + { + // NMS Regular + int sorted_indices_size = 0; + 
std::vector box_info_after_regular_nms(max_detections + num_detections_per_class); + std::vector num_selected(num_classes); + + // compute nms + std::vector class_scores(num_boxes); + std::vector selected; + selected.reserve(num_detections_per_class); + + for (auto col = 0; col < num_classes - 1; col++) + { + const float *scores_base = scores + col + label_offset; + for (int row = 0; row < num_boxes; row++) + { + // Get scores of boxes corresponding to all anchors for single class + class_scores[row] = *scores_base; + scores_base += num_classes_with_background; + } + // Perform non-maximal suppression on single class + selected.clear(); + + // NMS SingleClass + { + std::vector keep_indices; + std::vector keep_scores; + // select detection box score above score threshold + { + for (size_t i = 0; i < class_scores.size(); i++) + { + if (class_scores[i] >= nms_score_threshold) + { + keep_scores.emplace_back(class_scores[i]); + keep_indices.emplace_back(i); + } + } + } + + int num_scores_kept = (int)keep_scores.size(); + std::vector sorted_indices; + sorted_indices.resize(num_scores_kept); + // DecreasingArgSort + { + std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0); + std::stable_sort( + sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, + [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; }); + } + + const int output_size = std::min(num_scores_kept, max_detections); + selected.clear(); + int num_active_candidate = num_scores_kept; + std::vector active_box_candidate(num_scores_kept, 1); + for (int i = 0; i < num_scores_kept; ++i) + { + if (num_active_candidate == 0 || (int)selected.size() >= output_size) + break; + if (active_box_candidate[i] == 1) + { + selected.push_back(keep_indices[sorted_indices[i]]); + active_box_candidate[i] = 0; + num_active_candidate--; + } + else + { + continue; + } + for (int j = i + 1; j < num_scores_kept; ++j) + { + if (active_box_candidate[j] == 1) + { + + float 
iou = compute_iou( + decoded_boxes, keep_indices[sorted_indices[i]], + keep_indices[sorted_indices[j]]); + + if (iou > nms_iou_threshold) + { + active_box_candidate[j] = 0; + num_active_candidate--; + } + } + } + } + } + // end NMS SingleClass + + if (selected.empty()) + { + continue; + } + for (size_t i = 0; i < selected.size(); ++i) + { + box_info_after_regular_nms[sorted_indices_size + i].score = class_scores[selected[i]]; + box_info_after_regular_nms[sorted_indices_size + i].index = (selected[i] * num_classes_with_background + col + label_offset); + } + + // In-place merge the original boxes and new selected boxes which are both + // sorted by scores. + std::inplace_merge(box_info_after_regular_nms.begin(), box_info_after_regular_nms.begin() + sorted_indices_size, + box_info_after_regular_nms.begin() + sorted_indices_size + selected.size(), + [](const BoxInfo &a, const BoxInfo &b) { return a.score >= b.score; }); + + sorted_indices_size = std::min(sorted_indices_size + static_cast(selected.size()), max_detections); + } + // end compute nms result + + // Allocate output tensors + for (int output_box_index = 0; output_box_index < max_detections; output_box_index++) + { + if (output_box_index < sorted_indices_size) + { + const int anchor_index = floor( + box_info_after_regular_nms[output_box_index].index / num_classes_with_background); + const int class_index = box_info_after_regular_nms[output_box_index].index - anchor_index * num_classes_with_background - label_offset; + const float selected_score = box_info_after_regular_nms[output_box_index].score; + // detection_boxes + reinterpret_cast(output_locations)[output_box_index] = decoded_boxes[anchor_index]; + // detection_classes + output_classes[output_box_index] = class_index; + // detection_scores + output_scores[output_box_index] = selected_score; + } + else + { + // detection_boxes + reinterpret_cast(output_locations)[output_box_index] = { 0.0f, 0.0f, 0.0f, 0.0f }; + // detection_classes + 
output_classes[output_box_index] = 0.0f; + // detection_scores + output_scores[output_box_index] = 0.0f; + } + } + output_num_detections[0] = sorted_indices_size; + box_info_after_regular_nms.clear(); + } + else + { + // Fast NMS + + const int max_categories_per_anchor = max_classes_per_detection; + const int num_categories_per_anchor = std::min(max_categories_per_anchor, num_classes); + + std::vector max_scores; + max_scores.resize(num_boxes); + std::vector sorted_class_indices; + sorted_class_indices.resize(num_boxes * num_categories_per_anchor); + + for (int row = 0; row < num_boxes; row++) + { + const float *box_scores = scores + row * num_classes_with_background + label_offset; + int *class_indices = sorted_class_indices.data() + row * num_categories_per_anchor; + + // DecreasingPartialArgSort + if (num_categories_per_anchor == 1) + { + auto arg_max_vector = [&](const T *input_data, int size) { + T max_value = input_data[0]; + int max_index = 0; + for (int i = 1; i < size; ++i) + { + // const T curr_value = input_data[i]; + if (input_data[i] > max_value) + { + max_value = input_data[i]; + max_index = i; + } + } + return max_index; + }; + class_indices[0] = arg_max_vector(box_scores, num_classes); + } + else + { + std::iota(class_indices, class_indices + num_classes, 0); + std::partial_sort( + class_indices, class_indices + num_categories_per_anchor, class_indices + num_classes, + [&box_scores](const int i, const int j) { return box_scores[i] > box_scores[j]; }); + } + // end DecreasingPartialArgSort + + max_scores[row] = box_scores[class_indices[0]]; + } + std::vector selected; + // NMS SingleClass + { + std::vector keep_indices; + std::vector keep_scores; + // select detection box score above score threshold + { + for (size_t i = 0; i < max_scores.size(); i++) + { + if (max_scores[i] >= nms_score_threshold) + { + keep_scores.emplace_back(max_scores[i]); + keep_indices.emplace_back(i); + } + } + } + + int num_scores_kept = (int)keep_scores.size(); + 
std::vector sorted_indices; + sorted_indices.resize(num_scores_kept); + // DecreasingArgSort + { + std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0); + std::stable_sort( + sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, + [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; }); + } + const int output_size = std::min(num_scores_kept, max_detections); + selected.clear(); + int num_active_candidate = num_scores_kept; + std::vector active_box_candidate(num_scores_kept, 1); + for (int i = 0; i < num_scores_kept; ++i) + { + if (num_active_candidate == 0 || (int)selected.size() >= output_size) + break; + if (active_box_candidate[i] == 1) + { + selected.push_back(keep_indices[sorted_indices[i]]); + active_box_candidate[i] = 0; + num_active_candidate--; + } + else + { + continue; + } + for (int j = i + 1; j < num_scores_kept; ++j) + { + if (active_box_candidate[j] == 1) + { + + float iou = compute_iou( + decoded_boxes, keep_indices[sorted_indices[i]], + keep_indices[sorted_indices[j]]); + if (iou > nms_iou_threshold) + { + active_box_candidate[j] = 0; + num_active_candidate--; + } + } + } + } + } + // end NMS SingleClass + + // Allocate output tensors + int output_box_index = 0; + for (const auto &selected_index : selected) + { + const float *box_scores = scores + selected_index * num_classes_with_background + label_offset; + const int *class_indices = sorted_class_indices.data() + selected_index * num_categories_per_anchor; + + for (int col = 0; col < num_categories_per_anchor; ++col) + { + int box_offset = max_categories_per_anchor * output_box_index + col; + // detection_boxes + reinterpret_cast(output_locations)[box_offset] = decoded_boxes[selected_index]; + // detection_classes + output_classes[box_offset] = class_indices[col]; + // detection_scores + output_scores[box_offset] = box_scores[class_indices[col]]; + } + output_box_index++; + } + output_num_detections[0] = output_box_index; + } + } 
+} + +inline void layernorm(const float *input, float *output, float *scale, float *bias, runtime_shape_t in_shape, int32_t axis, float epsilon) +{ + auto outer_size = 1; + auto inner_size = 1; + for (auto i = 0; i < axis; i++) + outer_size *= in_shape[i]; + for (auto i = axis; i < in_shape.size(); i++) + inner_size *= in_shape[i]; + + for (int32_t batch = 0; batch < outer_size; batch++) + { + auto src = input + batch * inner_size; + auto dest = output + batch * inner_size; + + float mean1 = 0.f; + for (size_t i = 0; i < inner_size; i++) + mean1 += src[i] / inner_size; + + std::vector sub(inner_size, 0.f); + for (size_t i = 0; i < inner_size; i++) + sub[i] = src[i] - mean1; + + std::vector pow(inner_size, 0.f); + for (size_t i = 0; i < inner_size; i++) + pow[i] = sub[i] * sub[i]; + + float mean2 = 0.f; + for (size_t i = 0; i < inner_size; i++) + mean2 += pow[i] / inner_size; + + float add = mean2 + epsilon; + float sqrt = std::sqrt(add); + + std::vector div(inner_size, 0.f); + for (size_t i = 0; i < inner_size; i++) + div[i] = sub[i] / sqrt; + + for (size_t i = 0; i < inner_size; i++) + dest[i] = div[i] * scale[i] + bias[i]; + } +} + +template +void compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) +{ + if (axis == (int)input_shape.size()) + { + for (auto i = 0; i < (int)condition_shape[0]; i++) + { + if ((float)*(condition + i) == 0) + { + continue; + } + *output++ = *(input + i); + } + } + else + { + int select_slice = 1; + for (auto i = axis + 1; i < (int)input_shape.size(); i++) + { + select_slice *= input_shape[i]; + } + for (auto j = 0; j < (int)kernels::detail::compute_size(input_shape); j++) + { + auto i = j % (select_slice * input_shape[axis]); + auto cond_index = i / select_slice; + if (select_slice != 1 && (cond_index >= condition_shape[0] || condition[cond_index] == 0)) + continue; + if (select_slice == 1 && (i % input_shape[axis] >= 
condition_shape[0] || condition[cond_index % input_shape[axis] % condition_shape[0]] == 0)) + continue; + *output++ = *(input + j); + } + } +} } diff --git a/include/nncase/kernels/tensor_compute.h b/include/nncase/kernels/tensor_compute.h index 0b139f0ebb..6f9c6c8a3d 100644 --- a/include/nncase/kernels/tensor_compute.h +++ b/include/nncase/kernels/tensor_compute.h @@ -152,4 +152,31 @@ NNCASE_API result topk(const T *input, T *output_values, int64_t *output_i template NNCASE_API result trilu(const T *input, T *output, const runtime_shape_t &in_shape, const bool upper, const int64_t k) noexcept; +template +NNCASE_API result gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept; + +template +NNCASE_API result tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept; + +NNCASE_API result space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, + kernel_context &context = default_kernel_context()) noexcept; + +template +NNCASE_API result gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const 
int axis) noexcept; + +template +NNCASE_API result instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept; + +template +NNCASE_API result layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept; + +template +NNCASE_API result compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept; + END_NS_NNCASE_KERNELS diff --git a/include/nncase/runtime/datatypes.h b/include/nncase/runtime/datatypes.h index c42013e068..d894cb4100 100644 --- a/include/nncase/runtime/datatypes.h +++ b/include/nncase/runtime/datatypes.h @@ -258,7 +258,8 @@ typedef enum _unary_op unary_square, unary_tanh, unary_bitwise_not, - unary_logical_not + unary_logical_not, + unary_erf } unary_op_t; inline std::string unary_op_to_string(unary_op_t op) @@ -301,6 +302,8 @@ inline std::string unary_op_to_string(unary_op_t op) return "unary_bitwise_not"; case unary_logical_not: return "unary_logical_not"; + case unary_erf: + return "unary_erf"; } return "unknown"; } diff --git a/include/nncase/runtime/nnil.h b/include/nncase/runtime/nnil.h index b4b0475f1e..5f690ca378 100644 --- a/include/nncase/runtime/nnil.h +++ b/include/nncase/runtime/nnil.h @@ -54,6 +54,7 @@ typedef enum _nnil_opcode nnil_min = 0x44, nnil_max = 0x45, nnil_pow = 0x46, + nnil_erf = 0x47, nnil_clamp = 0x80, nnil_ret = 0xA0 } nnil_opcode_t; diff --git a/include/nncase/runtime/runtime_op_utility.h b/include/nncase/runtime/runtime_op_utility.h index 0399096d11..a284e7df9c 100644 --- a/include/nncase/runtime/runtime_op_utility.h +++ b/include/nncase/runtime/runtime_op_utility.h @@ -71,7 +71,7 @@ inline void adapt_strides(const shape_type &shape, strides_type &strides, template inline std::size_t compute_strides(const shape_type &shape, - strides_type &strides, bs_ptr bs) + strides_type &strides, NNCASE_UNUSED 
bs_ptr bs) { using strides_value_type = typename std::decay_t::value_type; strides_value_type data_size = 1; @@ -79,7 +79,7 @@ inline std::size_t compute_strides(const shape_type &shape, { strides[i - 1] = data_size; data_size = strides[i - 1] * static_cast(shape[i - 1]); - adapt_strides(shape, strides, bs, i - 1); + // adapt_strides(shape, strides, bs, i - 1); } return static_cast(data_size); } @@ -283,7 +283,7 @@ inline bool is_optimized_binary_op(binary_op_t op) inline bool is_optimized_unary_op(unary_op_t op) { - return op == unary_abs || op == unary_ceil || op == unary_cos || op == unary_exp || op == unary_floor || op == unary_log || op == unary_neg || op == unary_round || op == unary_rsqrt || op == unary_sign || op == unary_sin || op == unary_sqrt || op == unary_square || op == unary_tanh; + return op == unary_abs || op == unary_ceil || op == unary_cos || op == unary_exp || op == unary_floor || op == unary_log || op == unary_neg || op == unary_round || op == unary_sign || op == unary_sin || op == unary_sqrt || op == unary_square || op == unary_tanh; } template @@ -300,6 +300,16 @@ bool is_optimized_input_shape(TShape in_shape, TShape out_shape) return false; } +inline void get_gather_index(const std::vector &per_axis_size, std::vector &index, size_t i, int axis, int idx) +{ + if (idx != (int)per_axis_size.size()) + { + int new_idx = i / per_axis_size[idx]; + index.push_back(new_idx); + get_gather_index(per_axis_size, index, i - new_idx * per_axis_size[idx], axis, idx + 1); + } +} + struct DefaultCallable { }; diff --git a/include/nncase/runtime/runtime_tensor.h b/include/nncase/runtime/runtime_tensor.h index 7ef31f200e..1c7b8955ef 100644 --- a/include/nncase/runtime/runtime_tensor.h +++ b/include/nncase/runtime/runtime_tensor.h @@ -134,12 +134,12 @@ class NNCASE_API mapped_buffer typedef std::function data_deleter_t; NNCASE_API runtime_tensor_type &tensor_type() noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool 
= pool_cpu_only, uintptr_t physical_address = 0) noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, gsl::span data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, gsl::span data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span data, bool copy, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept; -NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span data, data_deleter_t data_deleter, memory_pool_t pool = pool_cpu_only, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, gsl::span data, bool copy, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, gsl::span data, data_deleter_t data_deleter, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, gsl::span data, bool copy, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; +NNCASE_API result create(datatype_t datatype, runtime_shape_t shape, runtime_shape_t strides, 
gsl::span data, data_deleter_t data_deleter, memory_pool_t pool = pool_shared, uintptr_t physical_address = 0) noexcept; NNCASE_API result memory_pool(const runtime_tensor &tensor) noexcept; NNCASE_API result map(runtime_tensor &tensor, map_access_t access) noexcept; NNCASE_API result sync(runtime_tensor &tensor, sync_op_t op, bool force = false) noexcept; diff --git a/include/nncase/runtime/small_vector.hpp b/include/nncase/runtime/small_vector.hpp index 895a489dab..ece4eb7510 100644 --- a/include/nncase/runtime/small_vector.hpp +++ b/include/nncase/runtime/small_vector.hpp @@ -668,7 +668,6 @@ struct small_vector : Alloc iterator insert(const_iterator position, InputIterator first, InputIterator last) { auto pos = grow_at(position, last - first); - size_type i = 0; auto np = pos; for (auto p = first; p != last; ++p, ++np) { diff --git a/include/nncase/runtime/stackvm/op_profile.h b/include/nncase/runtime/stackvm/op_profile.h index 4b0df8a892..b70c82ed83 100644 --- a/include/nncase/runtime/stackvm/op_profile.h +++ b/include/nncase/runtime/stackvm/op_profile.h @@ -16,18 +16,44 @@ #include #include +#if defined(__riscv) + +#define RISCVFREQUENCY 1600000000 + +static uint64_t k230_get_cycles() +{ + uint64_t x; + __asm volatile("rdcycle %0;" + : "=r"(x)::); + return x; +} +#endif + class op_profile { public: op_profile(const std::string &op_type = "op_profile") : op_type_(op_type) { +#if defined(__riscv) + begin_ = k230_get_cycles(); +#else begin_ = clock(); +#endif } + ~op_profile() { +#if defined(__riscv) + + end_ = k230_get_cycles(); + auto cast_time = end_ - begin_; + // std::cout << "cpu op:" << op_type_ << " cast time:" << cast_time << " begin time:" << begin_ << " end time:" << end_ << " " << std::endl; +#else end_ = clock(); auto cast_time = (end_ - begin_) / (double)1000; + // std::cout << "cpu op:" << op_type_ << " cast time:" << cast_time << " begin time:" << begin_ << " end time:" << end_ << " " < op_timing_; diff --git 
a/include/nncase/runtime/stackvm/op_reader.h b/include/nncase/runtime/stackvm/op_reader.h index 549d16c99e..5246ab4ea7 100644 --- a/include/nncase/runtime/stackvm/op_reader.h +++ b/include/nncase/runtime/stackvm/op_reader.h @@ -1,4 +1,4 @@ -/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00. +/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00. * * Copyright 2019-2021 Canaan Inc. * @@ -1614,6 +1614,24 @@ struct op_reader } }; +template <> +struct op_reader +{ + tensor_space_to_batch_op_t operator()(span_reader &reader) const + { + tensor_space_to_batch_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.datatype = static_cast(reader.read_unaligned()); + op.rshape_src = reader.read_unaligned(); + op.rstride_src = reader.read_unaligned(); + op.rstride_dest = reader.read_unaligned(); + op.rshape_block = reader.read_unaligned(); + op.rpad_crops = reader.read_unaligned(); + return op; + } +}; + template <> struct op_reader { @@ -1707,6 +1725,109 @@ struct op_reader } }; +template <> +struct op_reader +{ + tensor_gru_op_t operator()(span_reader &reader) const + { + tensor_gru_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.input_shape_src = reader.read_unaligned(); + op.w_shape_src = reader.read_unaligned(); + op.direction = reader.read_unaligned(); + op.linear_before_reset = reader.read_unaligned(); + return op; + } +}; + +template <> +struct op_reader +{ + tensor_tflite_detection_postprocess_op_t operator()(span_reader &reader) const + { + tensor_tflite_detection_postprocess_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.box_shape_src = reader.read_unaligned(); + op.score_shape_src = reader.read_unaligned(); + op.anchor_shape_src = reader.read_unaligned(); + 
op.max_detections = reader.read_unaligned(); + op.max_classes_per_detection = reader.read_unaligned(); + op.detections_per_class = reader.read_unaligned(); + op.use_regular_non_max_suppression = reader.read_unaligned(); + op.nms_score_threshold = reader.read_unaligned(); + op.nms_iou_threshold = reader.read_unaligned(); + op.num_classes = reader.read_unaligned(); + op.y_scale = reader.read_unaligned(); + op.x_scale = reader.read_unaligned(); + op.h_scale = reader.read_unaligned(); + op.w_scale = reader.read_unaligned(); + return op; + } +}; + +template <> +struct op_reader +{ + tensor_layer_normalization_op_t operator()(span_reader &reader) const + { + tensor_layer_normalization_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.datatype = static_cast(reader.read_unaligned()); + op.input_shape = reader.read_unaligned(); + op.axis = reader.read_unaligned(); + op.epsilon = reader.read_unaligned(); + return op; + } +}; + +template <> +struct op_reader +{ + tensor_compress_op_t operator()(span_reader &reader) const + { + tensor_compress_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.input_shape_src = reader.read_unaligned(); + op.condition_shape_src = reader.read_unaligned(); + op.axis = reader.read_unaligned(); + return op; + } +}; + +template <> +struct op_reader +{ + tensor_gather_elements_op_t operator()(span_reader &reader) const + { + tensor_gather_elements_op_t op(default_init); + op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.input_shape_src = reader.read_unaligned(); + op.indices_shape_src = reader.read_unaligned(); + op.axis = reader.read_unaligned(); + return op; + } +}; + +template <> +struct op_reader +{ + tensor_instance_normalization_op_t operator()(span_reader &reader) const + { + tensor_instance_normalization_op_t op(default_init); + 
op.opcode = static_cast(reader.read_unaligned()); + op.funct = static_cast(reader.read_unaligned()); + op.datatype = static_cast(reader.read_unaligned()); + op.input_shape = reader.read_unaligned(); + op.epsilon = reader.read_unaligned(); + return op; + } +}; + class NNCASE_API op_visitor { public: @@ -1842,11 +1963,18 @@ class NNCASE_API op_visitor virtual result visit(NNCASE_UNUSED const tensor_sigmoid_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_slice_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_softmax_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_space_to_batch_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_ternary_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_topk_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_trilu_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_unary_op_t &op) noexcept { return ok(); } virtual result visit(NNCASE_UNUSED const tensor_transpose_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_gru_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_tflite_detection_postprocess_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_layer_normalization_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_compress_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_gather_elements_op_t &op) noexcept { return ok(); } + virtual result visit(NNCASE_UNUSED const tensor_instance_normalization_op_t &op) noexcept { return ok(); } protected: bool interrupted_; diff --git a/include/nncase/runtime/stackvm/opcode.h b/include/nncase/runtime/stackvm/opcode.h index 799cd62582..3263a7f0af 100644 --- 
a/include/nncase/runtime/stackvm/opcode.h +++ b/include/nncase/runtime/stackvm/opcode.h @@ -1,4 +1,4 @@ -/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00. +/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00. * * Copyright 2019-2021 Canaan Inc. * @@ -161,6 +161,12 @@ enum class tensor_function_t TRANSPOSE = 0x0024, TRILU = 0x0025, UNARY = 0x0026, + GRU = 0x0027, + TFLITE_DETECTION_POSTPROCESS = 0x0028, + LAYER_NORMALIZATION = 0x0029, + COMPRESS = 0x002A, + GATHER_ELEMENTS = 0x002B, + INSTANCE_NORMALIZATION = 0x002C, }; // Instructions @@ -1754,6 +1760,24 @@ struct tensor_softmax_op_t } }; +struct tensor_space_to_batch_op_t +{ + opcode_t opcode; + tensor_function_t funct; + datatype_t datatype; + uint8_t rshape_src; + uint8_t rstride_src; + uint8_t rstride_dest; + uint8_t rshape_block; + uint8_t rpad_crops; + + tensor_space_to_batch_op_t(default_init_t) noexcept { } + explicit tensor_space_to_batch_op_t(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::SPACE_TO_BATCH), datatype(datatype), rshape_src(rshape_src), rstride_src(rstride_src), rstride_dest(rstride_dest), rshape_block(rshape_block), rpad_crops(rpad_crops) + { + } +}; + struct tensor_ternary_op_t { opcode_t opcode; @@ -1847,4 +1871,107 @@ struct tensor_transpose_op_t } }; +struct tensor_gru_op_t +{ + opcode_t opcode; + tensor_function_t funct; + uint8_t input_shape_src; + uint8_t w_shape_src; + uint8_t direction; + bool linear_before_reset; + + tensor_gru_op_t(default_init_t) noexcept { } + explicit tensor_gru_op_t(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::GRU), input_shape_src(input_shape_src), w_shape_src(w_shape_src), direction(direction), linear_before_reset(linear_before_reset) + { + } 
+}; + +struct tensor_tflite_detection_postprocess_op_t +{ + opcode_t opcode; + tensor_function_t funct; + uint8_t box_shape_src; + uint8_t score_shape_src; + uint8_t anchor_shape_src; + int32_t max_detections; + int32_t max_classes_per_detection; + int32_t detections_per_class; + bool use_regular_non_max_suppression; + float nms_score_threshold; + float nms_iou_threshold; + int32_t num_classes; + float y_scale; + float x_scale; + float h_scale; + float w_scale; + + tensor_tflite_detection_postprocess_op_t(default_init_t) noexcept { } + explicit tensor_tflite_detection_postprocess_op_t(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::TFLITE_DETECTION_POSTPROCESS), box_shape_src(box_shape_src), score_shape_src(score_shape_src), anchor_shape_src(anchor_shape_src), max_detections(max_detections), max_classes_per_detection(max_classes_per_detection), detections_per_class(detections_per_class), use_regular_non_max_suppression(use_regular_non_max_suppression), nms_score_threshold(nms_score_threshold), nms_iou_threshold(nms_iou_threshold), num_classes(num_classes), y_scale(y_scale), x_scale(x_scale), h_scale(h_scale), w_scale(w_scale) + { + } +}; + +struct tensor_layer_normalization_op_t +{ + opcode_t opcode; + tensor_function_t funct; + datatype_t datatype; + uint8_t input_shape; + int32_t axis; + float epsilon; + + tensor_layer_normalization_op_t(default_init_t) noexcept { } + explicit tensor_layer_normalization_op_t(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::LAYER_NORMALIZATION), datatype(datatype), input_shape(input_shape), axis(axis), 
epsilon(epsilon) + { + } +}; + +struct tensor_compress_op_t +{ + opcode_t opcode; + tensor_function_t funct; + uint8_t input_shape_src; + uint8_t condition_shape_src; + float axis; + + tensor_compress_op_t(default_init_t) noexcept { } + explicit tensor_compress_op_t(uint8_t input_shape_src, uint8_t condition_shape_src, float axis) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::COMPRESS), input_shape_src(input_shape_src), condition_shape_src(condition_shape_src), axis(axis) + { + } +}; + +struct tensor_gather_elements_op_t +{ + opcode_t opcode; + tensor_function_t funct; + uint8_t input_shape_src; + uint8_t indices_shape_src; + int32_t axis; + + tensor_gather_elements_op_t(default_init_t) noexcept { } + explicit tensor_gather_elements_op_t(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::GATHER_ELEMENTS), input_shape_src(input_shape_src), indices_shape_src(indices_shape_src), axis(axis) + { + } +}; + +struct tensor_instance_normalization_op_t +{ + opcode_t opcode; + tensor_function_t funct; + datatype_t datatype; + uint8_t input_shape; + float epsilon; + + tensor_instance_normalization_op_t(default_init_t) noexcept { } + explicit tensor_instance_normalization_op_t(datatype_t datatype, uint8_t input_shape, float epsilon) noexcept + : opcode(opcode_t::TENSOR), funct(tensor_function_t::INSTANCE_NORMALIZATION), datatype(datatype), input_shape(input_shape), epsilon(epsilon) + { + } +}; + END_NS_NNCASE_RT_MODULE diff --git a/include/nncase/transforms/neutral/fix_output_shape.h b/include/nncase/transforms/neutral/fix_output_shape.h new file mode 100644 index 0000000000..f44e304804 --- /dev/null +++ b/include/nncase/transforms/neutral/fix_output_shape.h @@ -0,0 +1,28 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "../transform.h" + +namespace nncase::ir::transforms +{ +class NNCASE_API tflite_detection_postprocess_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool on_try_match(ir::node &node, transform_context &context) override; +}; +} diff --git a/include/nncase/transforms/neutral/fold_instancenorm.h b/include/nncase/transforms/neutral/fold_instancenorm.h new file mode 100644 index 0000000000..f01ee2541c --- /dev/null +++ b/include/nncase/transforms/neutral/fold_instancenorm.h @@ -0,0 +1,29 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../transform.h" + +namespace nncase::ir::transforms +{ +class NNCASE_API fold_instancenorm_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool skip_self_contained_check() const noexcept override { return true; } + bool on_try_match(ir::node &node, transform_context &context) override; +}; +} diff --git a/include/nncase/transforms/neutral/fold_layernorm.h b/include/nncase/transforms/neutral/fold_layernorm.h new file mode 100644 index 0000000000..37022a1886 --- /dev/null +++ b/include/nncase/transforms/neutral/fold_layernorm.h @@ -0,0 +1,52 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../transform.h" + +namespace nncase::ir::transforms +{ +// youdao nmt +class NNCASE_API fold_layernorm_pattern1_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool skip_self_contained_check() const noexcept override { return true; } + bool on_try_match(ir::node &node, transform_context &context) override; +}; + +// daniu +class NNCASE_API fold_layernorm_pattern2_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool skip_self_contained_check() const noexcept override { return true; } + bool on_try_match(ir::node &node, transform_context &context) override; +}; + +// fastspeech +class NNCASE_API fold_layernorm_pattern3_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool skip_self_contained_check() const noexcept override { return true; } + bool on_try_match(ir::node &node, transform_context &context) override; +}; +} diff --git a/include/nncase/transforms/neutral/optimize_allocation.h b/include/nncase/transforms/neutral/optimize_allocation.h index 43d064e4c6..c01b4a1f8b 100644 --- a/include/nncase/transforms/neutral/optimize_allocation.h +++ b/include/nncase/transforms/neutral/optimize_allocation.h @@ -71,6 +71,15 @@ class NNCASE_API add_copy_to_output_pass : public graph_pass void run_core(graph &graph, nncase::target &target, const run_pass_options &options) override; }; +class NNCASE_API add_copy_to_bitcast_pass : public graph_pass +{ +public: + using graph_pass::graph_pass; + +protected: + void run_core(graph &graph, nncase::target &target, const run_pass_options &options) override; +}; + class NNCASE_API remove_exclusive_copy_to_output_transform : public transform { public: @@ -89,6 +98,15 @@ class NNCASE_API remove_exclusive_copy_to_concat_transform : public transform bool on_try_match(ir::node &node, transform_context &context) override; }; +class 
NNCASE_API remove_exclusive_copy_to_bitcast_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool on_try_match(ir::node &node, transform_context &context) override; +}; + class NNCASE_API remove_simple_copy_from_slice_transform : public transform { public: diff --git a/include/nncase/transforms/neutral/squeeze_dims.h b/include/nncase/transforms/neutral/squeeze_dims.h new file mode 100644 index 0000000000..26bffd6eb2 --- /dev/null +++ b/include/nncase/transforms/neutral/squeeze_dims.h @@ -0,0 +1,29 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include "../transform.h" + +namespace nncase::ir::transforms +{ +class NNCASE_API squeeze_dims_transform : public transform +{ +public: + void process(transform_context &context) override; + +protected: + bool skip_self_contained_check() const noexcept override { return true; } + bool on_try_match(ir::node &node, transform_context &context) override; +}; +} diff --git a/modules/vulkan/src/codegen/templates/template.cpp b/modules/vulkan/src/codegen/templates/template.cpp index 49e3394051..535e057be5 100644 --- a/modules/vulkan/src/codegen/templates/template.cpp +++ b/modules/vulkan/src/codegen/templates/template.cpp @@ -82,7 +82,7 @@ class xz_reader { public: xz_reader() - : archive_(ZipArchive::fromBuffer(xz_res_.data.data(), + : archive_(ZipArchive::fromBuffer((void *)xz_res_.data.data(), (libzippp_uint32)xz_res_.data.size(), ZipArchive::ReadOnly, true)) { if (!archive_) diff --git a/pyproject.toml b/pyproject.toml index b83548a864..ea3670e25f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,35 @@ [project] -requires-python = ">=3.6" +name = "nncase" +dynamic = ["version"] +requires-python = ">=3.7" +authors = [{ name = "sunnycase" }, { email = "sunnycase@live.cn" }] +maintainers = [{ name = "sunnycase" }, { email = "sunnycase@live.cn" }] +readme = "README.md" +description = "A neural network compiler for AI accelerators" +license = { file = "LICENSE" } +classifiers = [ + "Programming Language :: C++", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +keywords = ["kendryte", "nn", "compiler", "k210", "k510"] +dependencies = ["numpy"] + +[project.urls] +homepage = "https://github.com/kendryte/nncase" [build-system] -requires = ["setuptools>=42", "wheel", "conan", "ninja"] 
+requires = ["setuptools>=42", "wheel", "conan<=1.59", "ninja"] [tool.cibuildwheel] -build = "cp3*" +build = ["cp37*", "cp38*", "cp39*", "cp310*"] skip = "*musllinux*" -manylinux-x86_64-image = "sunnycase/manylinux_2_24_x86_64:version1.0" +manylinux-x86_64-image = "sunnycase/manylinux_2_24_x86_64:version1.1" test-requires = "pytest" test-command = [ "pytest {project}/tests/other" @@ -22,7 +44,7 @@ archs = ["AMD64"] [tool.cibuildwheel.linux] archs = ["x86_64"] before-all = [ - "pip install conan", + "pip install conan==1.59", "conan profile new default --detect", "conan profile update settings.compiler.libcxx=libstdc++11 default", "wget https://sdk.lunarg.com/sdk/download/1.2.182.0/linux/vulkansdk-linux-x86_64-1.2.182.0.tar.gz -O vulkansdk.tar.gz", diff --git a/requirements.test.txt b/requirements.test.txt new file mode 100644 index 0000000000..e8c789b747 --- /dev/null +++ b/requirements.test.txt @@ -0,0 +1,17 @@ +tensorflow==2.10.0 +matplotlib +pillow +opencv-python==4.5.1.48 +onnx==1.12.0 +onnx-simplifier==0.3.6 +onnxoptimizer==0.2.6 +onnxruntime==1.12.0 +numpy==1.21.0 +torch==1.9.0 +torchvision==0.10.0 +imageio==2.15.0 +protobuf==3.12.2 +kendryte-caffe +pytest +pytest-xdist +pyyaml diff --git a/setup.py b/setup.py index 6980af205d..a69d9cbf3f 100644 --- a/setup.py +++ b/setup.py @@ -252,33 +252,11 @@ def find_version(): raise RuntimeError("Unable to find version string.") -requirements = ["numpy"] - setup(name='nncase', version=find_version(), - author="sunnycase", - author_email="sunnycase@live.cn", - maintainer="sunnycase", packages=['nncase'], package_dir={'': 'python'}, - python_requires=">=3.6", - install_requires=requirements, ext_modules=[CMakeExtension(name="_nncase", sourcedir='.')], - description="A neural network compiler for AI accelerators", - url='https://github.com/kendryte/nncase', - long_description=open("README.md", 'r', encoding='utf8').read(), - long_description_content_type="text/markdown", - keywords="kendryte, nn, compiler, k210, k510", - 
classifiers=[ - "Programming Language :: C++", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", ], - license='Apache-2.0', cmdclass={ 'build_ext': BuildCMakeExt, 'install_data': InstallCMakeLibsData, diff --git a/src/codegen/module_builder.cpp b/src/codegen/module_builder.cpp index 12fc78ad39..fd6ead621c 100644 --- a/src/codegen/module_builder.cpp +++ b/src/codegen/module_builder.cpp @@ -189,6 +189,11 @@ function_call_id module_builder::function_id(ir::graph *graph) throw std::invalid_argument("Can't find graph " + graph->name() + " in modules"); } +std::streampos module_builder::get_current_entry_point() +{ + return entry_points_.at(current_function_); +} + void module_builder::set_current_entry_point(std::streampos pos) { entry_points_[current_function_] = pos; diff --git a/src/codegen/stackvm/CMakeLists.txt b/src/codegen/stackvm/CMakeLists.txt index 8e9769e8a3..f99e2483f6 100644 --- a/src/codegen/stackvm/CMakeLists.txt +++ b/src/codegen/stackvm/CMakeLists.txt @@ -1,41 +1,48 @@ -cmake_minimum_required (VERSION 3.8) +cmake_minimum_required(VERSION 3.8) set(SRCS module_builder.cpp - op_writer.cpp - ops/batch_to_space.cpp - ops/binary.cpp - ops/broadcast.cpp - ops/call.cpp - ops/compare.cpp - ops/conv2d.cpp - ops/convert.cpp - ops/copy.cpp - ops/cumsum.cpp - ops/dequantize.cpp - ops/gather.cpp - ops/gather_nd.cpp - ops/hardmax.cpp - ops/matmul.cpp - ops/onehot.cpp - ops/pad.cpp - ops/quantize.cpp - ops/random_normal.cpp - ops/random_uniform.cpp - ops/reduce.cpp - ops/reduce_arg.cpp - ops/reduce_prod.cpp - ops/reduce_window2d.cpp - ops/resize_image.cpp - ops/roi_align.cpp - ops/slice.cpp - ops/sigmoid.cpp - ops/softmax.cpp - ops/table_lookup1d.cpp - ops/ternary.cpp - ops/topk.cpp - ops/transpose.cpp - 
ops/trilu.cpp - ops/unary.cpp) + op_writer.cpp + ops/batch_to_space.cpp + ops/binary.cpp + ops/broadcast.cpp + ops/call.cpp + ops/compare.cpp + ops/compress.cpp + ops/conv2d.cpp + ops/convert.cpp + ops/copy.cpp + ops/cumsum.cpp + ops/dequantize.cpp + ops/gather.cpp + ops/gather_elements.cpp + ops/gather_nd.cpp + ops/gru.cpp + ops/hardmax.cpp + ops/matmul.cpp + ops/onehot.cpp + ops/pad.cpp + ops/quantize.cpp + ops/random_normal.cpp + ops/random_uniform.cpp + ops/reduce.cpp + ops/reduce_arg.cpp + ops/reduce_prod.cpp + ops/reduce_window2d.cpp + ops/resize_image.cpp + ops/roi_align.cpp + ops/slice.cpp + ops/sigmoid.cpp + ops/softmax.cpp + ops/space_to_batch.cpp + ops/table_lookup1d.cpp + ops/ternary.cpp + ops/topk.cpp + ops/transpose.cpp + ops/trilu.cpp + ops/tflite_detection_postprocess.cpp + ops/unary.cpp + ops/layernorm.cpp + ops/instancenorm.cpp) add_library(codegen_stackvm OBJECT ${SRCS}) target_link_libraries(codegen_stackvm PUBLIC ir schedule) diff --git a/src/codegen/stackvm/module_builder.h b/src/codegen/stackvm/module_builder.h index 758f33efe7..5bf1d712b4 100644 --- a/src/codegen/stackvm/module_builder.h +++ b/src/codegen/stackvm/module_builder.h @@ -20,14 +20,19 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include +#include +#include #include #include #include @@ -43,8 +48,10 @@ #include #include #include +#include #include #include +#include #include #include #include diff --git a/src/codegen/stackvm/op_writer.cpp b/src/codegen/stackvm/op_writer.cpp index 72cb231780..8a6a38dd9d 100644 --- a/src/codegen/stackvm/op_writer.cpp +++ b/src/codegen/stackvm/op_writer.cpp @@ -1,4 +1,4 @@ -/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00. +/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00. * * Copyright 2019-2021 Canaan Inc. 
* @@ -643,6 +643,11 @@ void op_builder::tensor_softmax_(datatype_t datatype, uint8_t rshape_src, uint8_ op_writer()(tensor_softmax_op_t(datatype, rshape_src, rstride_src, rstride_dest, axis, beta), writer_); } +void op_builder::tensor_space_to_batch_(datatype_t datatype, uint8_t rshape_src, uint8_t rstride_src, uint8_t rstride_dest, uint8_t rshape_block, uint8_t rpad_crops) +{ + op_writer()(tensor_space_to_batch_op_t(datatype, rshape_src, rstride_src, rstride_dest, rshape_block, rpad_crops), writer_); +} + void op_builder::tensor_ternary_(datatype_t datatype, uint8_t rshape_src1, uint8_t rstride_src1, uint8_t rshape_src2, uint8_t rstride_src2, uint8_t rshape_src3, uint8_t rstride_src3, uint8_t rstride_dest) { op_writer()(tensor_ternary_op_t(datatype, rshape_src1, rstride_src1, rshape_src2, rstride_src2, rshape_src3, rstride_src3, rstride_dest), writer_); @@ -667,3 +672,33 @@ void op_builder::tensor_transpose_(datatype_t datatype, uint8_t rshape_src, uint { op_writer()(tensor_transpose_op_t(datatype, rshape_src, rstride_src, rstride_dest, rshape_perm), writer_); } + +void op_builder::tensor_gru_(uint8_t input_shape_src, uint8_t w_shape_src, uint8_t direction, bool linear_before_reset) +{ + op_writer()(tensor_gru_op_t(input_shape_src, w_shape_src, direction, linear_before_reset), writer_); +} + +void op_builder::tensor_tflite_detection_postprocess_(uint8_t box_shape_src, uint8_t score_shape_src, uint8_t anchor_shape_src, int32_t max_detections, int32_t max_classes_per_detection, int32_t detections_per_class, bool use_regular_non_max_suppression, float nms_score_threshold, float nms_iou_threshold, int32_t num_classes, float y_scale, float x_scale, float h_scale, float w_scale) +{ + op_writer()(tensor_tflite_detection_postprocess_op_t(box_shape_src, score_shape_src, anchor_shape_src, max_detections, max_classes_per_detection, detections_per_class, use_regular_non_max_suppression, nms_score_threshold, nms_iou_threshold, num_classes, y_scale, x_scale, h_scale, w_scale), 
writer_); +} + +void op_builder::tensor_layer_normalization_(datatype_t datatype, uint8_t input_shape, int32_t axis, float epsilon) +{ + op_writer()(tensor_layer_normalization_op_t(datatype, input_shape, axis, epsilon), writer_); +} + +void op_builder::tensor_compress_(uint8_t input_shape_src, uint8_t condition_shape_src, float axis) +{ + op_writer()(tensor_compress_op_t(input_shape_src, condition_shape_src, axis), writer_); +} + +void op_builder::tensor_gather_elements_(uint8_t input_shape_src, uint8_t indices_shape_src, int32_t axis) +{ + op_writer()(tensor_gather_elements_op_t(input_shape_src, indices_shape_src, axis), writer_); +} + +void op_builder::tensor_instance_normalization_(datatype_t datatype, uint8_t input_shape, float epsilon) +{ + op_writer()(tensor_instance_normalization_op_t(datatype, input_shape, epsilon), writer_); +} diff --git a/src/codegen/stackvm/ops.def b/src/codegen/stackvm/ops.def index 42f14e424c..a30ed404bf 100644 --- a/src/codegen/stackvm/ops.def +++ b/src/codegen/stackvm/ops.def @@ -3,13 +3,16 @@ DEFINE_OP(binary) DEFINE_OP(broadcast) DEFINE_OP(call) DEFINE_OP(compare) +DEFINE_OP(compress) DEFINE_OP(conv2d) DEFINE_OP(convert) DEFINE_OP(copy) DEFINE_OP(cumsum) DEFINE_OP(dequantize) DEFINE_OP(gather) +DEFINE_OP(gather_elements) DEFINE_OP(gather_nd) +DEFINE_OP(gru) DEFINE_OP(hardmax) DEFINE_OP(matmul) DEFINE_OP(onehot) @@ -26,9 +29,13 @@ DEFINE_OP(roi_align) DEFINE_OP(sigmoid) DEFINE_OP(slice) DEFINE_OP(softmax) +DEFINE_OP(space_to_batch) DEFINE_OP(table_lookup1d) DEFINE_OP(ternary) DEFINE_OP(topk) DEFINE_OP(transpose) DEFINE_OP(trilu) -DEFINE_OP(unary) \ No newline at end of file +DEFINE_OP(tflite_detection_postprocess) +DEFINE_OP(unary) +DEFINE_OP(layernorm) +DEFINE_OP(instancenorm) \ No newline at end of file diff --git a/src/codegen/stackvm/ops/compress.cpp b/src/codegen/stackvm/ops/compress.cpp new file mode 100644 index 0000000000..2e27b6bf4e --- /dev/null +++ b/src/codegen/stackvm/ops/compress.cpp @@ -0,0 +1,34 @@ +/* Copyright 
2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(compress &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &condition = allocation(node.condition()); + auto &output = allocation(node.output()); + builder.lea_buffer(input); + builder.lea_buffer(condition); + builder.lea_buffer(output); + + builder.stshape(0, input.shape); + builder.stshape(1, condition.shape); + builder.tensor_compress_(0, 1, node.axis()); +} diff --git a/src/codegen/stackvm/ops/gather_elements.cpp b/src/codegen/stackvm/ops/gather_elements.cpp new file mode 100644 index 0000000000..23a9cdca54 --- /dev/null +++ b/src/codegen/stackvm/ops/gather_elements.cpp @@ -0,0 +1,35 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(gather_elements &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &indices = allocation(node.indices()); + auto &output = allocation(node.output()); + builder.lea_buffer(input); + builder.lea_buffer(indices); + builder.lea_buffer(output); + + builder.stshape(0, input.shape); + builder.stshape(1, indices.shape); + + builder.tensor_gather_elements_(0, 1, node.axis()); +} \ No newline at end of file diff --git a/src/codegen/stackvm/ops/gru.cpp b/src/codegen/stackvm/ops/gru.cpp new file mode 100644 index 0000000000..76147cc2e2 --- /dev/null +++ b/src/codegen/stackvm/ops/gru.cpp @@ -0,0 +1,43 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(gru &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &w = allocation(node.w()); + auto &r = allocation(node.r()); + auto &b = allocation(node.b()); + auto &initial_h = allocation(node.initial_h()); + auto &output = allocation(node.output()); + auto &output_h = allocation(node.output_h()); + builder.lea_buffer(input); + builder.lea_buffer(w); + builder.lea_buffer(r); + builder.lea_buffer(b); + builder.lea_buffer(initial_h); + builder.lea_buffer(output); + builder.lea_buffer(output_h); + + builder.stshape(0, input.shape); + builder.stshape(1, w.shape); + + builder.tensor_gru_(0, 1, node.direction(), node.linear_before_reset()); +} diff --git a/src/codegen/stackvm/ops/instancenorm.cpp b/src/codegen/stackvm/ops/instancenorm.cpp new file mode 100644 index 0000000000..4f2c323410 --- /dev/null +++ b/src/codegen/stackvm/ops/instancenorm.cpp @@ -0,0 +1,37 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(instancenorm &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &scale = allocation(node.scale()); + auto &bias = allocation(node.bias()); + auto &output = allocation(node.output()); + + builder.lea_buffer(input); + builder.lea_buffer(scale); + builder.lea_buffer(bias); + builder.lea_buffer(output); + + builder.stshape(0, input.shape); + + builder.tensor_instance_normalization_(node.output().type(), 0, node.epsilon()); +} diff --git a/src/codegen/stackvm/ops/layernorm.cpp b/src/codegen/stackvm/ops/layernorm.cpp new file mode 100644 index 0000000000..77042a7e65 --- /dev/null +++ b/src/codegen/stackvm/ops/layernorm.cpp @@ -0,0 +1,37 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(layernorm &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &scale = allocation(node.scale()); + auto &bias = allocation(node.bias()); + auto &output = allocation(node.output()); + + builder.lea_buffer(input); + builder.lea_buffer(scale); + builder.lea_buffer(bias); + builder.lea_buffer(output); + + builder.stshape(0, input.shape); + + builder.tensor_layer_normalization_(node.output().type(), 0, node.axis(), node.epsilon()); +} diff --git a/src/codegen/stackvm/ops/space_to_batch.cpp b/src/codegen/stackvm/ops/space_to_batch.cpp new file mode 100644 index 0000000000..fd518ee31e --- /dev/null +++ b/src/codegen/stackvm/ops/space_to_batch.cpp @@ -0,0 +1,35 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(space_to_batch &node, stackvm_op_builder &builder) +{ + auto &input = allocation(node.input()); + auto &output = allocation(node.output()); + builder.lea_buffer(input); + builder.lea_buffer(output); + + builder.stshape(0, input.shape); + builder.stshape(1, input.strides); + builder.stshape(2, output.strides); + builder.stshape(3, shape_t { (size_t)node.block_size_h(), (size_t)node.block_size_w() }); + builder.stpaddings(0, std::vector { node.padding_h(), node.padding_w() }); + builder.tensor_space_to_batch_(node.input().type(), 0, 1, 2, 3, 0); +} diff --git a/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp b/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp new file mode 100644 index 0000000000..5da2d2afb9 --- /dev/null +++ b/src/codegen/stackvm/ops/tflite_detection_postprocess.cpp @@ -0,0 +1,47 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../module_builder.h" + +using namespace nncase; +using namespace nncase::codegen; +using namespace nncase::codegen::stackvm; +using namespace nncase::ir; + +void stackvm_module_builder::emit(tflite_detection_postprocess &node, stackvm_op_builder &builder) +{ + auto &box = allocation(node.boxes()); + auto &score = allocation(node.scores()); + auto &anchor = allocation(node.anchors()); + auto &output_locations = allocation(node.output_locations()); + auto &output_classes = allocation(node.output_classes()); + auto &output_scores = allocation(node.output_scores()); + auto &output_num_detections = allocation(node.output_num_detections()); + + builder.lea_buffer(box); + builder.lea_buffer(score); + builder.lea_buffer(anchor); + builder.lea_buffer(output_locations); + builder.lea_buffer(output_classes); + builder.lea_buffer(output_scores); + builder.lea_buffer(output_num_detections); + + builder.stshape(0, box.shape); + builder.stshape(1, score.shape); + builder.stshape(2, anchor.shape); + + builder.tensor_tflite_detection_postprocess_(0, 1, 2, node.max_detections(), node.max_classes_per_detection(), node.detections_per_class(), + node.use_regular_non_max_suppression(), node.nms_score_threshold(), node.nms_iou_threshold(), + node.num_classes(), node.y_scale(), node.x_scale(), node.h_scale(), node.w_scale()); +} diff --git a/src/evaluator/ops/neutral/neutral_ops.cpp b/src/evaluator/ops/neutral/neutral_ops.cpp index 16cb8a7090..6a42b9d585 100644 --- a/src/evaluator/ops/neutral/neutral_ops.cpp +++ b/src/evaluator/ops/neutral/neutral_ops.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,8 +30,12 @@ #include #include #include +#include #include +#include #include +#include +#include #include #include #include @@ -46,8 +51,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -108,8 +115,7 @@ void register_neutral_evaluators() runtime_shape_t { 
(size_t)rnode.block_size_h(), (size_t)rnode.block_size_w() }, runtime_paddings_t { padding { rnode.crop_h()[0], rnode.crop_h()[1] }, padding { rnode.crop_w()[0], rnode.crop_w()[1] } }, input.strides(), output.strides()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_binary, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -141,8 +147,7 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for binary: " + std::string(datatype_names(input_type)); - } - }); + } }); register_evaluator(op_broadcast, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -151,8 +156,7 @@ void register_neutral_evaluators() auto output = context.memory_at(rnode.output()); kernels::broadcast(input.datatype(), input.buffer().data(), output.buffer().data(), input.shape(), input.strides(), output.shape(), output.strides()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_concat, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -170,8 +174,7 @@ void register_neutral_evaluators() runtime_shape_t concat_dims { rnode.concat_dims().begin(), rnode.concat_dims().end() }; kernels::concat(rnode.output().type(), inputs_mem, output.buffer().data(), output.shape(), inputs_strides, output.strides(), rnode.axis(), concat_dims) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_conv2d, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -190,8 +193,7 @@ void register_neutral_evaluators() kernels::conv2d(input_mem.data(), weights_mem.data(), bias_mem.data(), output_mem.data(), input.shape(), input.strides(), weights.shape(), weights.strides(), bias.strides(), output.strides(), rnode.padding_h(), rnode.padding_w(), rnode.groups(), rnode.stride_h(), rnode.stride_w(), rnode.dilation_h(), rnode.dilation_w(), rnode.fused_activation()) - 
.unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_conv2d_transpose, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -204,8 +206,7 @@ void register_neutral_evaluators() neutral::conv2d_transpose(input.data(), output.data(), weights.data(), bias.data(), to(rnode.input().shape()), rnode.groups(), to(rnode.output().shape()), rnode.filter_h(), rnode.filter_w(), rnode.stride_h(), rnode.stride_w(), - rnode.dilation_h(), rnode.dilation_w(), rnode.padding_h(), rnode.padding_w(), rnode.fused_activation()); - }); + rnode.dilation_h(), rnode.dilation_w(), rnode.padding_h(), rnode.padding_w(), rnode.fused_activation()); }); register_evaluator(op_dequantize, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -229,8 +230,7 @@ void register_neutral_evaluators() assert(false && "not supported type!"); #undef DEQUANTIZE - } - }); + } }); register_evaluator(op_compare, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -268,8 +268,7 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for compare: " + std::string(datatype_names(input_type)); - } - }); + } }); register_evaluator(op_fused_unary, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -286,8 +285,7 @@ void register_neutral_evaluators() auto buf = ss.str(); std::vector body(reinterpret_cast(buf.data()), reinterpret_cast(buf.data() + buf.size())); kernels::nnil_unary_method(input.data(), output.data(), input.size(), body) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_matmul, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -305,8 +303,7 @@ void register_neutral_evaluators() kernels::matmul(input_a_mem.data(), input_b_mem.data(), bias_mem.data(), output_mem.data(), input_a.shape(), input_a.strides(), input_b.shape(), 
input_b.strides(), output.shape(), output.strides(), rnode.fused_activation()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_pad, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -318,8 +315,7 @@ void register_neutral_evaluators() kernels::pad(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), input.strides(), output.strides(), to(rnode.paddings()), rnode.pad_mode(), rnode.pad_value()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_quantize, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -339,8 +335,7 @@ void register_neutral_evaluators() default: assert(false && "not supported type!"); #undef QUANTIZE - } - }); + } }); register_evaluator(op_reduce, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -360,10 +355,14 @@ void register_neutral_evaluators() output.buffer().as_span().data(), input.shape(), to(rnode.axis()), input.strides(), output.strides(), rnode.keep_dims()) .unwrap_or_throw(); break; + case dt_int64: + kernels::reduce(rnode.reduce_op(), static_cast(rnode.init_value()), input.buffer().as_span().data(), + output.buffer().as_span().data(), input.shape(), to(rnode.axis()), input.strides(), output.strides(), rnode.keep_dims()) + .unwrap_or_throw(); + break; default: std::cerr << "unsupported dtype for reduce: " + std::string(datatype_names(input_type)); - } - }); + } }); register_evaluator(op_reduce_arg, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -389,8 +388,7 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for reduce_arg: " + std::string(datatype_names(output_type)); - } - }); + } }); register_evaluator(op_reduce_prod, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -412,8 +410,7 @@ void register_neutral_evaluators() 
break; default: std::cerr << "unsupported dtype for reduce_prod: " + std::string(datatype_names(input_type)); - } - }); + } }); register_evaluator(op_reduce_window2d, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -427,8 +424,7 @@ void register_neutral_evaluators() kernels::reduce_window2d(rnode.reduce_op(), input_mem.data(), rnode.init_value(), output_mem.data(), input.shape(), input.strides(), output.strides(), rnode.padding_h(), rnode.padding_w(), rnode.filter_h(), rnode.filter_w(), rnode.stride_h(), rnode.stride_w(), rnode.dilation_h(), rnode.dilation_w(), rnode.fused_activation()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_bitcast, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -436,8 +432,7 @@ void register_neutral_evaluators() auto input = context.memory_at(rnode.input()).buffer(); auto output = context.memory_at(rnode.output()).buffer(); - std::copy(input.begin(), input.end(), output.begin()); - }); + std::copy(input.begin(), input.end(), output.begin()); }); register_evaluator(op_resize_image, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -460,8 +455,7 @@ void register_neutral_evaluators() input.shape(), input.strides(), output.strides(), new_size[0], new_size[1], rnode.align_corners(), rnode.half_pixel_centers()) .unwrap_or_throw(); - } - }); + } }); register_evaluator(op_roi_align, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -482,8 +476,7 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for roi_align: " + std::string(datatype_names(input_type)); - } - }); + } }); register_evaluator(op_sigmoid, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -501,8 +494,7 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for sigmoid: " + 
std::string(datatype_names(output_type)); - } - }); + } }); register_evaluator(op_slice, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -514,8 +506,7 @@ void register_neutral_evaluators() kernels::slice(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), input.strides(), output.strides(), to(rnode.begin()), to(rnode.end()), to(rnode.strides())) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_softmax, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -533,8 +524,19 @@ void register_neutral_evaluators() break; default: std::cerr << "unsupported dtype for softmax: " + std::string(datatype_names(output_type)); - } - }); + } }); + + register_evaluator(op_space_to_batch, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + + auto input = context.memory_at(rnode.input()); + auto output = context.memory_at(rnode.output()); + + kernels::space_to_batch(input.datatype(), input.buffer().data(), output.buffer().data(), input.shape(), + runtime_shape_t { (size_t)rnode.block_size_h(), (size_t)rnode.block_size_w() }, + runtime_paddings_t { rnode.padding_h(), rnode.padding_w() }, + input.strides(), output.strides()) + .unwrap_or_throw(); }); register_evaluator(op_ternary, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -553,10 +555,15 @@ void register_neutral_evaluators() input_b.shape(), input_b.strides(), input_c.shape(), input_c.strides(), output.strides()) .unwrap_or_throw(); break; + case dt_int64: + kernels::ternary(input_a.buffer().as_span().data(), input_b.buffer().as_span().data(), + input_c.buffer().as_span().data(), output.buffer().as_span().data(), input_a.shape(), input_a.strides(), + input_b.shape(), input_b.strides(), input_c.shape(), input_c.strides(), output.strides()) + .unwrap_or_throw(); + break; default: std::cerr << "unsupported dtype for 
ternary: " + std::string(datatype_names(output_type)); - } - }); + } }); register_evaluator(op_transpose, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -568,8 +575,7 @@ void register_neutral_evaluators() kernels::transpose(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), to(rnode.perm()), input.strides(), output.strides()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_unary, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -635,10 +641,12 @@ void register_neutral_evaluators() case unary_tanh: unary([](auto a) { return tanh(a); }); break; + case unary_erf: + unary([](auto a) { return erf(a); }); + break; default: throw std::runtime_error("Not supported unary"); - } - }); + } }); register_evaluator(op_table_lookup1d, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -648,8 +656,7 @@ void register_neutral_evaluators() auto table = context.memory_at(rnode.table()).buffer().as_span(); auto output = context.memory_at(rnode.output()).buffer().as_span(); - kernels::neutral::table_lookup1d(input.data(), output.data(), input.size(), table.data()); - }); + kernels::neutral::table_lookup1d(input.data(), output.data(), input.size(), table.data()); }); register_evaluator(op_clamp, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -667,8 +674,7 @@ void register_neutral_evaluators() for (size_t i = 0; i < input.size(); i++) { output_ptr[i] = std::clamp(input_ptr[i], low, high); - } - }); + } }); register_evaluator(op_convert, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -680,8 +686,7 @@ void register_neutral_evaluators() kernels::convert(input.datatype(), output.datatype(), input_mem.data(), output_mem.data(), input.shape(), input.strides(), output.strides()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); 
}); register_evaluator(op_gather, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -694,8 +699,7 @@ void register_neutral_evaluators() kernels::gather(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), output.shape(), input.strides(), output.strides(), reinterpret_cast(indices.buffer().data()), indices.shape(), rnode.axis()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_gather_nd, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -708,8 +712,7 @@ void register_neutral_evaluators() kernels::gather_nd(input.datatype(), input_mem.data(), output_mem.data(), input.shape(), output.shape(), input.strides(), output.strides(), reinterpret_cast(indices.buffer().data()), indices.shape(), rnode.batch_dims()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_onehot, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -726,8 +729,7 @@ void register_neutral_evaluators() auto off_value_mem = off_value.buffer().data(); kernels::onehot(output.datatype(), indices_mem, output_mem, indices.shape(), output.shape(), output.strides(), depth_mem, off_value_mem, on_value_mem, rnode.axis(), rnode.mode()) - .unwrap_or_throw(); - }); + .unwrap_or_throw(); }); register_evaluator(op_cumsum, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -749,8 +751,7 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for cumsum: " + std::string(datatype_names(datatype))); - } - }); + } }); register_evaluator(op_hardmax, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -767,8 +768,7 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for hardmax: " + std::string(datatype_names(datatype))); - } - }); + } }); 
register_evaluator(op_random_normal, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -783,8 +783,7 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for random_normal: " + std::string(datatype_names(datatype))); - } - }); + } }); register_evaluator(op_random_uniform, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -799,8 +798,7 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for random_uniform: " + std::string(datatype_names(datatype))); - } - }); + } }); register_evaluator(op_topk, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -821,8 +819,7 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for topk: " + std::string(datatype_names(datatype))); - } - }); + } }); register_evaluator(op_trilu, [](ir::node &node, function_evaluate_context &context) { auto &rnode = static_cast(node); @@ -839,8 +836,107 @@ void register_neutral_evaluators() break; default: throw std::runtime_error("unsupported dtype for topk: " + std::string(datatype_names(datatype))); + } }); + + register_evaluator(op_gru, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + auto input = context.memory_at(rnode.input()); + auto W = context.memory_at(rnode.w()); + auto R = context.memory_at(rnode.r()); + auto B = context.memory_at(rnode.b()); + auto initial_h = context.memory_at(rnode.initial_h()); + auto output = context.memory_at(rnode.output()); + auto output_h = context.memory_at(rnode.output_h()); + kernels::gru(input.buffer().as_span().data(), W.buffer().as_span().data(), R.buffer().as_span().data(), + B.buffer().as_span().data(), initial_h.buffer().as_span().data(), output.buffer().as_span().data(), output_h.buffer().as_span().data(), + input.shape(), W.shape(), rnode.direction(), 
rnode.linear_before_reset()) + .unwrap_or_throw(); }); + + register_evaluator(op_tflite_detection_postprocess, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + auto box = context.memory_at(rnode.boxes()); + auto score = context.memory_at(rnode.scores()); + auto anchor = context.memory_at(rnode.anchors()); + auto output_locations = context.memory_at(rnode.output_locations()); + auto output_classes = context.memory_at(rnode.output_classes()); + auto output_scores = context.memory_at(rnode.output_scores()); + auto output_num_detections = context.memory_at(rnode.output_num_detections()); + kernels::tflite_detection_postprocess(box.buffer().as_span().data(), score.buffer().as_span().data(), anchor.buffer().as_span().data(), + output_locations.buffer().as_span().data(), output_classes.buffer().as_span().data(), output_scores.buffer().as_span().data(), output_num_detections.buffer().as_span().data(), + box.shape(), score.shape(), anchor.shape(), rnode.max_detections(), rnode.max_classes_per_detection(), + rnode.detections_per_class(), rnode.use_regular_non_max_suppression(), rnode.nms_score_threshold(), rnode.nms_iou_threshold(), + rnode.num_classes(), rnode.y_scale(), rnode.x_scale(), rnode.h_scale(), rnode.w_scale()) + .unwrap_or_throw(); }); + + register_evaluator(op_gather_elements, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + auto input = context.memory_at(rnode.input()); + auto indices = context.memory_at(rnode.indices()); + auto output = context.memory_at(rnode.output()); + auto input_datatype = rnode.input().type(); + + switch (input_datatype) + { + case dt_float32: + kernels::gather_elements(input.buffer().as_span().data(), indices.buffer().as_span().data(), output.buffer().as_span().data(), + input.shape(), indices.shape(), rnode.axis()) + .unwrap_or_throw(); + break; + default: + throw std::runtime_error("unsupported dtype for gather_elements: " + 
std::string(datatype_names(input_datatype))); } }); + + register_evaluator(op_instancenorm, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + + auto input = context.memory_at(rnode.input()); + auto scale = context.memory_at(rnode.scale()); + auto bias = context.memory_at(rnode.bias()); + auto output = context.memory_at(rnode.output()); + + auto output_type = rnode.output().type(); + switch (output_type) + { + case dt_float32: + kernels::instancenorm(input.buffer().as_span().data(), output.buffer().as_span().data(), + scale.buffer().as_span().data(), bias.buffer().as_span().data(), input.shape(), + rnode.epsilon()) + .unwrap_or_throw(); + break; + default: /* only dt_float32 is handled above; the diagnostic names this evaluator's own op */ + std::cerr << "unsupported dtype for instancenorm: " + std::string(datatype_names(output_type)); + } }); + + register_evaluator(op_layernorm, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + + auto input = context.memory_at(rnode.input()); + auto scale = context.memory_at(rnode.scale()); + auto bias = context.memory_at(rnode.bias()); + auto output = context.memory_at(rnode.output()); + + auto output_type = rnode.output().type(); + switch (output_type) + { + case dt_float32: + kernels::layernorm(input.buffer().as_span().data(), output.buffer().as_span().data(), + scale.buffer().as_span().data(), bias.buffer().as_span().data(), input.shape(), + rnode.axis(), rnode.epsilon()) + .unwrap_or_throw(); + break; + default: + std::cerr << "unsupported dtype for layernorm: " + std::string(datatype_names(output_type)); + } }); + + register_evaluator(op_compress, [](ir::node &node, function_evaluate_context &context) { + auto &rnode = static_cast(node); + auto input = context.memory_at(rnode.input()); + auto condition = context.memory_at(rnode.condition()); + auto output = context.memory_at(rnode.output()); + kernels::compress(input.buffer().as_span().data(), condition.buffer().as_span().data(), output.buffer().as_span().data(), + 
input.shape(), condition.shape(), rnode.axis()) + .unwrap_or_throw(); }); } } diff --git a/src/importer/caffe/CMakeLists.txt b/src/importer/caffe/CMakeLists.txt index 7fbea7a58c..70e91f9b78 100644 --- a/src/importer/caffe/CMakeLists.txt +++ b/src/importer/caffe/CMakeLists.txt @@ -25,6 +25,6 @@ set(SRCS caffe.pb.cc add_library(caffe_importer ${SRCS}) target_include_directories(caffe_importer PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(caffe_importer PUBLIC ${PARENT_SOURCE_DIR}/include) -target_link_libraries(caffe_importer PUBLIC ir) +target_link_libraries(caffe_importer PUBLIC ir protobuf::libprotobuf) target_link_libraries(caffe_importer PRIVATE) set_target_properties(caffe_importer PROPERTIES POSITION_INDEPENDENT_CODE ON) \ No newline at end of file diff --git a/src/importer/onnx/CMakeLists.txt b/src/importer/onnx/CMakeLists.txt index 1f4a58fadc..e00d09f29c 100644 --- a/src/importer/onnx/CMakeLists.txt +++ b/src/importer/onnx/CMakeLists.txt @@ -28,6 +28,7 @@ set(ONNX_IMPORTER_OPS_SOURCES ops/hardmax.cpp ops/identity.cpp ops/instancenorm.cpp + ops/layernorm.cpp ops/lpnorm.cpp ops/lrn.cpp ops/matmul.cpp @@ -59,6 +60,9 @@ set(ONNX_IMPORTER_OPS_SOURCES ops/unsqueeze.cpp ops/lstm.cpp ops/where.cpp + ops/gru.cpp + ops/gather_elements.cpp + ops/compress.cpp ) add_library(onnx_importer ${ONNX_IMPORTER_SOURCES} ${ONNX_IMPORTER_OPS_SOURCES}) diff --git a/src/importer/onnx/onnx_importer.h b/src/importer/onnx/onnx_importer.h index 6b496b7258..d9af3c4906 100644 --- a/src/importer/onnx/onnx_importer.h +++ b/src/importer/onnx/onnx_importer.h @@ -53,6 +53,7 @@ class onnx_importer void convert_op_logical(const onnx::NodeProto &node, const binary_op_t binary_op); void convert_op_arg(const onnx::NodeProto &node, reduce_arg_op_t op); void convert_op_compare(const onnx::NodeProto &node, const compare_op_t compare_op); + void convert_op_compress(const onnx::NodeProto &node); template void convert_pool(const onnx::NodeProto &node, const reduce_op_t reduce_op, const 
float initial_value); diff --git a/src/importer/onnx/opcode.def b/src/importer/onnx/opcode.def index 4d43f16dfd..935edec33f 100644 --- a/src/importer/onnx/opcode.def +++ b/src/importer/onnx/opcode.def @@ -13,6 +13,7 @@ DEFINE_OPCODE(Cast) DEFINE_OPCODE(Ceil) DEFINE_OPCODE(Celu) DEFINE_OPCODE(Clip) +DEFINE_OPCODE(Compress) DEFINE_OPCODE(Concat) DEFINE_OPCODE(Constant) DEFINE_OPCODE(ConstantOfShape) @@ -29,20 +30,24 @@ DEFINE_OPCODE(Elu) DEFINE_OPCODE(Exp) DEFINE_OPCODE(Expand) DEFINE_OPCODE(Equal) +DEFINE_OPCODE(Erf) DEFINE_OPCODE(Flatten) DEFINE_OPCODE(Floor) DEFINE_OPCODE(Gather) +DEFINE_OPCODE(GatherElements) DEFINE_OPCODE(GatherND) DEFINE_OPCODE(Gemm) DEFINE_OPCODE(GlobalAveragePool) DEFINE_OPCODE(GlobalMaxPool) DEFINE_OPCODE(Greater) DEFINE_OPCODE(GreaterOrEqual) +DEFINE_OPCODE(GRU) DEFINE_OPCODE(Hardmax) DEFINE_OPCODE(HardSigmoid) DEFINE_OPCODE(HardSwish) DEFINE_OPCODE(Identity) DEFINE_OPCODE(InstanceNormalization) +DEFINE_OPCODE(LayerNormalization) DEFINE_OPCODE(LpNormalization) DEFINE_OPCODE(LeakyRelu) DEFINE_OPCODE(Less) @@ -83,6 +88,7 @@ DEFINE_OPCODE(Resize) DEFINE_OPCODE(ReverseSequence) DEFINE_OPCODE(RoiAlign) DEFINE_OPCODE(Round) +DEFINE_OPCODE(Rsqrt) DEFINE_OPCODE(Selu) DEFINE_OPCODE(Shape) DEFINE_OPCODE(Sign) @@ -105,6 +111,7 @@ DEFINE_OPCODE(Tile) DEFINE_OPCODE(TopK) DEFINE_OPCODE(Transpose) DEFINE_OPCODE(Trilu) +DEFINE_OPCODE(ThresholdedRelu) DEFINE_OPCODE(Upsample) DEFINE_OPCODE(Unsqueeze) DEFINE_OPCODE(Where) diff --git a/src/importer/onnx/ops/activations.cpp b/src/importer/onnx/ops/activations.cpp index d6e2b3ed07..1ae5edc323 100644 --- a/src/importer/onnx/ops/activations.cpp +++ b/src/importer/onnx/ops/activations.cpp @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -571,4 +573,34 @@ void onnx_importer::convert_op_Softsign(const NodeProto &node) input_tensors_.emplace(&abs->input(), input); input_tensors_.emplace(&div->input_a(), input); output_tensors_.emplace(output, &div->output()); +} + 
+void onnx_importer::convert_op_ThresholdedRelu(const NodeProto &node) +{ + const auto &input = node.input()[0]; + const auto &output = node.output()[0]; + auto in_shape = get_shape(input); + const auto input_type = get_datatype(input).value(); + const auto &op_name { generate_name(node) }; + + const auto alpha_value = get_attribute(node, "alpha").value_or(1.0); + auto alpha = graph_.emplace(alpha_value); + alpha->name(op_name + ".alpha(ThresholdedRelu)"); + + auto cmp = graph_.emplace(compare_op_t::compare_greater, input_type, in_shape, alpha->output().shape()); + cmp->name(op_name + ".greater(ThresholdedRelu)"); + cmp->input_b().connect(alpha->output()); + + auto new_alpha = graph_.emplace(cmp->output().type(), cmp->output().shape(), dt_float32); + new_alpha->name(op_name + ".new_alpha(ThresholdedRelu)"); + new_alpha->input().connect(cmp->output()); + + auto b_max = graph_.emplace(binary_mul, input_type, in_shape, new_alpha->output().shape(), value_range::nonnegative()); + b_max->name(op_name + ".mul(ThresholdedRelu)"); + + b_max->input_b().connect(new_alpha->output()); + + input_tensors_.emplace(&cmp->input_a(), input); + input_tensors_.emplace(&b_max->input_a(), input); + output_tensors_.emplace(output, &b_max->output()); } \ No newline at end of file diff --git a/src/importer/onnx/ops/binary.cpp b/src/importer/onnx/ops/binary.cpp index 8f1d1f1901..a92c3df60a 100644 --- a/src/importer/onnx/ops/binary.cpp +++ b/src/importer/onnx/ops/binary.cpp @@ -74,11 +74,26 @@ void onnx_importer::convert_binary(const onnx::NodeProto &node, const binary_op_ auto input_a_shape = get_shape(input_a); const auto input_type = get_datatype(input_a).value(); auto input_b_shape = get_shape(input_b); + const auto input_b_type = get_datatype(input_b).value(); + convert *cvt = nullptr; + if (input_type != input_b_type) + { + cvt = graph_.emplace(input_b_type, input_b_shape, input_type); + cvt->name(op_name + "(Convert)"); + } auto op = graph_.emplace(binary_op, input_type, input_a_shape, 
input_b_shape, value_range::full()); op->name(op_name + '(' + binary_op_to_string(binary_op) + ')'); input_tensors_.emplace(&op->input_a(), input_a); - input_tensors_.emplace(&op->input_b(), input_b); + if (cvt) + { + input_tensors_.emplace(&cvt->input(), input_b); + op->input_b().connect(cvt->output()); + } + else + { + input_tensors_.emplace(&op->input_b(), input_b); + } output_tensors_.emplace(output, &op->output()); } diff --git a/src/importer/onnx/ops/compress.cpp b/src/importer/onnx/ops/compress.cpp new file mode 100644 index 0000000000..13fea4606e --- /dev/null +++ b/src/importer/onnx/ops/compress.cpp @@ -0,0 +1,49 @@ +/* Copyright 2020 Alexey Chernov <4ernov@gmail.com> + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../onnx_importer.h" +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::importer; +using namespace nncase::ir; +using namespace onnx; + +void onnx_importer::convert_op_Compress(const NodeProto &node) +{ + const auto &op_name { generate_name(node) }; + + auto input = node.input()[0]; + auto condition = node.input()[1]; + auto output = node.output()[0]; + + const auto in_type = get_datatype(input).value(); + const auto in_shape = get_shape(input); + const auto condition_shape = get_shape(condition); + const auto out_shape = get_shape(output); + + auto onnx_axis = get_attribute(node, "axis").value_or((int)in_shape.size()); + + auto onnx_compress = graph_.emplace(in_type, in_shape, condition_shape, out_shape, onnx_axis); + onnx_compress->name(op_name); + + input_tensors_.emplace(&onnx_compress->input_at(0), node.input()[0]); + input_tensors_.emplace(&onnx_compress->input_at(1), node.input()[1]); + + output_tensors_.emplace(node.output()[0], &onnx_compress->output()); +} diff --git a/src/importer/onnx/ops/conv.cpp b/src/importer/onnx/ops/conv.cpp index 34bd36bdc0..f548d514dc 100644 --- a/src/importer/onnx/ops/conv.cpp +++ b/src/importer/onnx/ops/conv.cpp @@ -16,6 +16,7 @@ #include "../onnx_importer.h" #include #include +#include #include #include #include @@ -176,19 +177,38 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node) auto input_shape = get_shape(input); auto weight_shape = get_shape(weight); - auto weight_type = get_datatype(weight).value(); auto output_shape = get_shape(output); + auto input_type = get_datatype(input).value(); + auto weight_type = get_datatype(weight).value(); + auto output_type = get_datatype(output).value(); + + bool model_3d = input_shape.size() == 3; // group const auto &group_attr = get_attribute(node, "group"); size_t group = group_attr ? 
group_attr.value() : 1; - auto tp = graph_.emplace(weight_type, weight_shape, axis_t { 1, 0, 2, 3 }); - tp->name(op_name + "(Transpose)"); - auto tp_shape = tp->output().shape(); - auto bc = graph_.emplace(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], tp_shape[3] }); - bc->name(op_name + "(Bitcast)"); - auto bc_shape = bc->output().shape(); + transpose *tp; + bitcast *bc; + shape_t bc_shape, tp_shape; + if (model_3d) + { + tp = graph_.emplace(weight_type, weight_shape, axis_t { 1, 0, 2 }); + tp->name(op_name + "(Transpose)"); + tp_shape = tp->output().shape(); + bc = graph_.emplace(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], 1 }); + bc->name(op_name + "(Bitcast)"); + bc_shape = bc->output().shape(); + } + else + { + tp = graph_.emplace(weight_type, weight_shape, axis_t { 1, 0, 2, 3 }); + tp->name(op_name + "(Transpose)"); + tp_shape = tp->output().shape(); + bc = graph_.emplace(weight_type, tp_shape, shape_t { tp_shape[0] * group, tp_shape[1] / group, tp_shape[2], tp_shape[3] }); + bc->name(op_name + "(Bitcast)"); + bc_shape = bc->output().shape(); + } // stride std::array strides = { 1, 1 }; @@ -244,7 +264,8 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node) { std::array total_paddings { { 0, 0 } }; total_paddings[0] = strides[0] * (input_shape[2] - 1) + output_paddings[0] + ((tp_shape[2] - 1) * dilations[0] + 1) - output_shape[2]; - total_paddings[1] = strides[1] * (input_shape[3] - 1) + output_paddings[1] + ((tp_shape[3] - 1) * dilations[1] + 1) - output_shape[3]; + if (!model_3d) + total_paddings[1] = strides[1] * (input_shape[3] - 1) + output_paddings[1] + ((tp_shape[3] - 1) * dilations[1] + 1) - output_shape[3]; if (pad_mode == "SAME_UPPER") { @@ -269,23 +290,35 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node) if (paddings_attr) { const auto &paddings_values = paddings_attr.value(); - if (paddings_values.size() > 1) + if (model_3d) { 
- paddings[0].before = paddings_values[0]; - paddings[1].before = paddings_values[1]; + if (paddings_values.size() > 1) + { + paddings[0].before = paddings_values[0]; + paddings[0].after = paddings_values[1]; + } } - - if (paddings_values.size() > 3) + else { - paddings[0].after = paddings_values[2]; - paddings[1].after = paddings_values[3]; + if (paddings_values.size() > 1) + { + paddings[0].before = paddings_values[0]; + paddings[1].before = paddings_values[1]; + } + + if (paddings_values.size() > 3) + { + paddings[0].after = paddings_values[2]; + paddings[1].after = paddings_values[3]; + } } } } else if (pad_mode == "SAME_UPPER") { paddings[0] = get_windowed_padding(input_shape[2], tp_shape[2], strides[0], dilations[0], true); - paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true); + if (!model_3d) + paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true); } else if (pad_mode == "SAME_LOWER") { @@ -293,19 +326,46 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node) if (paddings[0].before < paddings[0].after) std::swap(paddings[0].before, paddings[0].after); - paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true); - if (paddings[1].before < paddings[1].after) - std::swap(paddings[1].before, paddings[1].after); + if (!model_3d) + { + paddings[1] = get_windowed_padding(input_shape[3], tp_shape[3], strides[1], dilations[1], true); + if (paddings[1].before < paddings[1].after) + std::swap(paddings[1].before, paddings[1].after); + } } } + // fit 3D input + auto data_shape = input_shape; + if (model_3d) + { + paddings[1] = padding::zero(); + strides[1] = 1; + dilations[1] = 1; + input_shape.push_back(1); + + output_shape.push_back(1); + } + // ConvTranspose auto conv_transpose = graph_.emplace(input_shape, bc_shape, output_shape, group, paddings[0], paddings[1], output_paddings[0], output_paddings[1], strides[0], strides[1], 
dilations[0], dilations[1], value_range::full()); conv_transpose->name(op_name + "(ConvTranspose)"); - input_tensors_.emplace(&conv_transpose->input(), input); - input_tensors_.emplace(&tp->input(), weight); + if (model_3d) + { + auto bitc_data = graph_.emplace(input_type, data_shape, input_shape); + conv_transpose->input().connect(bitc_data->output()); + input_tensors_.emplace(&bitc_data->input(), input); + + input_tensors_.emplace(&tp->input(), weight); + } + else + { + input_tensors_.emplace(&conv_transpose->input(), input); + input_tensors_.emplace(&tp->input(), weight); + } + bc->input().connect(tp->output()); conv_transpose->weights().connect(bc->output()); if (node.input().size() > 2) @@ -319,5 +379,15 @@ void onnx_importer::convert_op_ConvTranspose(const NodeProto &node) auto bias = graph_.emplace(dt_float32, shape, zeros); conv_transpose->bias().connect(bias->output()); } - output_tensors_.emplace(output, &conv_transpose->output()); -} + + if (model_3d) + { + auto bitc_out = graph_.emplace(output_type, conv_transpose->output().shape(), shape_t { conv_transpose->output().shape()[0], conv_transpose->output().shape()[1], conv_transpose->output().shape()[2] }); + bitc_out->input().connect(conv_transpose->output()); + output_tensors_.emplace(output, &bitc_out->output()); + } + else + { + output_tensors_.emplace(output, &conv_transpose->output()); + } +} \ No newline at end of file diff --git a/src/importer/onnx/ops/expand.cpp b/src/importer/onnx/ops/expand.cpp index dd219dd114..0871d24a29 100644 --- a/src/importer/onnx/ops/expand.cpp +++ b/src/importer/onnx/ops/expand.cpp @@ -36,9 +36,25 @@ void onnx_importer::convert_op_Expand(const NodeProto &node) auto shape_vec = get_constant_value(node.input()[1]); shape_t shape { shape_vec.begin(), shape_vec.end() }; - auto ones = xt::ones(shape); - std::vector ones_vec { ones.begin(), ones.end() }; - auto con = graph_.emplace(input_type, shape, ones_vec); + constant *con = nullptr; + if (input_type == dt_int64) + { + 
auto ones = xt::ones(shape); + std::vector ones_vec { ones.begin(), ones.end() }; + con = graph_.emplace(input_type, shape, ones_vec); + } + else if (input_type == dt_float32) + { + auto ones = xt::ones(shape); + std::vector ones_vec { ones.begin(), ones.end() }; + con = graph_.emplace(input_type, shape, ones_vec); + } + else if (input_type == dt_uint8) + { + auto ones = xt::ones(shape); + std::vector ones_vec { ones.begin(), ones.end() }; + con = graph_.emplace(input_type, shape, ones_vec); + } auto op = graph_.emplace(binary_mul, input_type, input_shape, shape, value_range::full()); op->name(generate_name(node) + "(Expand)"); diff --git a/src/importer/onnx/ops/gather_elements.cpp b/src/importer/onnx/ops/gather_elements.cpp new file mode 100644 index 0000000000..b68010af90 --- /dev/null +++ b/src/importer/onnx/ops/gather_elements.cpp @@ -0,0 +1,58 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../onnx_importer.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::importer; +using namespace nncase::ir; +using namespace onnx; + +void onnx_importer::convert_op_GatherElements(const NodeProto &node) +{ + const auto &input = node.input()[0]; + const auto &indices = node.input()[1]; + const auto &output = node.output()[0]; + + const datatype_t input_type = get_datatype(input).value(); + const datatype_t indices_type = get_datatype(indices).value(); + const auto input_shape = get_shape(input); + const auto indices_shape = get_shape(indices); + const auto out_shape = get_shape(output); + + auto axis = get_attribute(node, "axis").value_or(0); + if (axis < 0) + { + axis += static_cast(input_shape.size()); + } + + auto ga = graph_.emplace(input_type, dt_int64, input_shape, indices_shape, out_shape, axis); + + auto mid_ptr = &ga->indices(); + if (indices_type == dt_int32) + { + auto cvt = graph_.emplace(indices_type, indices_shape, dt_int64); + cvt->name(ga->name() + "(cvt_int_to_int64)"); + ga->indices().connect(cvt->output()); + mid_ptr = &cvt->input(); + } + + link_input_tensor(&ga->input(), input); + link_input_tensor(mid_ptr, indices); + link_output_tensor(output, &ga->output()); +} \ No newline at end of file diff --git a/src/importer/onnx/ops/gru.cpp b/src/importer/onnx/ops/gru.cpp new file mode 100644 index 0000000000..227101fded --- /dev/null +++ b/src/importer/onnx/ops/gru.cpp @@ -0,0 +1,121 @@ +/* Copyright 2020 Alexey Chernov <4ernov@gmail.com> + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../onnx_importer.h" +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::importer; +using namespace nncase::ir; +using namespace onnx; + +void onnx_importer::convert_op_GRU(const NodeProto &node) +{ + const auto &op_name { generate_name(node) }; + + // attribute + auto direction_str = get_attribute(node, "direction").value_or("forward"); + lstm_direction direction = kForward; + if (direction_str == "forward") + direction = kForward; + else if (direction_str == "reverse") + direction = kReverse; + else + direction = kBidirectional; + size_t num_directions = direction == kBidirectional ? 2 : 1; + + auto linear_before_reset = get_attribute(node, "linear_before_reset").value_or(0); + + // input + auto input_size = node.input_size(); + assert(input_size >= 3 && input_size <= 8); + const auto &input = node.input()[0]; + const auto &W = node.input()[1]; + const auto &R = node.input()[2]; + + const datatype_t input_type = get_datatype(input).value(); + const auto &input_shape = get_shape(input); + const auto &W_shape = get_shape(W); + const auto &R_shape = get_shape(R); + + size_t seq_length = input_shape[0]; + size_t batch_size = input_shape[1]; + size_t hidden_size = get_attribute(node, "hidden_size").value_or(W_shape[1] / 3); + + // bias + std::string B; + shape_t B_shape { num_directions, 6 * hidden_size }; + if (input_size >= 4) + { + B = node.input()[3]; + } + + // initial_h + std::string initial_h; + shape_t initial_shape { num_directions, batch_size, hidden_size }; + if (input_size >= 6) + { + initial_h = node.input()[5]; + } + + // output + auto output_size = node.output_size(); + assert(output_size >= 0 && output_size <= 3); + std::string output; + if (output_size >= 1) + output = node.output()[0]; + + std::string output_h; + if (output_size >= 2) + output_h = node.output()[1]; + + shape_t output_shape 
{ seq_length, num_directions, batch_size, hidden_size }; + auto lstm_node = graph_.emplace(input_shape, W_shape, R_shape, B_shape, output_shape, initial_shape, direction, "onnx", linear_before_reset == 0 ? false : true); + lstm_node->name(op_name); + + input_tensors_.emplace(&lstm_node->input_at(0), input); + input_tensors_.emplace(&lstm_node->input_at(1), W); + input_tensors_.emplace(&lstm_node->input_at(2), R); + if (!B.empty()) + { + input_tensors_.emplace(&lstm_node->input_at(3), B); + } + else + { + std::vector v(xt::compute_size(B_shape), 0.f); + auto c = graph_.emplace(input_type, B_shape, v); + lstm_node->b().connect(c->output()); + } + + if (!initial_h.empty()) + { + input_tensors_.emplace(&lstm_node->input_at(4), initial_h); + } + else + { + std::vector v(xt::compute_size(initial_shape), 0.f); + auto c = graph_.emplace(input_type, initial_shape, v); + lstm_node->initial_h().connect(c->output()); + } + + if (!output.empty()) + output_tensors_.emplace(output, &lstm_node->output()); + + if (!output_h.empty()) + output_tensors_.emplace(output_h, &lstm_node->output_h()); +} diff --git a/src/importer/onnx/ops/instancenorm.cpp b/src/importer/onnx/ops/instancenorm.cpp index 80719fcffa..79d35bf23c 100644 --- a/src/importer/onnx/ops/instancenorm.cpp +++ b/src/importer/onnx/ops/instancenorm.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -58,67 +59,85 @@ void onnx_importer::convert_op_InstanceNormalization(const NodeProto &node) auto bias_constant = graph_.emplace(get_datatype(), bias_new_shape, bias_value); bias_constant->name(op_name + ".bias(InstanceNormalization)"); - // mean - axis_t axes; - for (size_t i = 2; i < input_shape.size(); i++) - { - axes.push_back(i); - } - float init_value = 0.f; - bool keepdims = true; - auto mean = graph_.emplace(reduce_mean, input_type, input_shape, axes, init_value, keepdims); - mean->name(op_name + ".reduce_mean(InstanceNormalization)"); - - // x - mean - auto sub = graph_.emplace(binary_sub, 
input_type, input_shape, mean->output().shape(), value_range::full()); - sub->name(op_name + ".sub(InstanceNormalization)"); - - // scale * (x - mean) - auto mul = graph_.emplace(binary_mul, input_type, scale_new_shape, sub->output().shape(), value_range::full()); - mul->name(op_name + ".mul(InstanceNormalization)"); - - // variance - auto square = graph_.emplace(unary_square, sub->output().shape()); - square->name(op_name + ".square(InstanceNormalization)"); - auto variance = graph_.emplace(reduce_mean, input_type, square->output().shape(), axes, init_value, keepdims); - variance->name(op_name + ".reduce(InstanceNormalization)"); - - // sqrt(variance + epsilon) auto epsilon_attr = get_attribute(node, "epsilon"); auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f; - auto eps_constant = graph_.emplace(epsilon); - eps_constant->name(op_name + ".eps(InstanceNormalization)"); - auto add_eps = graph_.emplace(binary_add, input_type, variance->output().shape(), eps_constant->output().shape(), value_range::full()); - add_eps->name(op_name + ".add(InstanceNormalization)"); - auto sqrt = graph_.emplace(unary_sqrt, add_eps->output().shape()); - sqrt->name(op_name + ".sqrt(InstanceNormalization)"); - // scale * (x - mean) / sqrt(variance + epsilon) + B - auto div = graph_.emplace(binary_div, input_type, mul->output().shape(), sqrt->output().shape(), value_range::full()); - div->name(op_name + ".scale(InstanceNormalization)"); - auto add_bias = graph_.emplace(binary_add, input_type, div->output().shape(), bias_new_shape, value_range::full()); - add_bias->name(op_name + ".bias(InstanceNormalization)"); - - sub->input_b().connect(mean->output()); - - mul->input_a().connect(scale_constant->output()); - mul->input_b().connect(sub->output()); - - square->input().connect(sub->output()); - variance->input().connect(square->output()); - - add_eps->input_a().connect(variance->output()); - add_eps->input_b().connect(eps_constant->output()); - - 
sqrt->input().connect(add_eps->output()); - - div->input_a().connect(mul->output()); - div->input_b().connect(sqrt->output()); - - add_bias->input_a().connect(div->output()); - add_bias->input_b().connect(bias_constant->output()); - - input_tensors_.emplace(&mean->input(), input); - input_tensors_.emplace(&sub->input_a(), input); - output_tensors_.emplace(output, &add_bias->output()); + auto instance_norm = graph_.emplace(input_type, input_shape, epsilon); + instance_norm->scale().connect(scale_constant->output()); + instance_norm->bias().connect(bias_constant->output()); + input_tensors_.emplace(&instance_norm->input(), input); + output_tensors_.emplace(output, &instance_norm->output()); + + // // mean + // axis_t axes; + // for (size_t i = 2; i < input_shape.size(); i++) + // { + // axes.push_back(i); + // } + // float init_value = 0.f; + // bool keepdims = true; + // auto mean = graph_.emplace(reduce_mean, input_type, input_shape, axes, init_value, keepdims); + // mean->attributes(mean->attributes() | node_attributes::node_attr_skip_quantize); + // mean->name(op_name + ".reduce_mean(InstanceNormalization)"); + + // // x - mean + // auto sub = graph_.emplace(binary_sub, input_type, input_shape, mean->output().shape(), value_range::full()); + // sub->attributes(sub->attributes() | node_attributes::node_attr_skip_quantize); + // sub->name(op_name + ".sub(InstanceNormalization)"); + + // // scale * (x - mean) + // auto mul = graph_.emplace(binary_mul, input_type, scale_new_shape, sub->output().shape(), value_range::full()); + // mul->attributes(mul->attributes() | node_attributes::node_attr_skip_quantize); + // mul->name(op_name + ".mul(InstanceNormalization)"); + + // // variance + // auto square = graph_.emplace(unary_square, sub->output().shape()); + // square->attributes(square->attributes() | node_attributes::node_attr_skip_quantize); + // square->name(op_name + ".square(InstanceNormalization)"); + // auto variance = graph_.emplace(reduce_mean, input_type, 
square->output().shape(), axes, init_value, keepdims); + // variance->attributes(variance->attributes() | node_attributes::node_attr_skip_quantize); + // variance->name(op_name + ".reduce(InstanceNormalization)"); + + // // sqrt(variance + epsilon) + // auto epsilon_attr = get_attribute(node, "epsilon"); + // auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f; + // auto eps_constant = graph_.emplace(epsilon); + // eps_constant->name(op_name + ".eps(InstanceNormalization)"); + // auto add_eps = graph_.emplace(binary_add, input_type, variance->output().shape(), eps_constant->output().shape(), value_range::full()); + // add_eps->attributes(add_eps->attributes() | node_attributes::node_attr_skip_quantize); + // add_eps->name(op_name + ".add(InstanceNormalization)"); + // auto sqrt = graph_.emplace(unary_sqrt, add_eps->output().shape()); + // sqrt->attributes(sqrt->attributes() | node_attributes::node_attr_skip_quantize); + // sqrt->name(op_name + ".sqrt(InstanceNormalization)"); + + // // scale * (x - mean) / sqrt(variance + epsilon) + B + // auto div = graph_.emplace(binary_div, input_type, mul->output().shape(), sqrt->output().shape(), value_range::full()); + // div->attributes(div->attributes() | node_attributes::node_attr_skip_quantize); + // div->name(op_name + ".scale(InstanceNormalization)"); + // auto add_bias = graph_.emplace(binary_add, input_type, div->output().shape(), bias_new_shape, value_range::full()); + // add_bias->attributes(add_bias->attributes() | node_attributes::node_attr_skip_quantize); + // add_bias->name(op_name + ".bias(InstanceNormalization)"); + + // sub->input_b().connect(mean->output()); + + // mul->input_a().connect(scale_constant->output()); + // mul->input_b().connect(sub->output()); + + // square->input().connect(sub->output()); + // variance->input().connect(square->output()); + + // add_eps->input_a().connect(variance->output()); + // add_eps->input_b().connect(eps_constant->output()); + + // 
sqrt->input().connect(add_eps->output()); + + // div->input_a().connect(mul->output()); + // div->input_b().connect(sqrt->output()); + + // add_bias->input_a().connect(div->output()); + // add_bias->input_b().connect(bias_constant->output()); + + // input_tensors_.emplace(&mean->input(), input); + // input_tensors_.emplace(&sub->input_a(), input); + // output_tensors_.emplace(output, &add_bias->output()); } diff --git a/src/importer/onnx/ops/layernorm.cpp b/src/importer/onnx/ops/layernorm.cpp new file mode 100644 index 0000000000..239cbc053a --- /dev/null +++ b/src/importer/onnx/ops/layernorm.cpp @@ -0,0 +1,72 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nncase/ir/ops/layernorm.h" +#include "../onnx_importer.h" +#include "nncase/ir/ir_types.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::importer; +using namespace nncase::ir; +using namespace onnx; + +void onnx_importer::convert_op_LayerNormalization(const NodeProto &node) +{ + assert(node.input().size() >= 2); + + const auto &op_name { generate_name(node) }; + + const auto &input = node.input()[0]; + const auto &scale = node.input()[1]; + const auto output = node.output()[0]; + + auto input_shape = get_shape(input); + const auto input_type = get_datatype(input).value(); + + std::vector scale_value; + auto scale_initializer = get_initializer(scale); + scale_value = scale_initializer ? 
to>(scale_initializer.value()) : get_constant_input_data(scale).value(); + auto scale_shape = get_shape(scale); + auto scale_constant = graph_.emplace(get_datatype(), scale_shape, scale_value); + scale_constant->name(op_name + ".scale(LayerNormalization)"); + + auto bias_shape = scale_shape; + std::vector bias_value(xt::compute_size(scale_shape), 0.f); + if (node.input().size() > 2) + { + const auto &bias = node.input()[2]; + auto bias_initializer = get_initializer(bias); + bias_value = bias_initializer ? to>(bias_initializer.value()) : get_constant_input_data(bias).value(); + } + auto bias_constant = graph_.emplace(get_datatype(), bias_shape, bias_value); + bias_constant->name(op_name + ".bias(LayerNormalization)"); + + auto axis_attr = get_attribute(node, "axis"); + int32_t axis = axis_attr ? axis_attr.value() : -1; + + auto epsilon_attr = get_attribute(node, "epsilon"); + auto epsilon = epsilon_attr ? epsilon_attr.value() : 1e-05f; + + auto ln = graph_.emplace(input_type, input_shape, axis, epsilon); + ln->name(op_name + ".layer_norm(LayerNormalization)"); + + input_tensors_.emplace(&ln->input(), input); + ln->scale().connect(scale_constant->output()); + ln->bias().connect(bias_constant->output()); + output_tensors_.emplace(output, &ln->output()); +} diff --git a/src/importer/onnx/ops/pool.cpp b/src/importer/onnx/ops/pool.cpp index 41c01d8002..804af97961 100644 --- a/src/importer/onnx/ops/pool.cpp +++ b/src/importer/onnx/ops/pool.cpp @@ -74,6 +74,7 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce const auto &output = node.output()[0]; auto input_shape = get_shape(input); + auto output_shape = get_shape(output); padding_mode pad_mode = padding_mode::notset; const auto &auto_pad_attr = get_attribute(node, "auto_pad"); @@ -82,6 +83,13 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce pad_mode = parse_padding_mode(auto_pad_attr.value()); } + int ceil_mode = 0; + const auto &ceil_mode_attr = 
get_attribute(node, "ceil_mode"); + if (ceil_mode_attr) + { + ceil_mode = static_cast(ceil_mode_attr.value()); + } + bool count_include_pad = false; const auto &count_include_pad_attr = get_attribute(node, "count_include_pad"); if (count_include_pad_attr) @@ -133,6 +141,17 @@ void onnx_importer::convert_pool(const NodeProto &node, const reduce_op_t reduce } } + if (ceil_mode) + { + auto get_input_size = [](int output_size, int k, int s, int p) { return (output_size - 1) * s + k - p; }; + auto extra_paddg_h = get_input_size(output_shape[2], kernel_shape[0], strides[0], pads[0].sum()) - input_shape[2]; + if (extra_paddg_h > 0) + pads[0].after += extra_paddg_h; + auto extra_paddg_w = get_input_size(output_shape[3], kernel_shape[1], strides[1], pads[1].sum()) - input_shape[3]; + if (extra_paddg_w > 0) + pads[1].after += extra_paddg_w; + } + auto op = graph_.emplace(reduce_op, move(input_shape), init_value, kernel_shape[0], kernel_shape[1], pads[0], pads[1], strides[0], strides[1], dilations[0], dilations[1], value_range::full(), false, count_include_pad); diff --git a/src/importer/onnx/ops/slice.cpp b/src/importer/onnx/ops/slice.cpp index 3684e56394..ffade78561 100644 --- a/src/importer/onnx/ops/slice.cpp +++ b/src/importer/onnx/ops/slice.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include using namespace nncase; @@ -31,6 +32,23 @@ void onnx_importer::convert_op_Slice(const NodeProto &node) const shape_t &input_shape = get_shape(input); auto ndim = input_shape.size(); +#define GET_ATTRIBUTE(index, dst) \ + { \ + const std::string &name = node.input()[index]; \ + const datatype_t type = get_datatype(name).value(); \ + \ + if (type == datatype_t::dt_int32) \ + { \ + auto vec = get_constant_value(node.input()[index]); \ + dst.assign(vec.begin(), vec.end()); \ + } \ + else \ + { \ + auto vec = get_constant_value(node.input()[index]); \ + dst.assign(vec.begin(), vec.end()); \ + } \ + } + // starts/stops axis_t starts, stops; bool use_opset_1 = 
node.input().size() == 1; @@ -43,11 +61,9 @@ void onnx_importer::convert_op_Slice(const NodeProto &node) else { // opset 10/11/13 - auto vec = get_constant_value(node.input()[1]); - starts.assign(vec.begin(), vec.end()); + GET_ATTRIBUTE(1, starts) - vec = get_constant_value(node.input()[2]); - stops.assign(vec.begin(), vec.end()); + GET_ATTRIBUTE(2, stops) } assert(starts.size() == stops.size()); assert(starts.size() <= ndim); @@ -63,8 +79,7 @@ void onnx_importer::convert_op_Slice(const NodeProto &node) } else if (node.input().size() >= 4) { - auto vec = get_constant_value(node.input()[3]); - axes.assign(vec.begin(), vec.end()); + GET_ATTRIBUTE(3, axes) } if (axes.empty()) @@ -77,8 +92,7 @@ void onnx_importer::convert_op_Slice(const NodeProto &node) axis_t steps; if (node.input().size() >= 5) { - auto vec = get_constant_value(node.input()[4]); - steps.assign(vec.begin(), vec.end()); + GET_ATTRIBUTE(4, steps); assert(steps.size() == axes.size()); } diff --git a/src/importer/onnx/ops/unary.cpp b/src/importer/onnx/ops/unary.cpp index f2b6a32c55..724a037da1 100644 --- a/src/importer/onnx/ops/unary.cpp +++ b/src/importer/onnx/ops/unary.cpp @@ -129,11 +129,21 @@ void onnx_importer::convert_op_Sqrt(const onnx::NodeProto &node) convert_unary(node, unary_sqrt); } +void onnx_importer::convert_op_Rsqrt(const onnx::NodeProto &node) +{ + convert_unary(node, unary_rsqrt); +} + void onnx_importer::convert_op_Tanh(const onnx::NodeProto &node) { convert_unary(node, unary_tanh); } +void onnx_importer::convert_op_Erf(const onnx::NodeProto &node) +{ + convert_unary(node, unary_erf); +} + void onnx_importer::convert_unary(const onnx::NodeProto &node, const unary_op_t unary_op) { assert(node.input().size() == 1); diff --git a/src/importer/onnx/ops/where.cpp b/src/importer/onnx/ops/where.cpp index 49c66cfcf3..f86985e5cb 100644 --- a/src/importer/onnx/ops/where.cpp +++ b/src/importer/onnx/ops/where.cpp @@ -35,10 +35,9 @@ void onnx_importer::convert_op_Where(const onnx::NodeProto &node) 
const auto &input_c = node.input()[2]; const auto &output = node.output()[0]; - quant_param_t qparam { 0, 1.f }; datatype_t dtype = dt_float32; - auto deq_a = graph_.emplace(get_datatype(input_a).value(), get_shape(input_a), dtype, qparam); - deq_a->name(op_name + "/deq_a"); + auto deq_a = graph_.emplace(get_datatype(input_a).value(), get_shape(input_a), dtype); + deq_a->name(op_name + "/cvt"); auto op = graph_.emplace(dtype, get_datatype(input_b).value(), deq_a->output().shape(), get_shape(input_b), get_shape(input_c)); op->name(op_name + "/ternary"); diff --git a/src/importer/tflite/CMakeLists.txt b/src/importer/tflite/CMakeLists.txt index 1be871d3c6..1439d623d9 100644 --- a/src/importer/tflite/CMakeLists.txt +++ b/src/importer/tflite/CMakeLists.txt @@ -37,7 +37,7 @@ set(SRCS tflite_importer.cpp ops/split.cpp) add_library(tflite_importer OBJECT ${SRCS}) -target_link_libraries(tflite_importer PUBLIC ir flatbuffers) +target_link_libraries(tflite_importer PUBLIC ir flatbuffers::flatbuffers) target_include_directories(tflite_importer PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) add_dependencies(tflite_importer TFLITE_FB) set_target_properties(tflite_importer PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/src/importer/tflite/ops/custom.cpp b/src/importer/tflite/ops/custom.cpp index 2637db2557..3ef635d077 100644 --- a/src/importer/tflite/ops/custom.cpp +++ b/src/importer/tflite/ops/custom.cpp @@ -15,6 +15,7 @@ #include "../tflite_importer.h" #include #include +#include using namespace nncase; using namespace nncase::importer; @@ -43,6 +44,53 @@ DEFINE_TFLITE_LOWER(CUSTOM) node->name(output.name()->string_view()); link_output_tensor(op.outputs()->Get(0), &node->output()); } + else if (custom_code == "TFLite_Detection_PostProcess") + { + auto &input_decoded_boxes = get_tensor(op.inputs(), 0); + auto &input_scores = get_tensor(op.inputs(), 1); + auto &input_anchors = get_tensor(op.inputs(), 2); + + // get_shape(output_x.shape()): get error shape, ignore it in this step. 
fix it in independent transform + auto &output_locations = get_tensor(op.outputs(), 0); //detection_boxes (1, num_detected_boxes, 4) + auto &output_classes = get_tensor(op.outputs(), 1); //detection_classes (1, num_detected_boxes) + auto &output_scores = get_tensor(op.outputs(), 2); //detection_scores (1, num_detected_boxes) + auto &output_num_detections = get_tensor(op.outputs(), 3); //num_detections (1) + + auto custom_options = op.custom_options(); + + const auto &m = flexbuffers::GetRoot(custom_options->data(), custom_options->size()).AsMap(); + auto max_detections = m["max_detections"].AsInt32(); + auto max_classes_per_detection = m["max_classes_per_detection"].AsInt32(); + + int32_t detections_per_class = 100; + if (!m["detections_per_class"].IsNull()) + detections_per_class = m["detections_per_class"].AsInt32(); + + bool use_regular_non_max_suppression = false; + if (!m["use_regular_nms"].IsNull()) + use_regular_non_max_suppression = m["use_regular_nms"].AsBool(); + + auto non_max_suppression_score_threshold = m["nms_score_threshold"].AsFloat(); + auto intersection_over_union_threshold = m["nms_iou_threshold"].AsFloat(); + auto num_classes = m["num_classes"].AsInt32(); + auto y = m["y_scale"].AsFloat(); + auto x = m["x_scale"].AsFloat(); + auto h = m["h_scale"].AsFloat(); + auto w = m["w_scale"].AsFloat(); + + auto node = graph_.emplace(get_shape(input_decoded_boxes.shape()), get_shape(input_scores.shape()), get_shape(input_anchors.shape()), + get_shape(output_locations.shape()), get_shape(output_classes.shape()), get_shape(output_scores.shape()), get_shape(output_num_detections.shape()), + max_detections, max_classes_per_detection, detections_per_class, use_regular_non_max_suppression, non_max_suppression_score_threshold, + intersection_over_union_threshold, num_classes, y, x, h, w); + + link_input_tensor(&node->boxes(), op.inputs()->Get(0)); + link_input_tensor(&node->scores(), op.inputs()->Get(1)); + link_input_tensor(&node->anchors(), 
op.inputs()->Get(2)); + link_output_tensor(op.outputs()->Get(0), &node->output_locations()); + link_output_tensor(op.outputs()->Get(1), &node->output_classes()); + link_output_tensor(op.outputs()->Get(2), &node->output_scores()); + link_output_tensor(op.outputs()->Get(3), &node->output_num_detections()); + } else { throw std::runtime_error(std::string("Unsupported tflite CUSTOM code: ") + custom_code); diff --git a/src/importer/tflite/ops/quantize.cpp b/src/importer/tflite/ops/quantize.cpp index 9c446193e8..46fa70f2de 100644 --- a/src/importer/tflite/ops/quantize.cpp +++ b/src/importer/tflite/ops/quantize.cpp @@ -27,30 +27,12 @@ DEFINE_TFLITE_LOWER(QUANTIZE) auto &input = get_tensor(op.inputs(), 0); auto &output = get_tensor(op.outputs(), 0); - [[maybe_unused]] dequantize *deq; - - auto tp1 = graph_.emplace(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 }); - tp1->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/pre_trans"); - auto mid_output = &tp1->output(); - if (input.type() != tflite::TensorType_FLOAT32) - { - deq = graph_.emplace(tp1->output().type(), tp1->output().shape(), dt_float32, - to_quant_param(input.quantization())); - deq->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/deq"); - mid_output = &deq->output(); - deq->input().connect(tp1->output()); - } - - auto q = graph_.emplace(dt_float32, mid_output->shape(), to_data_type(output.type()), + auto q = graph_.emplace(to_data_type(input.type()), get_shape(input.shape()), to_data_type(output.type()), to_quant_param(output.quantization())); q->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/q"); - auto tp2 = graph_.emplace(q->output().type(), q->output().shape(), axis_t { 0, 2, 3, 1 }); - tp2->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/trans"); - q->input().connect(*mid_output); - tp2->input().connect(q->output()); - link_input_tensor(&tp1->input(), op.inputs()->Get(0)); - 
link_output_tensor(op.outputs()->Get(0), &tp2->output()); + link_input_tensor(&q->input(), op.inputs()->Get(0)); + link_output_tensor(op.outputs()->Get(0), &q->output()); } DEFINE_TFLITE_LOWER(FAKE_QUANT) @@ -71,34 +53,12 @@ DEFINE_TFLITE_LOWER(DEQUANTIZE) auto &input = get_tensor(op.inputs(), 0); auto &output = get_tensor(op.outputs(), 0); - [[maybe_unused]] dequantize *deq; - [[maybe_unused]] quantize *q; - - auto tp1 = graph_.emplace(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 }); - tp1->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/pre_trans"); - auto mid_output = &tp1->output(); - // auto mid_input = &tp1->output(); - if (input.type() != tflite::TensorType_FLOAT32) + if (op.outputs()->size() != 0) { - deq = graph_.emplace(tp1->output().type(), tp1->output().shape(), dt_float32, + auto deq = graph_.emplace(to_data_type(input.type()), get_shape(input.shape()), to_data_type(output.type()), to_quant_param(input.quantization())); deq->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/deq"); - mid_output = &deq->output(); - deq->input().connect(tp1->output()); + link_input_tensor(&deq->input(), op.inputs()->Get(0)); + link_output_tensor(op.outputs()->Get(0), &deq->output()); } - - if (output.type() != tflite::TensorType_FLOAT32) - { - q = graph_.emplace(dt_float32, mid_output->shape(), to_data_type(output.type()), - to_quant_param(output.quantization())); - q->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/q"); - mid_output = &q->output(); - q->input().connect(tp1->output()); - } - auto tp2 = graph_.emplace(mid_output->type(), mid_output->shape(), axis_t { 0, 2, 3, 1 }); - tp2->name(std::string(get_tensor(op.outputs(), 0).name()->string_view()) + "/trans"); - - tp2->input().connect(*mid_output); - link_input_tensor(&tp1->input(), op.inputs()->Get(0)); - link_output_tensor(op.outputs()->Get(0), &tp2->output()); } \ No newline at end of file diff --git 
a/src/importer/tflite/ops/space_to_batch.cpp b/src/importer/tflite/ops/space_to_batch.cpp index ae9e1edad0..caba5f32a3 100644 --- a/src/importer/tflite/ops/space_to_batch.cpp +++ b/src/importer/tflite/ops/space_to_batch.cpp @@ -86,6 +86,7 @@ DEFINE_TFLITE_LOWER(SPACE_TO_BATCH_ND) else { block_size_w = block_shape.data()[1]; + real_block_size_w = block_shape.data()[1]; tp1 = graph_.emplace(to_data_type(input.type()), get_shape(input.shape()), axis_t { 0, 3, 1, 2 }); input_conn = &tp1->input(); } diff --git a/src/importer/tflite/ops/unary.cpp b/src/importer/tflite/ops/unary.cpp index d52e9eba9a..2722b6c648 100644 --- a/src/importer/tflite/ops/unary.cpp +++ b/src/importer/tflite/ops/unary.cpp @@ -63,22 +63,7 @@ DEFINE_TFLITE_LOWER(ROUND) DEFINE_TFLITE_LOWER(RSQRT) { - auto &input = get_tensor(op.inputs(), 0); - - auto one = graph_.emplace(1.f); - auto sqrt = graph_.emplace(unary_sqrt, get_shape(input.shape())); - auto div = graph_.emplace(binary_div, to_data_type(input.type()), one->output().shape(), sqrt->output().shape(), value_range::full()); - - auto name = std::string(get_tensor(op.outputs(), 0).name()->string_view()); - one->name(name); - sqrt->name(name); - div->name(name); - - div->input_a().connect(one->output()); - div->input_b().connect(sqrt->output()); - - link_input_tensor(&sqrt->input(), op.inputs()->Get(0)); - link_output_tensor(op.outputs()->Get(0), &div->output()); + convert_unary(op, unary_rsqrt); } DEFINE_TFLITE_LOWER(SIN) diff --git a/src/ir/graph.cpp b/src/ir/graph.cpp index e0efc5b462..310a5b3d0e 100644 --- a/src/ir/graph.cpp +++ b/src/ir/graph.cpp @@ -130,7 +130,7 @@ void graph::dce() nodes_.erase(end, std::end(nodes_)); } -split_graph_result graph::split_subgraph(std::span nodes) +split_graph_result graph::split_subgraph(std::span nodes, bool reorder_input) { split_graph_result result; result.subgraph = std::make_unique(nodes.front()->module_type()); @@ -148,9 +148,38 @@ split_graph_result graph::split_subgraph(std::span nodes) } } +#define 
ADD_INODE \ + auto inode = result.subgraph->emplace(in->type(), in->shape()); \ + inode->name(in->connection()->owner().name()); \ + inode->module_type(in->owner().module_type()); \ + result.inputs.emplace(inode, in->connection()); \ + inputs.emplace(in->connection(), inode); \ + in->connect(inode->output()); + +#define ADD_ONODE \ + auto onode = result.subgraph->emplace(out->type(), out->shape()); \ + onode->name(out->owner().name()); \ + onode->module_type(out->owner().module_type()); \ + \ + for (auto in : dup(conns)) \ + { \ + if (!subgraph_nodes.contains(&in->owner())) \ + { \ + result.outputs[onode].emplace_back(in); \ + in->clear_connection(); \ + } \ + } \ + \ + out->connect(onode->input()); + // 2. Find in/out connectors std::unordered_set outputs; std::unordered_map inputs; + std::vector graph_inputs; + std::vector remained_inputs; + std::vector input_order; + std::vector graph_outputs; + std::vector output_order; for (auto node : nodes) { for (auto in : node->inputs()) @@ -159,16 +188,22 @@ split_graph_result graph::split_subgraph(std::span nodes) { if (outputs.emplace(in->connection()).second) { - auto inode = result.subgraph->emplace(in->type(), in->shape()); - inode->name(in->connection()->owner().name()); - inode->module_type(node->module_type()); - result.inputs.emplace(inode, in->connection()); - inputs.emplace(in->connection(), inode); - in->connect(inode->output()); + if (reorder_input && node_cast(in->connection()->owner())) + { + graph_inputs.push_back(in); + input_order.push_back(std::distance(inputs_.begin(), std::find(inputs_.begin(), inputs_.end(), node_cast(in->connection()->owner())))); + } + else + { + ADD_INODE + } } else { - in->connect(inputs.at(in->connection())->output()); + if (reorder_input && node_cast(in->connection()->owner())) + remained_inputs.push_back(in); + else + in->connect(inputs.at(in->connection())->output()); } } } @@ -178,24 +213,49 @@ split_graph_result graph::split_subgraph(std::span nodes) auto conns = 
out->connections(); if (std::any_of(conns.begin(), conns.end(), [&](input_connector *in) { return !subgraph_nodes.contains(&in->owner()); })) { - auto onode = result.subgraph->emplace(out->type(), out->shape()); - onode->name(out->owner().name()); - onode->module_type(node->module_type()); - - for (auto in : dup(conns)) + auto it = std::find_if(conns.begin(), conns.end(), [&](input_connector *in) { return node_cast(in->owner()); }); + if (it != conns.end()) { - if (!subgraph_nodes.contains(&in->owner())) - { - result.outputs[onode].emplace_back(in); - in->clear_connection(); - } + graph_outputs.push_back(out); + output_order.push_back(std::distance(outputs_.begin(), std::find(outputs_.begin(), outputs_.end(), node_cast((*it)->owner())))); + } + else + { + ADD_ONODE } - - out->connect(onode->input()); } } } + auto sort_indexes = [](const std::vector &v) { + std::vector idx(v.size()); + iota(idx.begin(), idx.end(), 0); + + stable_sort(idx.begin(), idx.end(), + [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; }); + + return idx; + }; + + auto ordered_in_indexes = sort_indexes(input_order); + for (auto idx : ordered_in_indexes) + { + auto in = graph_inputs[idx]; + ADD_INODE + } + for (auto &in : remained_inputs) + in->connect(inputs.at(in->connection())->output()); + + auto ordered_out_indexes = sort_indexes(output_order); + for (auto idx : ordered_out_indexes) + { + auto out = graph_outputs[idx]; + auto conns = out->connections(); + ADD_ONODE + } + +#undef ADD_ONODE + return result; } diff --git a/src/ir/graph.partition.cpp b/src/ir/graph.partition.cpp index c48f79e695..faea8c21eb 100644 --- a/src/ir/graph.partition.cpp +++ b/src/ir/graph.partition.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include using namespace nncase; @@ -32,6 +33,7 @@ struct region std::unordered_set nodes_set; std::unordered_set region_inputs; std::unordered_set outputs; + std::unordered_map need_remove_outputs; region(module_type_t module_type) : module_type(module_type) @@ 
-47,14 +49,33 @@ struct region region_inputs.emplace(in); for (auto out : n.outputs()) outputs.emplace(out); + for (auto it = region_inputs.begin(); it != region_inputs.end();) { if (outputs.contains((*it)->connection())) + { + if (need_remove_outputs.find((*it)->connection()) != need_remove_outputs.end()) + need_remove_outputs.at((*it)->connection()) -= 1; + else + need_remove_outputs.emplace((*it)->connection(), + (*it)->connection()->connections().size() - 1); it = region_inputs.erase(it); + } else ++it; } + for (auto it = need_remove_outputs.begin(); it != need_remove_outputs.end();) + { + if (it->second == 0) + { + outputs.erase(it->first); + it = need_remove_outputs.erase(it); + } + else + it++; + } + if (is_all_noaction && n.attributes() & node_attr_action) is_all_noaction = false; return true; @@ -101,6 +122,115 @@ struct region } }; +/* One node of the search tree built by Region_tree: a region iterator plus parent, first-child and bro links. */ typedef struct Region_node +{ + std::list::iterator node; + Region_node *parent = nullptr; + Region_node *child = nullptr; + Region_node *bro = nullptr; +} Region_node, *Region_Tree; + +/* Search helper used by graph_merger::check_circle: starting from one region it follows region_inputs back to the regions that produce them, records every path that reaches target_region_, and then reports whether any such path crosses a stackvm region that is not all-noaction. */ class Region_tree +{ +public: + Region_tree(std::list &rg) + : regions_(rg) { } + /* Recursively builds the tree rooted at new_node. Reaching target_region_ records a leaf and stops; reaching depth >= 10 sets skip_ and stops; otherwise every region whose outputs feed one of new_node's region_inputs is expanded. Nodes are allocated with new and must be released by the caller through free_tree(). */ Region_node *create_tree(std::list::iterator new_node, int depth) + { + + Region_node *root = create_node(); + root->node = new_node; + + // find a path from itb--> ita + if (new_node == target_region_) + { + leaves_.push_back(root); + return root; + } + + // limit tree depth + if (depth >= 10) + { + skip_ = true; + return root; + } + + for (auto it : new_node->region_inputs) + { + for (auto itb = regions_.begin(); itb != regions_.end(); itb++) + { + if (itb->outputs.contains(it->connection())) + { + if (root->child == nullptr) + { + root->child = create_tree(itb, depth + 1); + root->child->parent = root; + } + else + { + /* NOTE(review): root->bro is overwritten with the new subtree's own bro right after linking, so the subtree then looks reachable only through leaves_ and parent pointers; free_tree(root) would not delete it. Also note the recursion passes depth, not depth + 1. Confirm both are intended. */ root->bro = create_tree(itb, depth); + root->bro->parent = root; + root->bro = root->bro->bro; + } + } + } + } + + return root; + } + + /* Returns false when the depth limit was hit, or when a recorded leaf has an ancestor that is a stackvm region with is_all_noaction == false. Leaves whose parent is start_region_ are skipped. Assumes every leaf has a non-null parent, which holds while the start and target regions differ. */ bool not_have_circle() + { + // if tree depth > 10, ignore merge itb--> ita + if (skip_) + return false; + //
each leaf has only one path to root. + // if all the paths of leaves to root don't have CPU op ,itb can merge to ita. + for (auto it : leaves_) + { + auto condition_ptr = it->parent; + if (condition_ptr->node == start_region_) + continue; + while (condition_ptr != nullptr) + { + if (condition_ptr->node->module_type == runtime::stackvm::stackvm_module_type && !condition_ptr->node->is_all_noaction) + return false; + condition_ptr = condition_ptr->parent; + } + } + return true; + } + + /* Records the search endpoints: start_region_ = itb and target_region_ = ita. Must be called before create_tree(). */ void set_label_region(std::list::iterator ita, std::list::iterator itb) + { + start_region_ = itb; + target_region_ = ita; + } + + /* Recursively deletes root and every node reachable through child and bro links; a null root is a no-op. */ void free_tree(Region_node *root) + { + if (root != nullptr) + { + free_tree(root->child); + free_tree(root->bro); + delete root; + } + } + +private: + /* Allocates a node with all links null. */ Region_node *create_node() + { + Region_Tree node = new Region_node(); + return node; + } + + std::list::iterator start_region_; + std::list::iterator target_region_; + std::vector leaves_; + /* Set by create_tree() when the depth limit is hit. Default-initialised so not_have_circle() never reads an indeterminate value when the limit is not reached. */ bool skip_ = false; + std::list ®ions_; +}; + class graph_merger { public: @@ -133,6 +263,13 @@ class graph_merger for (auto in : node.inputs()) { auto &conn = in->connection()->owner(); + + if (conn.runtime_opcode() == op_constant) + { + last_region = nullptr; + break; + } + auto it = node_to_region_.find(&conn); if (it != node_to_region_.end()) { @@ -181,7 +318,44 @@ class graph_merger changed |= merge_child_region(); changed |= merge_parent_region(); changed |= merge_same_input_region(); + } while (changed); + + do + { + changed = false; + changed |= merge_child_region_stage_2(); + } while (changed); + } + + /* Despite the name, returns true when itb may be merged into ita. That is the case when every consumer of every ita output is one of itb's region inputs, when itb has exactly one region input, or when the Region_tree search from itb towards ita finds no blocking stackvm region. */ bool check_circle(std::list::iterator ita, std::list::iterator itb) + { + // merge directly + bool merge_directly = true; + for (auto it : ita->outputs) + { + if (std::all_of(it->connections().begin(), it->connections().end(), + [&](input_connector *out) { + return itb->region_inputs.contains(out); + })) + continue; + else + merge_directly = false; + } + if (merge_directly) + return true; + + if (itb->region_inputs.size()
== 1) + { + return true; + } + + auto check = std::make_shared(regions_); + check->set_label_region(ita, itb); + auto root = check->create_tree(itb, 0); + auto flag = check->not_have_circle(); + check->free_tree(root); + return flag; } bool merge_child_region() @@ -224,6 +398,47 @@ class graph_merger return ever_changed; } + /* Second merge pass. For each region ita it collects every other region itb that has an input fed by an ita output, that shares ita's module type or is all-noaction, and that passes check_circle(). Pairs in which both regions are stackvm are skipped. The collected regions are merged into ita and erased, and the scan restarts after each successful merge. Returns true when at least one merge happened. */ bool merge_child_region_stage_2() + { + bool ever_changed = false; + bool changed; + do + { + changed = false; + for (auto ita = regions_.begin(); ita != regions_.end(); ++ita) + { + std::vector::iterator> to_be_merge; + for (auto itb = regions_.begin(); itb != regions_.end(); ++itb) + { + // don't merge stackvm region + if (ita == itb + || (ita->module_type == runtime::stackvm::stackvm_module_type + && itb->module_type == runtime::stackvm::stackvm_module_type)) + continue; + + // itb's has inputs connect to ita's output without circle + if ((ita->module_type == itb->module_type || itb->is_all_noaction) + && std::any_of(itb->region_inputs.begin(), itb->region_inputs.end(), [&](input_connector *in) { return ita->outputs.contains(in->connection()); }) + && check_circle(ita, itb)) + to_be_merge.emplace_back(itb); + } + + if (!to_be_merge.empty()) + { + for (auto region : to_be_merge) + { + ita->merge(*region); + regions_.erase(region); + } + + changed = ever_changed = true; + break; + } + } + } while (changed); + return ever_changed; + } + bool merge_parent_region() { bool ever_changed = false; @@ -314,6 +529,8 @@ class graph_merger void add_node_to_region(region ®ion, node &node) { + /* A non-stackvm node overwrites the region's module type before the node is added and mapped to the region. */ if (node.module_type() != runtime::stackvm::stackvm_module_type) + region.module_type = node.module_type(); region.add_node(node); node_to_region_.emplace(&node, ®ion); } @@ -337,7 +554,7 @@ void graph::merge_module_regions() if (region.module_type == runtime::stackvm::stackvm_module_type) continue; - auto split = split_subgraph(region.nodes); + auto split = split_subgraph(region.nodes, true); auto &subg = add_subgraph(std::move(split.subgraph)); auto c =
emplace(subg); c->name(std::string(region.module_type.data()) + "_" + std::to_string(subids[region.module_type.data()]++)); diff --git a/src/ir/ops/CMakeLists.txt b/src/ir/ops/CMakeLists.txt index 82829287c6..f8125e4e57 100644 --- a/src/ir/ops/CMakeLists.txt +++ b/src/ir/ops/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.13) +cmake_minimum_required(VERSION 3.13) target_sources(ir PRIVATE call.cpp @@ -45,4 +45,11 @@ target_sources(ir PRIVATE softmax.cpp ternary.cpp topk.cpp - trilu.cpp) + trilu.cpp + gru.cpp + tflite_detection_postprocess.cpp + gather_elements.cpp + layernorm.cpp + compress.cpp + instancenorm.cpp +) diff --git a/src/ir/ops/compress.cpp b/src/ir/ops/compress.cpp new file mode 100644 index 0000000000..065aad6187 --- /dev/null +++ b/src/ir/ops/compress.cpp @@ -0,0 +1,35 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds a compress node. The axis is stored after normalize_axis(input_shape, axis). Declares two inputs, input (of the given type) and condition (dt_uint8), and one output of the given type and output_shape. */ compress::compress(datatype_t type, shape_t input_shape, shape_t condition_shape, shape_t output_shape, int32_t axis) + : axis_(normalize_axis(input_shape, axis)) +{ + add_input("input", type, input_shape); + add_input("condition", dt_uint8, condition_shape); + + add_output("output", type, output_shape); +} + +/* Property comparison for compress nodes: true when both nodes have the same axis(). The other node is cast with static_cast and not checked here. */ bool compress::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return axis() == r.axis(); +} diff --git a/src/ir/ops/fused_unary.cpp b/src/ir/ops/fused_unary.cpp index e600ef9bb1..56d80b93dd 100644 --- a/src/ir/ops/fused_unary.cpp +++ b/src/ir/ops/fused_unary.cpp @@ -140,6 +140,9 @@ void fused_unary::compile_graph(const std::vector &subgraph, cod case unary_logical_not: builder.emit_logical_not(); break; + case unary_erf: + builder.emit_erf(); + break; default: throw std::invalid_argument("Unsupported unary op for nnil: " + (std::string)magic_enum::enum_name(op.unary.unary_op)); } diff --git a/src/ir/ops/gather_elements.cpp b/src/ir/ops/gather_elements.cpp new file mode 100644 index 0000000000..a678ceac4f --- /dev/null +++ b/src/ir/ops/gather_elements.cpp @@ -0,0 +1,34 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds a gather_elements node. The axis is stored exactly as passed; this constructor does not normalise it. Declares two inputs, input (in_type) and indices (indices_type), and one output of in_type and output_shape. */ gather_elements::gather_elements(datatype_t in_type, datatype_t indices_type, shape_t input_shape, shape_t indices_shape, shape_t output_shape, int32_t axis) + : axis_(axis) +{ + add_input("input", in_type, input_shape); + add_input("indices", indices_type, indices_shape); + add_output("output", in_type, output_shape); +} + +/* Property comparison for gather_elements nodes: true when both nodes have the same axis(). The other node is cast with static_cast and not checked here. */ bool gather_elements::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return axis() == r.axis(); +} diff --git a/src/ir/ops/gru.cpp b/src/ir/ops/gru.cpp new file mode 100644 index 0000000000..334ed8a109 --- /dev/null +++ b/src/ir/ops/gru.cpp @@ -0,0 +1,40 @@ +/* Copyright 2019-2020 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds a gru node. Stores direction, framework and linear_before_reset. Declares five float32 inputs (input, w, r, b, initial_h) and two float32 outputs (output, output_h); initial_h and output_h both use output_h_shape. */ gru::gru(shape_t input_shape, shape_t w_shape, shape_t r_shape, shape_t b_shape, shape_t output_shape, + shape_t output_h_shape, lstm_direction direction, std::string framework, bool linear_before_reset) + : direction_(direction), framework_(framework), linear_before_reset_(linear_before_reset) +{ + add_input("input", dt_float32, input_shape); + add_input("w", dt_float32, w_shape); + add_input("r", dt_float32, r_shape); + add_input("b", dt_float32, b_shape); + add_input("initial_h", dt_float32, output_h_shape); + + add_output("output", dt_float32, output_shape); + add_output("output_h", dt_float32, output_h_shape); +} + +/* Property comparison for gru nodes: true when direction(), framework() and linear_before_reset() all match. The other node is cast with static_cast and not checked here. */ bool gru::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return direction() == r.direction() && framework() == r.framework() && linear_before_reset() == r.linear_before_reset(); +} diff --git a/src/ir/ops/instancenorm.cpp b/src/ir/ops/instancenorm.cpp new file mode 100644 index 0000000000..7d20bb0add --- /dev/null +++ b/src/ir/ops/instancenorm.cpp @@ -0,0 +1,35 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds an instancenorm node. Stores epsilon. Declares three inputs of input_type: input with input_shape, and scale and bias whose shape comes from get_instancenorm_const_shape(input_shape). The single output has the same type and shape as input. */ instancenorm::instancenorm(datatype_t input_type, shape_t input_shape, float epsilon) + : epsilon_(epsilon) +{ + add_input("input", input_type, input_shape); + add_input("scale", input_type, get_instancenorm_const_shape(input_shape)); + add_input("bias", input_type, get_instancenorm_const_shape(input_shape)); + add_output("output", input_type, input_shape); +} + +/* Property comparison for instancenorm nodes: true when epsilon() is exactly equal (float ==). The other node is cast with static_cast and not checked here. */ bool instancenorm::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return epsilon() == r.epsilon(); +} diff --git a/src/ir/ops/layernorm.cpp b/src/ir/ops/layernorm.cpp new file mode 100644 index 0000000000..373e117eb9 --- /dev/null +++ b/src/ir/ops/layernorm.cpp @@ -0,0 +1,36 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include "nncase/ir/ir_types.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds a layernorm node. Stores the axis after normalize_axis(input_shape, axis) together with epsilon. Declares three inputs of input_type: input with input_shape, and scale and bias whose shape is the dimensions of input_shape from axis_ to the end. The single output has the same type and shape as input. */ layernorm::layernorm(datatype_t input_type, shape_t input_shape, int32_t axis, float epsilon) + : axis_(normalize_axis(input_shape, axis)), epsilon_(epsilon) +{ + add_input("input", input_type, input_shape); + add_input("scale", input_type, shape_t { input_shape.begin() + axis_, input_shape.end() }); + add_input("bias", input_type, shape_t { input_shape.begin() + axis_, input_shape.end() }); + add_output("output", input_type, input_shape); +} + +/* Property comparison for layernorm nodes: true when axis() matches and epsilon() is exactly equal (float ==). The other node is cast with static_cast and not checked here. */ bool layernorm::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return axis() == r.axis() && epsilon() == r.epsilon(); +} diff --git a/src/ir/ops/tflite_detection_postprocess.cpp b/src/ir/ops/tflite_detection_postprocess.cpp new file mode 100644 index 0000000000..78b91ff94f --- /dev/null +++ b/src/ir/ops/tflite_detection_postprocess.cpp @@ -0,0 +1,57 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; + +/* Builds a tflite_detection_postprocess node. Stores the eleven post-processing parameters as given. Declares three float32 inputs (boxes, scores, anchors) and four float32 outputs (output_locations, output_classes, output_scores, output_num_detections) using output_shape_0 to output_shape_3 in that order. */ tflite_detection_postprocess::tflite_detection_postprocess( + shape_t boxes_shape, shape_t scores_shape, shape_t anchors_shape, + shape_t output_shape_0, shape_t output_shape_1, shape_t output_shape_2, shape_t output_shape_3, + int32_t max_detections, + int32_t max_classes_per_detection, + int32_t detections_per_class, + bool use_regular_non_max_suppression, + float nms_score_threshold, + float nms_iou_threshold, + int32_t num_classes, + float y_scale, + float x_scale, + float h_scale, + float w_scale) + : max_detections_(max_detections), max_classes_per_detection_(max_classes_per_detection), detections_per_class_(detections_per_class), use_regular_non_max_suppression_(use_regular_non_max_suppression), nms_score_threshold_(nms_score_threshold), nms_iou_threshold_(nms_iou_threshold), num_classes_(num_classes), y_scale_(y_scale), x_scale_(x_scale), h_scale_(h_scale), w_scale_(w_scale) +{ + add_input("boxes", dt_float32, boxes_shape); + add_input("scores", dt_float32, scores_shape); + add_input("anchors", dt_float32, anchors_shape); + add_output("output_locations", dt_float32, output_shape_0); + add_output("output_classes", dt_float32, output_shape_1); + add_output("output_scores", dt_float32, output_shape_2); + add_output("output_num_detections", dt_float32, output_shape_3); +} + +/* Property comparison: true when all eleven stored parameters equal those of other; the float members are compared with exact ==. The other node is cast with static_cast and not checked here. */ bool tflite_detection_postprocess::properties_equal(node &other) const +{ + auto &r = static_cast(other); + return max_detections() == r.max_detections() + && max_classes_per_detection() == r.max_classes_per_detection() + && detections_per_class() == r.detections_per_class() + && use_regular_non_max_suppression() == r.use_regular_non_max_suppression() + && nms_score_threshold() == r.nms_score_threshold() + && nms_iou_threshold() == r.nms_iou_threshold() && num_classes() == r.num_classes() + && y_scale() == r.y_scale() && x_scale() == r.x_scale() && h_scale() == r.h_scale() && w_scale() ==
r.w_scale(); +} diff --git a/src/kernels/cpu/optimized/CMakeLists.txt b/src/kernels/cpu/optimized/CMakeLists.txt index 80b4ff608b..e1c8569f8f 100644 --- a/src/kernels/cpu/optimized/CMakeLists.txt +++ b/src/kernels/cpu/optimized/CMakeLists.txt @@ -1,24 +1,28 @@ -cmake_minimum_required (VERSION 3.13) +cmake_minimum_required(VERSION 3.13) -if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64") +if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64") set(ARCH riscv64) else() set(ARCH x86_64) endif() set(SRCS convolution.cpp - concat.cpp - slice.cpp - copy.cpp - dequantize.cpp - resize_image.cpp - gather.cpp - gather_nd.cpp - quantize.cpp - onehot.cpp - ${ARCH}/binary.cpp - ${ARCH}/unary.cpp - ${ARCH}/matmul.cpp - ${ARCH}/sigmoid.cpp - ${ARCH}/softmax.cpp) -target_sources(kernels PRIVATE ${SRCS}) \ No newline at end of file + concat.cpp + slice.cpp + copy.cpp + dequantize.cpp + resize_image.cpp + gather.cpp + gather_nd.cpp + quantize.cpp + onehot.cpp + ${ARCH}/binary.cpp + ${ARCH}/unary.cpp + ${ARCH}/matmul.cpp + ${ARCH}/sigmoid.cpp + ${ARCH}/softmax.cpp + ${ARCH}/layernorm.cpp + ${ARCH}/ternary.cpp + ${ARCH}/reduce.cpp + ${ARCH}/instancenorm.cpp) +target_sources(kernels PRIVATE ${SRCS}) diff --git a/src/kernels/cpu/optimized/riscv64/instancenorm.cpp b/src/kernels/cpu/optimized/riscv64/instancenorm.cpp new file mode 100644 index 0000000000..d19dc3a894 --- /dev/null +++ b/src/kernels/cpu/optimized/riscv64/instancenorm.cpp @@ -0,0 +1,172 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::optimized; + +// #if __riscv_vector +// #define RVV_LMUL 8 +// #define _STR(x) #x +// #define STR(x) _STR(x) +// #define _CONNECT(a, b) a##b +// #define CONNECT(a, b) _CONNECT(a, b) +// #define RVVSETVLI2(evl, avl, elen) "vsetvli " STR(evl) "," STR(avl) "," STR(elen) "," STR(CONNECT(m, RVV_LMUL)) ";" + +// static float get_mean(const float *data, int n) +// { +// float ret; +// __asm volatile( +// "mv a0, %[avl];" +// "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;" +// "XXXXXX%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v8, (a1);" +// "sub a0,a0, t0;" +// "slli t1, t0, 2;" +// "vfredsum.vs v0,v8,v0;" + +// "add a1, a1, t1;" +// "bnez a0, XXXXXX%=;" +// "vfmv.f.s f0, v0;" +// "fcvt.s.w f1, %[avl];" +// "fdiv.s %[ret], f0, f1;" + +// : [ret] "=f"(ret) +// : [avl] "r"(n), [input_ptr1] "r"(data) +// : "t0", "t1", "a0", "a1", "f0", "f1", "v0", "v8"); +// return ret; +// } + +// static float get_var(const float *data, int n, float mean) +// { +// float ret; +// __asm volatile( + +// "mv a0, %[avl];" +// "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;" + +// "vle32.v v8, (a1);" +// "sub a0,a0, t0;" +// "slli t1, t0, 2;" +// "vfsub.vf v8, v8, %[mean];" +// "vfmul.vv v8, v8, v8;" +// "add a1, a1, t1;" +// "beqz a0, X1_END%=;" +// "X1_STRAT%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);" +// "sub a0,a0, t0;" +// "slli t1, t0, 2;" +// "vfsub.vf v16, v16, %[mean];" +// "vfmacc.vv v8, v16, v16;" + +// "add a1, a1, t1;" +// "bnez a0, X1_STRAT%=;" + +// "X1_END%=:" + +// "vfredsum.vs v0,v8,v0;" + +// "vfmv.f.s f0, v0;" +// "fcvt.s.w f1, %[avl];" +// "fdiv.s %[ret], f0, f1;" + +// : [ret] "=f"(ret) +// : [avl] "r"(n), 
[input_ptr1] "r"(data), [mean] "f"(mean) +// : "t0", "t1", "a0", "a1", "v0", "v8", "v16", "f0", "f1"); +// return ret; +// } + +// static void layer_norm_update1(const float *data, float *out, int len, float mean, float var, float *r1, float e, float *b) +// { +// float r_sqrt = 1.0f / sqrtf(var + e); +// __asm volatile( +// "mv a0, %[avl];" +// "mv a1, %[input_ptr1];" +// "mv a2, %[out];" +// "mv a3, %[scale];" +// "mv a4, %[b];" +// "layer_norm_update1%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);" +// "vle32.v v8, (a3);" +// "sub a0,a0, t0;" +// "slli t1, t0, 2;" +// "vfsub.vf v16, v16, %[mean];" +// "add a1, a1, t1;" +// "vfmul.vf v16, v16, %[r_sqrt];" + +// "add a3, a3, t1;" +// "vfmul.vv v16, v8, v16;" + +// "vle32.v v8, (a4);" +// "vfadd.vv v16, v16, v8;" +// "add a4, a4, t1;" + +// "vse32.v v16, (a2);" +// "add a2, a2, t1;" +// "bnez a0, layer_norm_update1%=;" + +// : +// : [avl] "r"(len), [input_ptr1] "r"(data), [mean] "f"(mean), [r_sqrt] "f"(r_sqrt), [b] "r"(b), [out] "r"(out), [scale] "r"(r1) +// : "t0", "t1", "a0", "a1", "a2", "v0", "v16", "a3", "a4", "v8"); +// } + +// result layernorm_impl(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) +// { +// if (axis < 0) +// { +// axis = (int)in_shape.size() + axis; +// } +// auto outer_size = 1; +// auto inner_size = 1; +// for (auto i = 0; i < axis; i++) +// outer_size *= in_shape[i]; +// for (auto i = axis; i < static_cast(in_shape.size()); i++) +// inner_size *= in_shape[i]; + +// for (int32_t batch = 0; batch < outer_size; batch++) +// { +// const float *src = input + batch * inner_size; +// float *dest = output + batch * inner_size; + +// float mean = get_mean(src, inner_size); + +// float var_data = get_var(src, inner_size, mean); + +// layer_norm_update1(src, dest, inner_size, mean, var_data, scale, epsilon, bias); +// } +// return ok(); +// } +// #endif + +/* float specialisation of optimized::instancenorm. The vector code path is commented out, so this always forwards its arguments unchanged to cpu::reference::instancenorm and returns that result. */ template <> +result optimized::instancenorm(const float *input, float *output, float
*scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept +{ + // #if __riscv_vector + // return instancenorm_impl(input, output, scale, bias, in_shape, epsilon); + // #else + return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon); + // #endif +} + +/* Generic optimized::instancenorm: forwards its arguments unchanged to cpu::reference::instancenorm and returns that result. */ template +result optimized::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept +{ + return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon); +} diff --git a/src/kernels/cpu/optimized/riscv64/layernorm.cpp b/src/kernels/cpu/optimized/riscv64/layernorm.cpp new file mode 100644 index 0000000000..2bde01bb03 --- /dev/null +++ b/src/kernels/cpu/optimized/riscv64/layernorm.cpp @@ -0,0 +1,173 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+#if __riscv_vector
+#define RVV_LMUL 8
+#define _STR(x) #x
+#define STR(x) _STR(x)
+#define _CONNECT(a, b) a##b
+#define CONNECT(a, b) _CONNECT(a, b)
+// Expands to the string "vsetvli evl,avl,elen,m<RVV_LMUL>;" for the inline-asm blocks below.
+#define RVVSETVLI2(evl, avl, elen) "vsetvli " STR(evl) "," STR(avl) "," STR(elen) "," STR(CONNECT(m, RVV_LMUL)) ";"
+
+// Returns sum(data[0..n)) / n. Each strip of the input is folded into v0[0] with vfredsum.
+static float get_mean(const float *data, int n)
+{
+    float ret;
+    __asm volatile(
+        "mv a0, %[avl];"
+        "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;"
+        "XXXXXX%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v8, (a1);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfredsum.vs v0,v8,v0;"
+
+        "add a1, a1, t1;"
+        "bnez a0, XXXXXX%=;"
+        "vfmv.f.s f0, v0;"
+        "fcvt.s.w f1, %[avl];"
+        "fdiv.s %[ret], f0, f1;"
+
+        : [ret] "=f"(ret)
+        : [avl] "r"(n), [input_ptr1] "r"(data)
+        // "memory": the asm reads *data, which is not listed as an operand.
+        : "t0", "t1", "a0", "a1", "f0", "f1", "v0", "v8", "memory");
+    return ret;
+}
+
+// Returns sum((data[i] - mean)^2) / n for i in [0, n).
+// Squared differences are accumulated lane-wise in v8 and reduced once at the end.
+static float get_var(const float *data, int n, float mean)
+{
+    float ret;
+    __asm volatile(
+
+        "mv a0, %[avl];"
+        "mv a1, %[input_ptr1];" RVVSETVLI2(t0, a0, e32) "vmv.s.x v0, x0;"
+        // Remember the vl of the first strip: v8 holds that many partial sums.
+        "mv t2, t0;"
+
+        "vle32.v v8, (a1);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfsub.vf v8, v8, %[mean];"
+        "vfmul.vv v8, v8, v8;"
+        "add a1, a1, t1;"
+        "beqz a0, X1_END%=;"
+        "X1_STRAT%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfsub.vf v16, v16, %[mean];"
+        "vfmacc.vv v8, v16, v16;"
+
+        "add a1, a1, t1;"
+        "bnez a0, X1_STRAT%=;"
+
+        "X1_END%=:"
+        // A shorter final strip leaves vl below the first strip's vl; restore it so that
+        // every lane of v8 takes part in the reduction.
+        // NOTE(review): lanes past the final strip must survive vfmacc, i.e. this relies on the
+        // tail-undisturbed policy used when vsetvli omits ta/tu - confirm for the target toolchain.
+        RVVSETVLI2(t0, t2, e32) "vfredsum.vs v0,v8,v0;"
+
+        "vfmv.f.s f0, v0;"
+        "fcvt.s.w f1, %[avl];"
+        "fdiv.s %[ret], f0, f1;"
+
+        : [ret] "=f"(ret)
+        : [avl] "r"(n), [input_ptr1] "r"(data), [mean] "f"(mean)
+        // "memory": the asm reads *data, which is not listed as an operand.
+        : "t0", "t1", "t2", "a0", "a1", "v0", "v8", "v16", "f0", "f1", "memory");
+    return ret;
+}
+
+// Writes out[i] = (data[i] - mean) / sqrt(var + e) * r1[i] + b[i] for i in [0, len).
+static void layer_norm_update1(const float *data, float *out, int len, float mean, float var, float *r1, float e, float *b)
+{
+    float r_sqrt = 1.0f / sqrtf(var + e);
+    __asm volatile(
+        "mv a0, %[avl];"
+        "mv a1, %[input_ptr1];"
+        "mv a2, %[out];"
+        "mv a3, %[scale];"
+        "mv a4, %[b];"
+        "layer_norm_update1%=:;" RVVSETVLI2(t0, a0, e32) "vle32.v v16, (a1);"
+        "vle32.v v8, (a3);"
+        "sub a0,a0, t0;"
+        "slli t1, t0, 2;"
+        "vfsub.vf v16, v16, %[mean];"
+        "add a1, a1, t1;"
+        "vfmul.vf v16, v16, %[r_sqrt];"
+
+        "add a3, a3, t1;"
+        "vfmul.vv v16, v8, v16;"
+
+        "vle32.v v8, (a4);"
+        "vfadd.vv v16, v16, v8;"
+        "add a4, a4, t1;"
+
+        "vse32.v v16, (a2);"
+        "add a2, a2, t1;"
+        "bnez a0, layer_norm_update1%=;"
+
+        :
+        : [avl] "r"(len), [input_ptr1] "r"(data), [mean] "f"(mean), [r_sqrt] "f"(r_sqrt), [b] "r"(b), [out] "r"(out), [scale] "r"(r1)
+        // "memory": the asm reads data/scale/bias and stores through out; none are operands.
+        : "t0", "t1", "a0", "a1", "a2", "v0", "v16", "a3", "a4", "v8", "memory");
+}
+
+// Layer normalization over the dimensions [axis, rank) of a linearly indexed tensor.
+// The input is treated as outer_size slices of inner_size elements; every slice is normalized
+// with its own mean/variance, while scale and bias are indexed by the position inside a slice.
+// A negative axis counts from the end.
+result<void> layernorm_impl(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon)
+{
+    if (axis < 0)
+    {
+        axis = (int)in_shape.size() + axis;
+    }
+    auto outer_size = 1;
+    auto inner_size = 1;
+    for (auto i = 0; i < axis; i++)
+        outer_size *= in_shape[i];
+    for (auto i = axis; i < static_cast<int32_t>(in_shape.size()); i++)
+        inner_size *= in_shape[i];
+
+    for (int32_t batch = 0; batch < outer_size; batch++)
+    {
+        const float *src = input + batch * inner_size;
+        float *dest = output + batch * inner_size;
+
+        float mean = get_mean(src, inner_size);
+
+        float var_data = get_var(src, inner_size, mean);
+
+        layer_norm_update1(src, dest, inner_size, mean, var_data, scale, epsilon, bias);
+    }
+    return ok();
+}
+#endif
+
+// float specialization: RVV kernel when the vector extension is available, reference kernel otherwise.
+template <>
+result<void> optimized::layernorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+#if __riscv_vector
+    return layernorm_impl(input, output, scale, bias, in_shape, axis, epsilon);
+#else
+    return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon);
+#endif
+}
+
+// All other element types are forwarded to the reference kernel.
+template <typename T>
+result<void> optimized::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept
+{
+    return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon);
+}
diff --git a/src/kernels/cpu/optimized/riscv64/reduce.cpp b/src/kernels/cpu/optimized/riscv64/reduce.cpp
new file mode 100644
index 0000000000..ef9d65b080
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/reduce.cpp
@@ -0,0 +1,132 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+result<void> reduce_mean_rvv(NNCASE_UNUSED float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    NNCASE_UNUSED const runtime_shape_t &in_strides, NNCASE_UNUSED const runtime_shape_t &out_shape, NNCASE_UNUSED const runtime_shape_t &out_strides) noexcept;
+
+// True when axis == { first, first + 1, ..., first + axis.size() - 1 }.
+static bool is_consecutive_axes(const runtime_shape_t &axis, size_t first)
+{
+    for (size_t i = 0; i < axis.size(); ++i)
+    {
+        if (axis[i] != first + i)
+            return false;
+    }
+    return true;
+}
+
+// float specialization: reduce_mean over a leading or trailing run of axes goes to
+// reduce_mean_rvv; every other op or axis set is forwarded to the reference kernel.
+template <>
+result<void> optimized::reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
+{
+    // reduce_mean_rvv only understands axis sets that are exactly the first or exactly the
+    // last axis.size() dimensions; an empty or otherwise shaped axis set must not reach it.
+    const size_t ndim = in_shape.size();
+    const bool rvv_supported = axis.size() != 0 && axis.size() <= ndim
+        && (is_consecutive_axes(axis, 0) || is_consecutive_axes(axis, ndim - axis.size()));
+
+    if (reduce_mean == op && rvv_supported)
+    {
+        auto out_shape = kernels::detail::get_reduced_shape(in_shape, axis, keep_dims);
+        return reduce_mean_rvv(init_value, input, output, in_shape, axis, in_strides, out_shape, out_strides);
+    }
+    else
+    {
+        return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+    }
+}
+
+template result<void> optimized::reduce(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+
+template result<void> optimized::reduce(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept;
+
+// All other element types are forwarded to the reference kernel.
+template <typename T>
+result<void> optimized::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept
+{
+    return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context);
+}
+
+// Mean over either the first axis.size() dimensions (axis[0] == 0) or the last axis.size()
+// dimensions of a linearly indexed float tensor. `axis` must be non-empty and must be one of
+// those two runs; the function does not verify this itself.
+// NOTE(review): strides are ignored, so input and output are presumably dense - confirm with callers.
+result<void> reduce_mean_rvv(NNCASE_UNUSED float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis,
+    NNCASE_UNUSED const runtime_shape_t &in_strides, NNCASE_UNUSED const runtime_shape_t &out_shape, NNCASE_UNUSED const runtime_shape_t &out_strides) noexcept
+{
+    if (axis[0] == 0)
+    {
+        // Leading axes are reduced: the input is an inner_n x out_n matrix averaged column-wise.
+        size_t out_n = 1;
+        size_t inner_n = 1;
+        for (size_t i = 0; i < axis.size(); ++i)
+        {
+            inner_n *= in_shape[i];
+        }
+        for (size_t i = 0; i < in_shape.size() - axis.size(); ++i)
+        {
+            out_n *= in_shape[in_shape.size() - i - 1];
+        }
+#if (!__riscv_vector)
+        for (size_t i = 0; i < out_n; ++i)
+        {
+            float sum = 0.0f;
+            for (size_t j = 0; j < inner_n; ++j)
+            {
+                sum += input[j * out_n + i];
+            }
+            output[i] = sum / inner_n;
+        }
+#else
+        size_t vl;
+        float lr = 1.0f / inner_n;
+        size_t i = 0;
+        size_t n2 = out_n;
+        while (n2)
+        {
+            // Each strip handles vl adjacent output columns at once.
+            vl = vsetvl_e32m8(n2);
+            vfloat32m8_t _p = vle32_v_f32m8(input + 0 * out_n + i, vl);
+
+            for (size_t j = 1; j < inner_n; ++j)
+            {
+                vfloat32m8_t _p1 = vle32_v_f32m8(input + j * out_n + i, vl);
+                _p = vfadd_vv_f32m8(_p, _p1, vl);
+            }
+            _p = vfmul_vf_f32m8(_p, lr, vl);
+            vse32_v_f32m8(output + i, _p, vl);
+            i += vl;
+            n2 -= vl;
+        }
+#endif
+    }
+    else
+    {
+        // Trailing axes are reduced: the kept (leading) dimensions give the number of
+        // outputs, the reduced (trailing) dimensions give the elements averaged per output.
+        const size_t kept_dims = in_shape.size() - axis.size();
+        size_t out_n = 1;
+        size_t inner_n = 1;
+        for (size_t i = 0; i < kept_dims; ++i)
+        {
+            out_n *= in_shape[i];
+        }
+        for (size_t i = kept_dims; i < in_shape.size(); ++i)
+        {
+            inner_n *= in_shape[i];
+        }
+        for (size_t i = 0; i < out_n; ++i)
+        {
+            float sum = 0.0f;
+            for (size_t j = 0; j < inner_n; ++j)
+            {
+                sum += input[i * inner_n + j];
+            }
+            output[i] = sum / inner_n;
+        }
+    }
+    return ok();
+}
\ No newline at end of file
diff --git
a/src/kernels/cpu/optimized/riscv64/softmax.cpp b/src/kernels/cpu/optimized/riscv64/softmax.cpp
index b6f67bb805..0f96979ca3 100644
--- a/src/kernels/cpu/optimized/riscv64/softmax.cpp
+++ b/src/kernels/cpu/optimized/riscv64/softmax.cpp
@@ -32,6 +32,215 @@
 namespace
 {
 #if __riscv_vector
+// Fast approximate exp(x): clamps x to >= -88, then forms the float whose bit pattern is
+// int(x * 12102203 + 1065414017), i.e. x * 2^23 / ln(2) plus an offset close to 127 << 23.
+static __inline __attribute__((__always_inline__))
+vfloat32m8_t
+exp_ps2(vfloat32m8_t _p, size_t vl)
+{
+    _p = vfmax_vf_f32m8(_p, -88.0f, vl);
+    _p = vfmul_vf_f32m8(_p, 12102203.0f, vl);
+    _p = vfadd_vf_f32m8(_p, 1065414017, vl);
+
+    vint32m8_t p2 = vfcvt_x_f_v_i32m8(_p, vl);
+    _p = vreinterpret_v_i32m8_f32m8(p2);
+    return _p;
+}
+
+// Same approximation as exp_ps2 with caller-supplied constants: clamp to >= c0, then
+// reinterpret int(_p * c1 + c2) as float (one fused multiply-add instead of mul + add).
+vfloat32m8_t exp_ps2_opt(vfloat32m8_t _p, const float c0, const float c1, const float c2, size_t vl)
+{
+    _p = vfmax_vf_f32m8(_p, c0, vl);
+    _p = vfmadd_vf_f32m8(_p, c1, vfmv_v_f_f32m8(c2, vl), vl);
+
+    vint32m8_t p2 = vfcvt_x_f_v_i32m8(_p, vl);
+    _p = vreinterpret_v_i32m8_f32m8(p2);
+    return _p;
+}
+
+// Softmax over `axis` of a linearly indexed tensor: out = exp((x - max) * beta) / sum(exp(...)).
+// The last-axis path uses the fast exp_ps2_opt approximation; other axes use exp_ps.
+// NOTE(review): the fast path's underflow clamp is only meaningful for beta > 0.
+result<void> optimized_softmax_impl_opt(const float *input, float *output, const runtime_shape_t &in_shape, int32_t axis, float beta) noexcept
+{
+    size_t ndim = in_shape.size();
+    size_t positive_axis = axis < 0 ? ndim + axis : axis;
+    size_t axis_dim = in_shape[positive_axis];
+
+    size_t out_side = 1;
+    for (size_t i = 0; i < positive_axis; i++)
+        out_side *= in_shape[i];
+
+    size_t in_side = 1;
+    for (size_t i = positive_axis + 1; i < ndim; i++)
+        in_side *= in_shape[i];
+    // exp_ps2_opt evaluates reinterpret(int(d * c1 + c2)) with d clamped to >= c0, so for
+    // exp(beta * d) only the slope is scaled by beta; the bias stays unscaled and the
+    // clamp keeps beta * d >= -88 (no lower clamp is applied when beta <= 0).
+    float c0 = beta > 0.f ? -88.0f / beta : std::numeric_limits<float>::lowest();
+    float c1 = 12102203.0f * beta;
+    float c2 = 1065414017.0f;
+
+    // axis == -1
+    if (positive_axis == (ndim - 1))
+    {
+        const float *ptr_input = input;
+        float *ptr_output = output;
+        for (size_t i = 0; i < out_side; i++)
+        {
+            auto n = axis_dim;
+            const float *ptr_input_vl = ptr_input;
+            float *ptr_output_vl = ptr_output;
+
+            // max
+            float max = std::numeric_limits<float>::lowest();
+            auto s = vfmv_v_f_f32m1(max, vsetvl_e32m8(n));
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v = vle32_v_f32m8(ptr_input_vl, vl);
+                s = vfredmax_vs_f32m8_f32m1(s, v, s, vl);
+                ptr_input_vl += vl;
+                n -= vl;
+            }
+            max = vfmv_f_s_f32m1_f32(s);
+
+            // exp((x - max) * beta) and sum(exp)
+            float sum = 0.f;
+            ptr_input_vl = ptr_input;
+            n = axis_dim;
+            s = vfmv_v_f_f32m1(sum, vsetvl_e32m8(n));
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                auto v_out = exp_ps2_opt(vfsub_vf_f32m8(v_in, max, vl), c0, c1, c2, vl);
+                s = vfredusum_vs_f32m8_f32m1(s, v_out, s, vl);
+
+                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                ptr_input_vl += vl;
+                ptr_output_vl += vl;
+                n -= vl;
+            }
+            sum = vfmv_f_s_f32m1_f32(s);
+
+            // div
+            ptr_input_vl = ptr_input;
+            ptr_output_vl = ptr_output;
+            n = axis_dim;
+            sum = 1.0f / sum;
+            while (n)
+            {
+                auto vl = vsetvl_e32m8(n);
+                auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
+                v_out = vfmul_vf_f32m8(v_out, sum, vl);
+
+                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                ptr_output_vl += vl;
+                n -= vl;
+            }
+
+            ptr_input += axis_dim;
+            ptr_output += axis_dim;
+        }
+    }
+    else
+    {
+        runtime_shape_t axes { positive_axis };
+        auto reduced_shape = kernels::detail::get_reduced_shape(in_shape, axes, true);
+        auto reduced_size = compute_size(reduced_shape);
+        std::vector<float> max(reduced_size, std::numeric_limits<float>::lowest());
+        std::vector<float> sum(reduced_size, 0.f);
+
+        for (size_t i = 0; i < out_side; i++)
+        {
+            const float *ptr_input = input + i * axis_dim * in_side;
+            const float *ptr_input_vl = ptr_input;
+
+            float *ptr_output = output + i * axis_dim * in_side;
+            float *ptr_output_vl = ptr_output;
+
+            float *ptr_max = max.data() + i * in_side;
+            float *ptr_max_vl = ptr_max;
+
+            float *ptr_sum = sum.data() + i * in_side;
+            float *ptr_sum_vl = ptr_sum;
+
+            // max
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_max_vl = ptr_max;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                    auto v_max = vle32_v_f32m8(ptr_max_vl, vl);
+
+                    v_max = vfmax_vv_f32m8(v_in, v_max, vl);
+                    vse32_v_f32m8(ptr_max_vl, v_max, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_max_vl += vl;
+                    n -= vl;
+                }
+            }
+
+            // exp((x - max) * beta) and sum(exp)
+            ptr_input_vl = ptr_input;
+            ptr_output_vl = ptr_output;
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_max_vl = ptr_max;
+                ptr_sum_vl = ptr_sum;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+                    auto v_max = vle32_v_f32m8(ptr_max_vl, vl);
+                    auto v_sum = vle32_v_f32m8(ptr_sum_vl, vl);
+
+                    auto v_out = exp_ps(vfmul_vf_f32m8(vfsub_vv_f32m8(v_in, v_max, vl), beta, vl), vl);
+                    vse32_v_f32m8(ptr_output_vl, v_out, vl);
+
+                    v_sum = vfadd_vv_f32m8(v_sum, v_out, vl);
+                    vse32_v_f32m8(ptr_sum_vl, v_sum, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_output_vl += vl;
+                    ptr_max_vl += vl;
+                    ptr_sum_vl += vl;
+                    n -= vl;
+                }
+            }
+
+            // div
+            ptr_output_vl = ptr_output;
+            for (size_t j = 0; j < axis_dim; j++)
+            {
+                ptr_sum_vl = ptr_sum;
+                auto n = in_side;
+                while (n)
+                {
+                    auto vl = vsetvl_e32m8(n);
+                    auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
+                    auto v_sum = vle32_v_f32m8(ptr_sum_vl, vl);
+
+                    v_out = vfdiv_vv_f32m8(v_out, v_sum, vl);
+                    vse32_v_f32m8(ptr_output_vl, v_out, vl);
+
+                    ptr_output_vl += vl;
+                    ptr_sum_vl += vl;
+                    n -= vl;
+                }
+            }
+        }
+    }
+    return ok();
+}
+
 result<void> optimized_softmax_impl(const float *input, float *output, const runtime_shape_t &in_shape, int32_t axis, float beta) noexcept
 {
     size_t ndim = in_shape.size();
@@ -96,11 +305,12 @@ result optimized_softmax_impl(const float *input, float *output, const run
             ptr_input_vl = ptr_input;
             ptr_output_vl = ptr_output;
             n = axis_dim;
+            sum = 1.0f / sum;
             while (n)
             {
                 auto vl = vsetvl_e32m8(n);
                 auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
-                v_out = vfdiv_vf_f32m8(v_out, sum, vl);
+                v_out = vfmul_vf_f32m8(v_out, sum, vl);
                 vse32_v_f32m8(ptr_output_vl, v_out, vl);
                 ptr_output_vl += vl;
                 n -= vl;
@@ -205,6 +415,7 @@ result optimized_softmax_impl(const float *input, float *output, const run
     }
     return ok();
 }
+
 #endif
 
 }
diff --git a/src/kernels/cpu/optimized/riscv64/ternary.cpp b/src/kernels/cpu/optimized/riscv64/ternary.cpp
new file mode 100644
index 0000000000..59ec58170b
--- /dev/null
+++ b/src/kernels/cpu/optimized/riscv64/ternary.cpp
@@ -0,0 +1,165 @@
+/* Copyright 2019-2021 Canaan Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+using namespace nncase;
+using namespace nncase::runtime;
+using namespace nncase::kernels;
+using namespace nncase::kernels::cpu;
+using namespace nncase::kernels::cpu::optimized;
+
+#if __riscv_vector
+
+// Element-wise select: out[i] = mask != 0 ? b : c.
+// c and out are streamed linearly for out_len elements, while the mask (input_a) is re-read
+// from its start every input_a_len elements. b is read the same way as the mask when
+// input_b_len == input_a_len, and as the single value b[0] otherwise.
+// Preconditions (not checked here): input_a_len > 0, out_len a positive multiple of
+// input_a_len, and c holding out_len elements.
+// NOTE(review): the mask is compared as raw integer bits (vmsne.vx), so -0.0f counts as
+// "true" - confirm this matches the reference kernel.
+void ternary_vec(const float *input_a, int input_a_len, const float *input_b, int input_b_len, const float *input_c,
+    [[maybe_unused]] int input_c_len, float *out, int out_len)
+{
+    __asm volatile(
+
+        "div a4, %[dst_len], %[mask_len];"
+        "mv a2, %[c];"
+        "mv a3, %[dst];"
+
+        "beq %[mask_len], %[b_len], B_IS_VECTOR%=;"
+
+        "flw ft0, (%[b]);"
+
+        "TERNARY_RVV%=:;"
+
+        "mv a0, %[mask_len];"
+        "mv a1, %[mask];"
+
+        "XXXXXX%=:"
+        "vsetvli t0, a0, e32, m8;"
+        "vle32.v v8, (a1);"
+        "vle32.v v16,(a2);"
+        "vmsne.vx v0, v8, x0;"
+        "vfmerge.vfm v8, v16, ft0, v0;"
+        "vse32.v v8, (a3);"
+
+        "slli t1, t0, 2;"
+        "sub a0, a0, t0; "
+        "add a1, a1, t1;"
+        "add a2, a2, t1;"
+        "add a3, a3, t1;"
+        "bnez a0, XXXXXX%=;"
+
+        "addi a4, a4, -1;"
+        "bnez a4, TERNARY_RVV%=;"
+        "j END%=;"
+
+        //////////////////////////////////////
+        "B_IS_VECTOR%=:;"
+        "TERNARY_RVV2%=:;"
+
+        "mv a0, %[mask_len];"
+        "mv a1, %[mask];"
+        "mv a5, %[b];"
+
+        "XXXXXX2%=:"
+        "vsetvli t0, a0, e32, m8;"
+        "vle32.v v8, (a1);"
+        "vle32.v v16,(a2);"
+        "vle32.v v24, (a5);"
+        "vmsne.vx v0, v8, x0;"
+        "vmerge.vvm v8, v16, v24, v0;"
+        "vse32.v v8, (a3);"
+
+        "slli t1, t0, 2;"
+        "sub a0, a0, t0; "
+        "add a1, a1, t1;"
+        "add a2, a2, t1;"
+        "add a3, a3, t1;"
+        "add a5, a5, t1;"
+        "bnez a0, XXXXXX2%=;"
+
+        "addi a4, a4, -1;"
+        "bnez a4, TERNARY_RVV2%=;"
+
+        "END%=:;"
+
+        :
+        : [mask] "r"(input_a), [mask_len] "r"(input_a_len), [b] "r"(input_b), [b_len] "r"(input_b_len), [c] "r"(input_c), [dst] "r"(out), [dst_len] "r"(out_len)
+        // "memory": the asm reads a/b/c and stores through out; none of them are operands.
+        : "t0", "t1", "a0", "a1", "a2", "a3", "a4", "a5", "ft0", "v0", "v8", "v16", "v24", "memory");
+}
+
+// True when `shape`, right-aligned against `out_shape`, equals out_shape on a trailing run of
+// dimensions and is 1 (or absent) on every dimension before that run, i.e. the operand is
+// one contiguous block that repeats over the leading output dimensions.
+static bool covers_trailing_dims(const runtime_shape_t &shape, const runtime_shape_t &out_shape)
+{
+    if (shape.size() > out_shape.size())
+        return false;
+    const size_t pad = out_shape.size() - shape.size();
+    bool in_tail = true;
+    for (size_t i = out_shape.size(); i-- > 0;)
+    {
+        const size_t dim = i >= pad ? shape[i - pad] : 1;
+        if (in_tail && dim == out_shape[i])
+            continue;
+        in_tail = false;
+        if (dim != 1)
+            return false;
+    }
+    return true;
+}
+
+// Runs ternary_vec when the operand shapes match the only layout it handles (full-size c,
+// mask repeating over leading dimensions, b scalar or shaped like the mask); any other
+// broadcast pattern is delegated to the reference kernel.
+result<void> tenary_impl(const float *input_a, const float *input_b, const float *input_c, float *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides)
+{
+
+    int len_a = 1;
+    for (int i = 0; i < (int)in_a_shape.size(); ++i)
+    {
+        len_a *= in_a_shape[i];
+    }
+    int len_b = 1;
+    for (int i = 0; i < (int)in_b_shape.size(); ++i)
+    {
+        len_b *= in_b_shape[i];
+    }
+    int len_c = 1;
+    for (int i = 0; i < (int)in_c_shape.size(); ++i)
+    {
+        len_c *= in_c_shape[i];
+    }
+    const auto out_shape = kernels::detail::get_binary_output_shape(kernels::detail::get_binary_output_shape(in_a_shape, in_b_shape), in_c_shape);
+    int len_out = 1;
+    for (int i = 0; i < (int)out_shape.size(); ++i)
+    {
+        len_out *= out_shape[i];
+    }
+
+    // Anything outside the layout ternary_vec assumes would read past an operand, loop on a
+    // zero repeat count, or pair the wrong elements, so it goes to the reference kernel.
+    const bool b_supported = len_b == 1 || (len_b == len_a && covers_trailing_dims(in_b_shape, out_shape));
+    const bool rvv_supported = len_a > 0 && len_out > 0 && len_c == len_out
+        && covers_trailing_dims(in_a_shape, out_shape) && b_supported;
+    if (!rvv_supported)
+    {
+        return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+    }
+
+    ternary_vec(input_a, len_a, input_b, len_b, input_c, len_c, output, len_out);
+    return ok();
+}
+
+#endif
+
+// float specialization: RVV kernel when the vector extension is available, reference kernel otherwise.
+template <>
+result<void> optimized::ternary(const float *input_a, const float *input_b, const float *input_c, float *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept
+{
+#if __riscv_vector
+    return tenary_impl(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+#else
+    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+#endif
+}
+
+template result<void> optimized::ternary(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+template result<void> optimized::ternary(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept;
+
+// All other element types are forwarded to the reference kernel.
+template <typename T>
+result<void> optimized::ternary(const float *input_a, const T *input_b, const T *input_c, T *output,
+    const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape,
+    const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides,
+    const runtime_shape_t &out_strides) noexcept
+{
+    return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides);
+}
diff --git a/src/kernels/cpu/optimized/riscv64/unary.cpp b/src/kernels/cpu/optimized/riscv64/unary.cpp
index 80e9583ee3..16f607a655 100644
--- a/src/kernels/cpu/optimized/riscv64/unary.cpp
+++ b/src/kernels/cpu/optimized/riscv64/unary.cpp
@@ -43,7 +43,7 @@ struct unary_op_abs_rvv
 
 struct unary_op_ceil_rvv
 {
-    vfloat32m8_t operator()(const vfloat32m8_t &x, const word_type &vl) const
+    vfloat32m8_t operator()(const vfloat32m8_t &x, const size_t &vl) const
     {
         vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
         vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
@@ -61,7 +61,7 @@ struct unary_op_cos_rvv
 
 struct unary_op_exp_rvv
 {
-    vfloat32m8_t operator()(const vfloat32m8_t &x, const word_type &vl) const
+    vfloat32m8_t operator()(const vfloat32m8_t &x, const size_t &vl) const
     {
         return exp_ps(x, vl);
     }
diff --git a/src/kernels/cpu/optimized/riscv64/utils.h b/src/kernels/cpu/optimized/riscv64/utils.h
index 967ee57db8..2e147793d6 100644
---
a/src/kernels/cpu/optimized/riscv64/utils.h +++ b/src/kernels/cpu/optimized/riscv64/utils.h @@ -31,7 +31,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ x = vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -117,7 +117,7 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t tmp, fx; \ \ @@ -183,7 +183,7 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t *ysin, vfloat32m##LMUL##_t *ycos, word_type vl) \ + static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t *ysin, vfloat32m##LMUL##_t *ycos, size_t vl) \ { \ /* any x */ \ vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -256,12 +256,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16) _RVV_FLOAT32_SINCOS_OP(4, 8) _RVV_FLOAT32_SINCOS_OP(8, 4) -#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT32_SIN_OP(1, 32) @@ -269,12 +269,12 @@ _RVV_FLOAT32_SIN_OP(2, 16) _RVV_FLOAT32_SIN_OP(4, 8) _RVV_FLOAT32_SIN_OP(8, 4) -#define _RVV_FLOAT32_COS_OP(LMUL, 
MLEN) \ - static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT32_COS_OP(1, 32) @@ -292,7 +292,7 @@ _RVV_FLOAT32_COS_OP(8, 4) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ \ @@ -340,11 +340,11 @@ _RVV_FLOAT32_TANH_OP(2, 16) _RVV_FLOAT32_TANH_OP(4, 8) _RVV_FLOAT32_TANH_OP(8, 4) -#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT32_POW_OP(1, 32) diff --git a/src/kernels/cpu/optimized/x86_64/avx_mathfun.h b/src/kernels/cpu/optimized/x86_64/avx_mathfun.h new file mode 100644 index 0000000000..b5743ea4f2 --- /dev/null +++ b/src/kernels/cpu/optimized/x86_64/avx_mathfun.h @@ -0,0 +1,827 @@ +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via 
Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#ifndef AVX_MATHFUN_H +#define AVX_MATHFUN_H + +#include +#include +#if __AVX__ +#include +#if __XOP__ +#ifdef _MSC_VER +#include +#else +#include +#endif +#endif +#endif + +#ifdef _MSC_VER /* visual c++ */ +#define ALIGN32_BEG __declspec(align(32)) +#define ALIGN32_END +#else /* gcc or icc */ +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) +#endif + +#ifdef _MSC_VER +#define CAN_FORCEINLINE __forceinline +#elif defined(__GNUC__) +#define CAN_FORCEINLINE inline __attribute__((__always_inline__)) +#elif defined(__CLANG__) +#if __has_attribute(__always_inline__) +#define CAN_FORCEINLINE inline __attribute__((__always_inline__)) +#else +#define CAN_FORCEINLINE inline +#endif +#else +#define CAN_FORCEINLINE inline +#endif + +#ifndef __FMA__ +static CAN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c) +{ + return _mm256_add_ps(_mm256_mul_ps(_a, _b), _c); +} +static CAN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c) +{ + return _mm256_sub_ps(_c, 
_mm256_mul_ps(_a, _b)); +} +#else +static CAN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c) +{ + return _mm256_fmadd_ps(_a, _b, _c); +} +static CAN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256 &_a, const __m256 &_b, const __m256 &_c) +{ + // return -a * b + c + return _mm256_fnmadd_ps(_a, _b, _c); +} +#endif + +static CAN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256 &a, const __m256 &b, float c) +{ + return _mm256_comp_fmadd_ps(b, _mm256_set1_ps(c), a); +} + +static CAN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(const __m256 &a, const __m256 &b, float c) +{ + // return a - b * c + return _mm256_comp_fnmadd_ps(b, _mm256_set1_ps(c), a); +} + +static CAN_FORCEINLINE float _mm256_reduce_add_ps(__m256 x) +{ + /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */ + const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); + /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */ + const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); + /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */ + const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); + /* Conversion to float is a no-op on x86-64 */ + return _mm_cvtss_f32(x32); +} + +static CAN_FORCEINLINE float _mm256_reduce_max_ps(__m256 x) +{ + const __m128 x128 = _mm_max_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x)); + const __m128 x64 = _mm_max_ps(x128, _mm_movehl_ps(x128, x128)); + const __m128 x32 = _mm_max_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); + return _mm_cvtss_f32(x32); +} + +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val } + +_PI32AVX_CONST(1, 1); +_PI32AVX_CONST(inv1, ~1); +_PI32AVX_CONST(2, 2); +_PI32AVX_CONST(4, 4); + +/* declare some AVX constants -- why can't I figure a better way to do that? 
*/ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } + +_PS256_CONST(1, 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2f); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1f); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1f); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1f); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1f); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1f); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1f); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1f); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1f); +_PS256_CONST(cephes_log_q1, -2.12194440e-4f); +_PS256_CONST(cephes_log_q2, 0.693359375f); + +#ifndef __AVX2__ +typedef union imm_xmm_union +{ + __m256i imm; + __m128i xmm[2]; +} imm_xmm_union; + +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ + ALIGN32_BEG imm_xmm_union u ALIGN32_END; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ + } + +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + ALIGN32_BEG imm_xmm_union u ALIGN32_END; \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } + +#define 
AVX2_BITOP_USING_SSE2(fn) \ + static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, int a) \ + { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + __m128i x1, x2; \ + __m256i ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } +#define AVX2_INTOP_USING_SSE2(fn) \ + static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, __m256i y) \ + { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + __m128i x1, x2; \ + __m128i y1, y2; \ + __m256i ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } +#else +#define AVX2_BITOP_USING_SSE2(fn) \ + static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, int a) \ + { \ + return _mm256_##fn(x, a); \ + } +#define AVX2_INTOP_USING_SSE2(fn) \ + static CAN_FORCEINLINE __m256i _mm256_comp_##fn(__m256i x, __m256i y) \ + { \ + return _mm256_##fn(x, y); \ + } +#endif + +AVX2_BITOP_USING_SSE2(slli_epi32) +AVX2_BITOP_USING_SSE2(srli_epi32) +AVX2_INTOP_USING_SSE2(cmpeq_epi32) +AVX2_INTOP_USING_SSE2(sub_epi32) +AVX2_INTOP_USING_SSE2(add_epi32) + +// Replace 256 bit operations with 128 bit ones when AVX2 is disabled +#ifndef __AVX2__ +AVX2_INTOP_USING_SSE2(and_si128) +AVX2_INTOP_USING_SSE2(andnot_si128) +#endif + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +static CAN_FORCEINLINE __m256 log256_ps(__m256 x) +{ + __m256i imm0; + __m256 one = *(__m256 *)_ps256_1; + + //__m256 invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + __m256 invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps(x, *(__m256 *)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _mm256_comp_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, 
*(__m256 *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(__m256 *)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = _mm256_comp_sub_epi32(imm0, *(__m256i *)_pi32_256_0x7f); + __m256 e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //__m256 mask = _mm256_cmplt_ps(x, *(__m256*)_ps256_cephes_SQRTHF); + __m256 mask = _mm256_cmp_ps(x, *(__m256 *)_ps256_cephes_SQRTHF, _CMP_LT_OS); + __m256 tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + __m256 z = _mm256_mul_ps(x, x); + + __m256 y = *(__m256 *)_ps256_cephes_log_p0; + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p1); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p2); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p3); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p4); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p5); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p6); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p7); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + y = _mm256_comp_fmadd_ps(e, *(__m256 *)_ps256_cephes_log_q1, y); + + //y = -z * 0.5 + y + y = _mm256_comp_fnmadd_ps(z, *(__m256 *)_ps256_0p5, y); + + x = _mm256_add_ps(x, y); + x = _mm256_comp_fmadd_ps(e, *(__m256 *)_ps256_cephes_log_q2, x); + y = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return y; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f); +_PS256_CONST(cephes_exp_C1, 0.693359375f); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4f); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f); 
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f); + +static CAN_FORCEINLINE __m256 exp256_ps(__m256 x) +{ + __m256 tmp = _mm256_setzero_ps(), fx; + __m256i imm0; + __m256 one = *(__m256 *)_ps256_1; + + x = _mm256_min_ps(x, *(__m256 *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(__m256 *)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_comp_fmadd_ps(x, *(__m256 *)_ps256_cephes_LOG2EF, *(__m256 *)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm256_cvttps_epi32(fx); + //tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, subtract 1 */ + //__m256 mask = _mm256_cmpgt_ps(tmp, fx); + __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + // x = x - fx * exp_C1 + x = _mm256_comp_fnmadd_ps(fx, *(__m256 *)_ps256_cephes_exp_C1, x); + // x = x - fx * exp_C2 + x = _mm256_comp_fnmadd_ps(fx, *(__m256 *)_ps256_cephes_exp_C2, x); + + tmp = _mm256_mul_ps(x, x); + + __m256 y = *(__m256 *)_ps256_cephes_exp_p0; + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p1); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p2); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p3); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p4); + y = _mm256_comp_fmadd_ps(y, x, *(__m256 *)_ps256_cephes_exp_p5); + y = _mm256_comp_fmadd_ps(y, tmp, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _mm256_comp_add_epi32(imm0, *(__m256i *)_pi32_256_0x7f); + imm0 = _mm256_comp_slli_epi32(imm0, 23); + __m256 pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(tanh_hi, 9.0f); +_PS256_CONST(tanh_lo, -9.0f); + +_PS256_CONST(cephes_tanh_p0, 
-2.76076847742355E-16f); +_PS256_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS256_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS256_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS256_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS256_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS256_CONST(cephes_tanh_p6, 4.89352455891786E-03f); + +_PS256_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline __m256 tanh256_ps(const __m256 x) +{ + __m256 value = x; + value = _mm256_max_ps(*(__m256 *)_ps256_tanh_lo, value); + value = _mm256_min_ps(*(__m256 *)_ps256_tanh_hi, value); + + __m256 value_squared = _mm256_mul_ps(value, value); + + __m256 p; + p = _mm256_comp_fmadd_ps(value_squared, *(__m256 *)_ps256_cephes_tanh_p0, *(__m256 *)_ps256_cephes_tanh_p1); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p2); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p3); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p4); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p5); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256 *)_ps256_cephes_tanh_p6); + p = _mm256_mul_ps(p, value); + + __m256 q; + q = _mm256_comp_fmadd_ps(value_squared, *(__m256 *)_ps256_cephes_tanh_p7, *(__m256 *)_ps256_cephes_tanh_p8); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256 *)_ps256_cephes_tanh_p9); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256 *)_ps256_cephes_tanh_p6); + + __m256 dst = _mm256_div_ps(p, q); + return dst; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625f); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); +_PS256_CONST(sincof_p0, -1.9515295891E-4f); +_PS256_CONST(sincof_p1, 8.3321608736E-3f); +_PS256_CONST(sincof_p2, -1.6666654611E-1f); 
+_PS256_CONST(coscof_p0, 2.443315711809948E-005f); +_PS256_CONST(coscof_p1, -1.388731625493765E-003f); +_PS256_CONST(coscof_p2, 4.166664568298827E-002f); +_PS256_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI + +/* evaluation of 8 sines at onces using AVX intrisics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +static CAN_FORCEINLINE __m256 sin256_ps(__m256 x) +{ // any x + __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + __m256i imm0, imm2; + +#ifndef __AVX2__ + __m128i imm0_1, imm0_2; + __m128i imm2_1, imm2_2; +#endif + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256 *)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(__m256 *)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(__m256 *)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. 
+ If we don't have AVX, let's perform them using SSE2 directives + */ + +#ifdef __AVX2__ + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _mm256_comp_add_epi32(imm2, *(__m256i *)_pi32_256_1); + imm2 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm256_and_si256(imm2, *(__m256i *)_pi32_256_4); + imm0 = _mm256_comp_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::optimized; + +template result optimized::instancenorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept; + +template +result optimized::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept +{ + return cpu::reference::instancenorm(input, output, scale, bias, in_shape, epsilon); +} \ No newline at end of file diff --git a/src/kernels/cpu/optimized/x86_64/layernorm.cpp b/src/kernels/cpu/optimized/x86_64/layernorm.cpp new file mode 100644 index 0000000000..fbe05bead3 --- /dev/null +++ b/src/kernels/cpu/optimized/x86_64/layernorm.cpp @@ -0,0 +1,34 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::optimized; + +template result optimized::layernorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept; + +template +result optimized::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept +{ + return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon); +} \ No newline at end of file diff --git a/src/kernels/cpu/optimized/x86_64/reduce.cpp b/src/kernels/cpu/optimized/x86_64/reduce.cpp new file mode 100644 index 0000000000..01869f4ed6 --- /dev/null +++ b/src/kernels/cpu/optimized/x86_64/reduce.cpp @@ -0,0 +1,43 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::optimized; +// nncase::kernels::cpu::optimized + +template result optimized::reduce(reduce_op_t op, float init_value, const float *input, float *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; + +template result optimized::reduce(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; + +template result optimized::reduce(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; + +template +result optimized::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept +{ + return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context); +} \ No newline at end of file diff --git a/src/kernels/cpu/optimized/x86_64/ternary.cpp b/src/kernels/cpu/optimized/x86_64/ternary.cpp new file mode 100644 index 0000000000..1ec3d48414 --- /dev/null +++ b/src/kernels/cpu/optimized/x86_64/ternary.cpp @@ -0,0 +1,55 @@ +/* Copyright 2019-2021 Canaan Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::optimized; + +template result optimized::ternary(const float *input_a, const float *input_b, const float *input_c, float *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + +template result optimized::ternary(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + +template result optimized::ternary(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + +// template 
result optimized::ternary(const float *input_a, const long *input_b, const long *input_c, long *output, +// const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, +// const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, +// const runtime_shape_t &out_strides) noexcept; + +template +result optimized::ternary(const float *input_a, const T *input_b, const T *input_c, T *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept +{ + return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides); +} \ No newline at end of file diff --git a/src/kernels/cpu/optimized/x86_64/unary.cpp b/src/kernels/cpu/optimized/x86_64/unary.cpp index c5752e5487..c5f65e92be 100644 --- a/src/kernels/cpu/optimized/x86_64/unary.cpp +++ b/src/kernels/cpu/optimized/x86_64/unary.cpp @@ -12,6 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include #include #include @@ -23,8 +24,762 @@ using namespace nncase::kernels; using namespace nncase::kernels::cpu; using namespace nncase::kernels::cpu::optimized; +#if defined(X86_64_SIMD_ON) + +#include "avx_mathfun.h" +static void round_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_round_ps(vector_a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = roundf(a[j]); + } +} + +static void ceil_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_round_ps(vector_a, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = ceilf(a[j]); + } +} + +static void floor_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_round_ps(vector_a, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = floorf(a[j]); + } +} + +static void sqrt_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_sqrt_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = sqrtf(a[j]); + } +} + +static void rsqrt_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 aa = _mm256_loadu_ps(a); + __m256 bb = 
_mm256_rsqrt_ps(aa); + _mm256_storeu_ps(b, bb); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = 1.0f / sqrtf(a[j]); + } +} + +static void exp_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = exp256_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = expf(a[j]); + } +} + +static void log_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = log256_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = logf(a[j]); + } +} + +static void cos_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = cos256_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = cosf(a[j]); + } +} + +static void sin_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = sin256_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = sinf(a[j]); + } +} + +static void negative_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_sub_ps(_mm256_setzero_ps(), vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = -(a[j]); + } +} + +static void logical_not_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n 
>> 3); + int n8_left = n & (8 - 1); + __m256i i_zeros = _mm256_setzero_si256(); + for (int j = 0; j < n8; ++j) + { + __m256i vector_a = _mm256_loadu_si256((__m256i const *)a); + __m256i i_dst_a = _mm256_cmpeq_epi32(vector_a, i_zeros); + i_dst_a = _mm256_sub_epi32(i_zeros, i_dst_a); + __m256 f_dst_a = _mm256_cvtepi32_ps(i_dst_a); + _mm256_storeu_ps(b, f_dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = !a[j]; + } +} + +static void abs_f32_vec(const float *a, float *b, int n) +{ + const ALIGN32_BEG int32_t remove_sign_bit_data[8] ALIGN32_END = { + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF + }; + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + __m256i remove_sign_bit_flag = _mm256_load_si256((__m256i const *)remove_sign_bit_data); + for (int j = 0; j < n8; ++j) + { + __m256i vector_a = _mm256_loadu_si256((__m256i const *)a); + __m256i dst_a = _mm256_and_si256(vector_a, remove_sign_bit_flag); + _mm256_storeu_si256((__m256i *)b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = fabs(a[j]); + } +} + +static void tanh_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vector_a = _mm256_loadu_ps(a); + __m256 dst_a = tanh256_ps(vector_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = tanhf(a[j]); + } +} + +#ifdef _MSC_VER /* visual c++ */ +static CAN_FORCEINLINE float abs_f32(float x) +{ + return fabsf(x); +} +#else /* gcc or icc */ +static CAN_FORCEINLINE float abs_f32(float x) +{ + asm( + "and $0x7FFFFFFF, %0;" + : "+r"(x)::); + return x; +} +#endif + +static CAN_FORCEINLINE __m256 _mm256_can_acos_ps(__m256 x) +{ + const __m256 zero = _mm256_set1_ps(0.0f); + const __m256 two = _mm256_set1_ps(2.0f); + const __m256 mtwo = _mm256_set1_ps(-2.0f); + const __m256 c0 = _mm256_set1_ps(0x1.c86000p-22f); // 4.25032340e-7 + 
const __m256 c1 = _mm256_set1_ps(-0x1.0258fap-19f); // -1.92483935e-6 + const __m256 c2 = _mm256_set1_ps(0x1.90c5c4p-18f); // 5.97197595e-6 + const __m256 c3 = _mm256_set1_ps(-0x1.55668cp-19f); // -2.54363249e-6 + const __m256 c4 = _mm256_set1_ps(0x1.c3f78ap-16f); // 2.69393295e-5 + const __m256 c5 = _mm256_set1_ps(0x1.e8f446p-14f); // 1.16575764e-4 + const __m256 c6 = _mm256_set1_ps(0x1.6df072p-11f); // 6.97973708e-4 + const __m256 c7 = _mm256_set1_ps(0x1.3332a6p-8f); // 4.68746712e-3 + const __m256 c8 = _mm256_set1_ps(0x1.555550p-5f); // 4.16666567e-2 + const __m256 pi0 = _mm256_set1_ps(0x1.ddcb02p+0f); // 1.86637890e+0 + const __m256 pi1 = _mm256_set1_ps(0x1.aee9d6p+0f); // 1.68325555e+0 + __m256 s, r, t, m; + + s = two; + t = mtwo; + m = _mm256_cmp_ps(x, zero, _CMP_LT_OQ); + t = _mm256_blendv_ps(t, s, m); + t = _mm256_fmadd_ps(x, t, s); + s = _mm256_sqrt_ps(t); + r = c0; + r = _mm256_fmadd_ps(r, t, c1); + r = _mm256_fmadd_ps(r, t, c2); + r = _mm256_fmadd_ps(r, t, c3); + r = _mm256_fmadd_ps(r, t, c4); + r = _mm256_fmadd_ps(r, t, c5); + r = _mm256_fmadd_ps(r, t, c6); + r = _mm256_fmadd_ps(r, t, c7); + r = _mm256_fmadd_ps(r, t, c8); + r = _mm256_mul_ps(r, t); + r = _mm256_fmadd_ps(r, s, s); + t = _mm256_sub_ps(zero, r); + t = _mm256_fmadd_ps(pi0, pi1, t); + r = _mm256_blendv_ps(r, t, m); + return r; +} + +//t > 0.921875f +static CAN_FORCEINLINE __m256 erf_core_ps1(__m256 a, __m256 t, __m256 s, __m256 r0, __m256 r1, __m256 r2, + __m256 r3, __m256 r4, __m256 r5, __m256 r6, __m256i n_flag) +{ + __m256 r = _mm256_fmadd_ps(r0, t, r1); + __m256 u = _mm256_fmadd_ps(r2, t, r3); + r = _mm256_fmadd_ps(r, s, u); + r = _mm256_fmadd_ps(r, t, r4); + r = _mm256_fmadd_ps(r, t, r5); + r = _mm256_fmadd_ps(r, t, r6); + r = _mm256_fmadd_ps(r, t, t); + __m256 _zeros = _mm256_setzero_ps(); + __m256 _ones = _mm256_set1_ps(1.0f); + __m256 minus_r = _mm256_sub_ps(_zeros, r); + r = exp256_ps(minus_r); + r = _mm256_sub_ps(_ones, r); + + __m256i sign_flag = _mm256_andnot_si256(n_flag, 
_mm256_castps_si256(a)); + __m256i pr = _mm256_and_si256(n_flag, _mm256_castps_si256(r)); + r = _mm256_castsi256_ps(_mm256_or_si256(sign_flag, pr)); + return r; +} + +// t <= 0.921875f +static CAN_FORCEINLINE __m256 erf_core_ps2(__m256 a, __m256 s, __m256 r1, __m256 r2, + __m256 r3, __m256 r4, __m256 r5, __m256 r6) +{ + __m256 r = _mm256_fmadd_ps(r1, s, r2); + r = _mm256_fmadd_ps(r, s, r3); + r = _mm256_fmadd_ps(r, s, r4); + r = _mm256_fmadd_ps(r, s, r5); + r = _mm256_fmadd_ps(r, s, r6); + r = _mm256_fmadd_ps(r, a, a); + return r; +} + +static void erf_f32_vec(const float *a, float *b, int n) +{ + const float erf_const_data[] = { -0x1.3a1a82p-11f, 0x1.473f48p-08f, -0x1.b68bd2p-06f, + 0x1.ce1a46p-04f, -0x1.8126e0p-02f, 0x1.06eba6p-03f }; + __m256 r1 = _mm256_broadcast_ss(erf_const_data); + __m256 r2 = _mm256_broadcast_ss(erf_const_data + 1); + __m256 r3 = _mm256_broadcast_ss(erf_const_data + 2); + __m256 r4 = _mm256_broadcast_ss(erf_const_data + 3); + __m256 r5 = _mm256_broadcast_ss(erf_const_data + 4); + __m256 r6 = _mm256_broadcast_ss(erf_const_data + 5); + + ///////////////////////////// + // if t > 0.921875f + const __m256 c0 = _mm256_set1_ps(0x1.222900p-16f); + const __m256 c1 = _mm256_set1_ps(-0x1.91d2ccp-12f); + const __m256 c2 = _mm256_set1_ps(0x1.fd1336p-09f); + const __m256 c3 = _mm256_set1_ps(-0x1.8d6300p-06f); + const __m256 c4 = _mm256_set1_ps(0x1.b55cb0p-4f); + const __m256 c5 = _mm256_set1_ps(0x1.450aa0p-1f); + const __m256 c6 = _mm256_set1_ps(0x1.079d0cp-3f); + const __m256 c7 = _mm256_set1_ps(0.921875f); + ///////////////////////////// + + __m256i n_flag = _mm256_set1_epi32(0x7fffffff); + + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 aa = _mm256_loadu_ps(a); + __m256 s = _mm256_mul_ps(aa, aa); // s + __m256 t = _mm256_castsi256_ps(_mm256_and_si256(_mm256_castps_si256(aa), n_flag)); + + __m256 ret1 = erf_core_ps1(aa, t, s, c0, c1, c2, c3, c4, c5, c6, n_flag); + __m256 ret2 = erf_core_ps2(aa, s, r1, r2, 
r3, r4, r5, r6); + + __m256 _flag = _mm256_cmp_ps(t, c7, _CMP_LT_OQ); + + ret1 = _mm256_blendv_ps(ret1, ret2, _flag); + _mm256_storeu_ps(b, ret1); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = erff(a[j]); + } +} + +static void sign_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 aa = _mm256_loadu_ps(a); + __m256 b1 = _mm256_cmp_ps(_mm256_setzero_ps(), aa, _CMP_LT_OQ); + __m256 b2 = _mm256_cmp_ps(aa, _mm256_setzero_ps(), _CMP_LT_OQ); + __m256i ib1 = _mm256_castps_si256(b1); + __m256i ib2 = _mm256_castps_si256(b2); + __m256i ret = _mm256_sub_epi32(ib2, ib1); + + __m256 kbb = _mm256_cvtepi32_ps(ret); + _mm256_storeu_ps(b, kbb); + + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = (0.f < a[j]) - (a[j] < 0.f); + } +} + +static void acos_f32_vec(const float *a, float *b, int n) +{ + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 vecotr_a = _mm256_loadu_ps(a); + __m256 dst_a = _mm256_can_acos_ps(vecotr_a); + _mm256_storeu_ps(b, dst_a); + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = acosf(a[j]); + } +} + +CAN_FORCEINLINE __m256 asinf_core_ps(__m256 a, __m256 r0, __m256 r1, __m256 r2, __m256 r3, __m256 r4) +{ + __m256 ss = _mm256_mul_ps(a, a); // s = a * a; + __m256 r = r0; + r = _mm256_fmadd_ps(r, ss, r1); //r = fmaf(r, s, 0x1.29a5cep-6f); // 1.81669723e-23 + r = _mm256_fmadd_ps(r, ss, r2); + r = _mm256_fmadd_ps(r, ss, r3); + r = _mm256_fmadd_ps(r, ss, r4); + r = _mm256_mul_ps(r, ss); + r = _mm256_fmadd_ps(r, a, a); + return r; +} + +CAN_FORCEINLINE __m256 asinf_core2_ps(__m256 a, __m256 r0, __m256 r1, __m256 r2, __m256 r3, __m256 r4, __m256 one_256, __m256 half_one_256, __m256 half_pi_256, + __m256i abs_flag, __m256i sign_flag) +{ + __m256 s; // = a; + + //////////////////// + // 获取符号位 + __m256i isign_flag = _mm256_and_si256(_mm256_castps_si256(a), sign_flag); + 
__m256i _xv = _mm256_and_si256(_mm256_castps_si256(a), abs_flag); + s = _mm256_castsi256_ps(_xv); + //////////////////// + + //////////////////////////// + // before + s = _mm256_sub_ps(one_256, s); // 1 - x + s = _mm256_mul_ps(half_one_256, s); // (1 - x) / 2 + s = _mm256_sqrt_ps(s); + ///////////////////////////// + + __m256 ss = _mm256_mul_ps(s, s); // s = a * a; + __m256 r = r0; + r = _mm256_fmadd_ps(r, ss, r1); //r = fmaf(r, s, 0x1.29a5cep-6f); // 1.81669723e-23 + r = _mm256_fmadd_ps(r, ss, r2); + r = _mm256_fmadd_ps(r, ss, r3); + r = _mm256_fmadd_ps(r, ss, r4); + r = _mm256_mul_ps(r, ss); + r = _mm256_fmadd_ps(r, s, s); + + //////////////////////////// + // after + s = _mm256_div_ps(r, half_one_256); // 2 * asinf_core(x) + s = _mm256_sub_ps(half_pi_256, s); // pi / 2 - 2 * asinf_core(x) + ///////////////////////////// + + //////////////////// + // 恢复符号位 + s = _mm256_castsi256_ps(_mm256_or_si256(_mm256_castps_si256(s), isign_flag)); + return s; +} + +void asinf_f32_vec(const float *a, float *b, int n) +{ + const float pi = 3.1415926f; + const float __init__data[] = { 0x1.a7f260p-5f, 0x1.29a5cep-6f, 0x1.7f0842p-5f, 0x1.329256p-4f, 0x1.555728p-3f, 1.0f, 0.5f, pi / 2 }; + __m256 r0 = _mm256_broadcast_ss(__init__data); + __m256 r1 = _mm256_broadcast_ss(__init__data + 1); + __m256 r2 = _mm256_broadcast_ss(__init__data + 2); + __m256 r3 = _mm256_broadcast_ss(__init__data + 3); + __m256 r4 = _mm256_broadcast_ss(__init__data + 4); + + __m256 one_256 = _mm256_broadcast_ss(__init__data + 5); + __m256 half_one_256 = _mm256_broadcast_ss(__init__data + 6); + __m256 half_pi_256 = _mm256_broadcast_ss(__init__data + 7); + + const ALIGN32_BEG int32_t x1[8] ALIGN32_END = { + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF + }; + const ALIGN32_BEG uint32_t x2[8] ALIGN32_END = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x80000000, 0x80000000, 0x80000000, 0x80000000 + }; + + __m256i abs_flag = _mm256_load_si256((__m256i 
const *)x1); + __m256i sign_flag = _mm256_load_si256((__m256i const *)x2); + + int n8 = (n >> 3); + int n8_left = n & (8 - 1); + for (int j = 0; j < n8; ++j) + { + __m256 s = _mm256_loadu_ps(a); + __m256 s1 = asinf_core_ps(s, r0, r1, r2, r3, r4); + //////////// + // fabsf 是否大于 0.5f + ///////////// + __m256 abs_s = _mm256_castsi256_ps(_mm256_and_si256(_mm256_castps_si256(s), abs_flag)); + ////__m256 _mm256_cmp_ps(__m256 a, __m256 b, const int imm8); + __m256 flags_half_2 = _mm256_cmp_ps(abs_s, half_one_256, _CMP_NLT_UQ); + + __m256 flags_half_2_1 = _mm256_cmp_ps(half_one_256, abs_s, _CMP_NLT_UQ); + + __m256 s2 = asinf_core2_ps(s, r0, r1, r2, r3, r4, one_256, half_one_256, half_pi_256, abs_flag, sign_flag); + + s1 = _mm256_and_ps(s1, flags_half_2_1); + s2 = _mm256_and_ps(s2, flags_half_2); + s2 = _mm256_or_ps(s1, s2); + _mm256_storeu_ps(b, s2); + + a += 8; + b += 8; + } + for (int j = 0; j < n8_left; ++j) + { + b[j] = asinf(a[j]); + } +} +#else // X86_64_SIMD_ON + +static void round_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = roundf(a[j]); + } +} + +static void ceil_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = ceilf(a[j]); + } +} + +static void floor_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = floorf(a[j]); + } +} + +static void sqrt_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = sqrtf(a[j]); + } +} + +static void rsqrt_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = 1.0f / sqrtf(a[j]); + } +} + +static void exp_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = expf(a[j]); + } +} + +static void log_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = logf(a[j]); + } +} + +static void cos_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = cosf(a[j]); + } +} + 
+static void sin_f32_vec(const float *a, float *b, int n) +{ + + for (int j = 0; j < n; ++j) + { + b[j] = sinf(a[j]); + } +} + +static void negative_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = -(a[j]); + } +} + +static void logical_not_f32_vec(const float *a, float *b, int n) +{ + + for (int j = 0; j < n; ++j) + { + b[j] = !a[j]; + } +} + +static void abs_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = fabs(a[j]); + } +} + +static void tanh_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = tanhf(a[j]); + } +} + +static void erf_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = erff(a[j]); + } +} + +static void sign_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = (0.f < a[j]) - (a[j] < 0.f); + } +} + +static void acos_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = acosf(a[j]); + } +} + +static void asinf_f32_vec(const float *a, float *b, int n) +{ + for (int j = 0; j < n; ++j) + { + b[j] = asinf(a[j]); + } +} +#endif // X86_64_SIMD_ON + result optimized::unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept { - return cpu::reference::unary(op, input, output, shape, in_strides, out_strides, context); + result ret_value = ok(); + int len = (int)compute_size(shape); + + if (op == unary_round) + { + round_f32_vec(input, output, len); + } + else if (op == unary_ceil) + { + ceil_f32_vec(input, output, len); + } + else if (op == unary_floor) + { + floor_f32_vec(input, output, len); + } + else if (op == unary_sqrt) + { + sqrt_f32_vec(input, output, len); + } + else if (op == unary_rsqrt) + { + rsqrt_f32_vec(input, output, len); + } + else if (op == unary_exp) + { + exp_f32_vec(input, output, len); + } + else 
if (op == unary_log) + { + log_f32_vec(input, output, len); + } + else if (op == unary_cos) + { + cos_f32_vec(input, output, len); + } + else if (op == unary_sin) + { + sin_f32_vec(input, output, len); + } + else if (op == unary_neg) + { + negative_f32_vec(input, output, len); + } + else if (op == unary_abs) + { + abs_f32_vec(input, output, len); + } + else if (op == unary_logical_not) + { + logical_not_f32_vec(input, output, len); + } + else if (op == unary_tanh) + { + tanh_f32_vec(input, output, len); + } + else if (op == unary_erf) + { + erf_f32_vec(input, output, len); + } + else if (op == unary_sign) + { + sign_f32_vec(input, output, len); + } + else if (op == unary_acos) + { + acos_f32_vec(input, output, len); + } + else if (op == unary_asin) + { + asinf_f32_vec(input, output, len); + } + else + { + ret_value = cpu::reference::unary(op, input, output, shape, in_strides, out_strides, context); + } + return ret_value; } diff --git a/src/kernels/cpu/reference/CMakeLists.txt b/src/kernels/cpu/reference/CMakeLists.txt index c24e49e55d..0c14cde7c8 100644 --- a/src/kernels/cpu/reference/CMakeLists.txt +++ b/src/kernels/cpu/reference/CMakeLists.txt @@ -4,6 +4,7 @@ set(SRCS batch_to_space.cpp binary.cpp broadcast.cpp compare.cpp + compress.cpp concat.cpp convolution.cpp convert.cpp @@ -11,7 +12,9 @@ set(SRCS batch_to_space.cpp cumsum.cpp dequantize.cpp gather.cpp + gather_elements.cpp gather_nd.cpp + gru.cpp hardmax.cpp lut1d.cpp matmul.cpp @@ -29,9 +32,13 @@ set(SRCS batch_to_space.cpp sigmoid.cpp softmax.cpp slice.cpp + space_to_batch.cpp ternary.cpp topk.cpp transpose.cpp trilu.cpp - unary.cpp) + tflite_detection_postprocess.cpp + unary.cpp + layernorm.cpp + instancenorm.cpp) target_sources(kernels PRIVATE ${SRCS}) diff --git a/src/kernels/cpu/reference/compress.cpp b/src/kernels/cpu/reference/compress.cpp new file mode 100644 index 0000000000..1bccc3a0be --- /dev/null +++ b/src/kernels/cpu/reference/compress.cpp @@ -0,0 +1,61 @@ +/* Copyright 2019-2021 Canaan Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +template result reference::compress(const float *input, const uint8_t *condition, float *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept; + +template +result reference::compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept +{ + if (axis == (int)input_shape.size()) + { + for (auto i = 0; i < (int)condition_shape[0]; i++) + { + if ((float)*(condition + i) == 0) + { + continue; + } + *output++ = input[i]; + } + } + else + { + int select_slice = 1; + for (auto i = axis + 1; i < (int)input_shape.size(); i++) + { + select_slice *= input_shape[i]; + } + for (auto j = 0; j < (int)kernels::detail::compute_size(input_shape); j++) + { + auto i = j % (select_slice * input_shape[axis]); + auto cond_index = i / select_slice; + if (select_slice != 1 && (cond_index >= condition_shape[0] || condition[cond_index] == 0)) + continue; + if (select_slice == 1 && (i % input_shape[axis] >= condition_shape[0] || condition[cond_index % input_shape[axis] % condition_shape[0]] == 0)) + continue; + *output++ = input[j]; + } + } + return 
ok(); +} \ No newline at end of file diff --git a/src/kernels/cpu/reference/convert.cpp b/src/kernels/cpu/reference/convert.cpp index b5c6d26168..0de06d8c4c 100644 --- a/src/kernels/cpu/reference/convert.cpp +++ b/src/kernels/cpu/reference/convert.cpp @@ -30,7 +30,10 @@ result convert_impl(const TInput *input, TOutput *output, const runtime_sh { return apply(in_shape, [&](const runtime_shape_t &index) -> result { auto value = input[offset(in_strides, index)]; - output[offset(out_strides, index)] = static_cast(value); + if (to_datatype() == dt_bfloat16) + output[offset(out_strides, index)] = bfloat16::round_to_bfloat16(static_cast(value)); + else + output[offset(out_strides, index)] = static_cast(value); return ok(); }); } @@ -72,6 +75,7 @@ result convert_f32_to_fp16_impl(const float *input, half *output, const ru CONVERT_IMPL_LV2(input_t, int32_t); \ CONVERT_IMPL_LV2(input_t, int64_t); \ CONVERT_IMPL_LV2(input_t, float); \ + CONVERT_IMPL_LV2(input_t, bfloat16); \ } result reference::convert(datatype_t in_type, datatype_t out_type, const gsl::byte *input, gsl::byte *output, diff --git a/src/kernels/cpu/reference/gather_elements.cpp b/src/kernels/cpu/reference/gather_elements.cpp new file mode 100644 index 0000000000..736ecd925a --- /dev/null +++ b/src/kernels/cpu/reference/gather_elements.cpp @@ -0,0 +1,71 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; +using namespace std; + +template result reference::gather_elements(const float *input, const int64_t *indices, float *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const int axis) noexcept; + +template +result reference::gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const int axis) noexcept +{ + // indices_shape == output_shape + // out[i][j][k] = input[index[i][j][k]][j][k] if axis = 0, + // out[i][j][k] = input[i][index[i][j][k]][k] if axis = 1, + // out[i][j][k] = input[i][j][index[i][j][k]] if axis = 2, + std::vector per_axis_size(indices_shape.size(), 1); + std::vector input_per_axis_size(indices_shape.size(), 1); + + // compute size per axis + for (int idx = indices_shape.size() - 2; idx >= 0; idx--) + { + per_axis_size[idx] = indices_shape[idx + 1] * per_axis_size[idx + 1]; + input_per_axis_size[idx] = in_shape[idx + 1] * input_per_axis_size[idx + 1]; + } + + for (size_t i = 0; i < compute_size(indices_shape); i++) + { + std::vector index; + get_gather_index(per_axis_size, index, i, axis, 0); + + // compute indices offset to update index + int indice_index = 0; + for (size_t t = 0; t < index.size(); t++) + { + indice_index += per_axis_size[t] * index[t]; + } + // process index value if negative value + index[axis] = indices[indice_index] < 0 ? 
indices[indice_index] + in_shape[axis] : indices[indice_index]; + + // compute input offset + int input_index = 0; + for (size_t t = 0; t < index.size(); t++) + { + input_index += input_per_axis_size[t] * index[t]; + } + output[i] = input[input_index]; + } + + return ok(); +} diff --git a/src/kernels/cpu/reference/gru.cpp b/src/kernels/cpu/reference/gru.cpp new file mode 100644 index 0000000000..d61b0bedb1 --- /dev/null +++ b/src/kernels/cpu/reference/gru.cpp @@ -0,0 +1,184 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +template result +reference::gru(const float *input, const float *w, const float *r, const float *b, float *initial_h, + float *output, float *output_h, const runtime_shape_t &input_shape, + const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept; + +template +result reference::gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, + const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, + bool linear_before_reset) noexcept +{ + const int seq_length = input_shape[0]; + const int batch_size = input_shape[1]; + const int input_size = input_shape[2]; + const int num_direction = w_shape[0]; + const int hidden_size = w_shape[1] / 3; + + auto sigmoid = [&](float x) { + return 1 / (1 + std::exp(-x)); + }; + auto tanh = [&](float x) { + return std::tanh(x); + }; + // copy input to output + runtime_shape_t out_shape { (size_t)seq_length, (size_t)num_direction, (size_t)batch_size, (size_t)hidden_size }; + + auto x_gate_size = batch_size * input_size; + auto w_gate_size = 3 * hidden_size * input_size; + auto h_t_size = batch_size * hidden_size; + auto r_gate_size = 3 * hidden_size * hidden_size; + + auto tmp_a = std::vector(batch_size * hidden_size, 0.f); + auto tmp_b = std::vector(batch_size * hidden_size, 0.f); + auto gate_z = std::vector(batch_size * hidden_size, 0.f); + auto gate_r = std::vector(batch_size * hidden_size, 0.f); + auto gate_h = std::vector(batch_size * hidden_size, 0.f); + + std::vector seq_len_loop; + for (int l = 0; l < seq_length; l++) + seq_len_loop.push_back(l); + if (mode == lstm_direction::kReverse) + std::reverse(seq_len_loop.begin(), seq_len_loop.end()); + auto x_i = input; + auto h_t = initial_h; + auto w_i = w; + auto r_i = r; + auto b_i 
= b; + for (int d = 0; d < num_direction; d++) + { + h_t = initial_h + d * h_t_size; + w_i = w + d * w_gate_size; + r_i = r + d * r_gate_size; + b_i = b + d * 6 * hidden_size; + if (d == 1) + std::reverse(seq_len_loop.begin(), seq_len_loop.end()); + for (auto i : seq_len_loop) + { + x_i = input + i * x_gate_size; + // clean gate_z gate_r gate_h + std::fill(gate_z.begin(), gate_z.end(), 0.f); + std::fill(gate_r.begin(), gate_r.end(), 0.f); + std::fill(gate_h.begin(), gate_h.end(), 0.f); + + // clean tmp_a tmp_b + std::fill(tmp_a.begin(), tmp_a.end(), 0.f); + std::fill(tmp_b.begin(), tmp_b.end(), 0.f); + // gate_z = x_i * w_i_z + b_w_z + h_t *r_i_z + b_r_z + for (int bs = 0; bs < batch_size; bs++) + { + for (int hs = 0; hs < hidden_size; hs++) + { + for (int is = 0; is < input_size; is++) + { + tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hs * input_size + is]; + } + tmp_a[bs * hidden_size + hs] += b_i[hs]; + for (int rs = 0; rs < hidden_size; rs++) + { + tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hs * hidden_size + rs]; + } + tmp_b[bs * hidden_size + hs] += b_i[3 * hidden_size + hs]; + gate_z[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs]; + } + } + // gate_z = sigmoid(gate_z); + std::transform(gate_z.begin(), gate_z.end(), gate_z.begin(), sigmoid); + + // clear tmp_a tmp_b + std::fill(tmp_a.begin(), tmp_a.end(), 0.f); + std::fill(tmp_b.begin(), tmp_b.end(), 0.f); + // gate_r = x_i * w_i_r + b_w_r + h_t *r_i_r + b_r_r + for (int bs = 0; bs < batch_size; bs++) + { + for (int hs = 0; hs < hidden_size; hs++) + { + for (int is = 0; is < input_size; is++) + { + tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[hidden_size * input_size + hs * input_size + is]; + } + tmp_a[bs * hidden_size + hs] += b_i[hidden_size + hs]; + for (int rs = 0; rs < hidden_size; rs++) + { + tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[hidden_size * hidden_size + hs * hidden_size + 
rs]; + } + tmp_b[bs * hidden_size + hs] += b_i[4 * hidden_size + hs]; + gate_r[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs]; + } + } + // gate_r = sigmoid(gate_r); + std::transform(gate_r.begin(), gate_r.end(), gate_r.begin(), sigmoid); + + // clear tmp_a tmp_b + std::fill(tmp_a.begin(), tmp_a.end(), 0.f); + std::fill(tmp_b.begin(), tmp_b.end(), 0.f); + // gate_h = x_i * w_i_h + b_w_h + gate_r·h_t *r_i_h + b_r_h + for (int bs = 0; bs < batch_size; bs++) + { + for (int hs = 0; hs < hidden_size; hs++) + { + for (int is = 0; is < input_size; is++) + { + tmp_a[bs * hidden_size + hs] += x_i[bs * input_size + is] * w_i[2 * hidden_size * input_size + hs * input_size + is]; + } + tmp_a[bs * hidden_size + hs] += b_i[2 * hidden_size + hs]; + + for (int rs = 0; rs < hidden_size; rs++) + { + if (!linear_before_reset) + tmp_b[bs * hidden_size + hs] += gate_r[bs * hidden_size + rs] * h_t[bs * hidden_size + rs] * r_i[2 * hidden_size * hidden_size + hs * hidden_size + rs]; + else + tmp_b[bs * hidden_size + hs] += h_t[bs * hidden_size + rs] * r_i[2 * hidden_size * hidden_size + hs * hidden_size + rs]; + } + tmp_b[bs * hidden_size + hs] += b_i[5 * hidden_size + hs]; + + if (!linear_before_reset) + gate_h[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + tmp_b[bs * hidden_size + hs]; + else + gate_h[bs * hidden_size + hs] = tmp_a[bs * hidden_size + hs] + gate_r[bs * hidden_size + hs] * tmp_b[bs * hidden_size + hs]; + } + } + // gate_h = tanh(gate_h); + std::transform(gate_h.begin(), gate_h.end(), gate_h.begin(), tanh); + + for (int k = 0; k < batch_size * hidden_size; k++) + { + h_t[k] = (1 - gate_z[k]) * gate_h[k] + gate_z[k] * h_t[k]; + // *output++ = h_t[k]; + output[i * (num_direction * batch_size * hidden_size) + d * (batch_size * hidden_size) + k] = h_t[k]; + } + } + // if (mode == lstm_direction::kReverse || d == 1) + // h_t.reverse(); + for (int k = 0; k < batch_size * hidden_size; k++) + { + output_h[d * (batch_size * 
hidden_size) + k] = h_t[k]; + } + } + + return ok(); +} diff --git a/src/kernels/cpu/reference/instancenorm.cpp b/src/kernels/cpu/reference/instancenorm.cpp new file mode 100644 index 0000000000..d688aa77a1 --- /dev/null +++ b/src/kernels/cpu/reference/instancenorm.cpp @@ -0,0 +1,68 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +template result reference::instancenorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept; + +template +result reference::instancenorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept +{ + auto outer_size = static_cast(in_shape[0]); + auto inner_size = 1; + for (size_t i = 2; i < in_shape.size(); i++) + inner_size *= static_cast(in_shape[i]); + for (auto batch = 0; batch < outer_size; batch++) + { + for (size_t c = 0; c < in_shape[1]; c++) + { + auto src = input + batch * inner_size * in_shape[1] + c * inner_size; + auto dest = output + batch * inner_size * in_shape[1] + c * inner_size; + + double mean1 = 0.f; + for (auto i = 0; i < inner_size; i++) + mean1 += src[i] / inner_size; + + std::vector sub(inner_size, 0.f); + for (auto i = 
0; i < inner_size; i++) + sub[i] = (src[i] - mean1); + + std::vector pow(inner_size, 0.f); + for (auto i = 0; i < inner_size; i++) + pow[i] = sub[i] * sub[i]; + + double mean2 = 0.f; + for (auto i = 0; i < inner_size; i++) + mean2 += pow[i] / inner_size; + + double add = mean2 + epsilon; + double sqrt = std::sqrt(add); + + for (auto i = 0; i < inner_size; i++) + dest[i] = sub[i] * scale[c] / sqrt + bias[c]; + } + } + + return ok(); +} \ No newline at end of file diff --git a/src/kernels/cpu/reference/layernorm.cpp b/src/kernels/cpu/reference/layernorm.cpp new file mode 100644 index 0000000000..f8c57cbf2c --- /dev/null +++ b/src/kernels/cpu/reference/layernorm.cpp @@ -0,0 +1,72 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +template result reference::layernorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept; + +template +result reference::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept +{ + auto outer_size = 1; + auto inner_size = 1; + for (auto i = 0; i < axis; i++) + outer_size *= in_shape[i]; + for (auto i = axis; i < static_cast(in_shape.size()); i++) + inner_size *= in_shape[i]; + + for (int32_t batch = 0; batch < outer_size; batch++) + { + auto src = input + batch * inner_size; + auto dest = output + batch * inner_size; + + float mean1 = 0.f; + for (auto i = 0; i < inner_size; i++) + mean1 += src[i] / inner_size; + + std::vector sub(inner_size, 0.f); + for (auto i = 0; i < inner_size; i++) + sub[i] = src[i] - mean1; + + std::vector pow(inner_size, 0.f); + for (auto i = 0; i < inner_size; i++) + pow[i] = sub[i] * sub[i]; + + float mean2 = 0.f; + for (auto i = 0; i < inner_size; i++) + mean2 += pow[i] / inner_size; + + float add = mean2 + epsilon; + float sqrt = std::sqrt(add); + + std::vector div(inner_size, 0.f); + for (auto i = 0; i < inner_size; i++) + div[i] = sub[i] / sqrt; + + for (auto i = 0; i < inner_size; i++) + dest[i] = div[i] * scale[i] + bias[i]; + } + + return ok(); +} \ No newline at end of file diff --git a/src/kernels/cpu/reference/nnil.cpp b/src/kernels/cpu/reference/nnil.cpp index ec66a0ab89..a417a6a704 100644 --- a/src/kernels/cpu/reference/nnil.cpp +++ b/src/kernels/cpu/reference/nnil.cpp @@ -78,6 +78,9 @@ result reference::nnil_unary_method(const float *input, float *output, siz case nnil_floor: stack.push(floorf(stack.pop())); break; + case nnil_erf: + 
stack.push(erff(stack.pop())); + break; case nnil_log: stack.push(logf(stack.pop())); break; diff --git a/src/kernels/cpu/reference/reduce.cpp b/src/kernels/cpu/reference/reduce.cpp index ced8f804f5..da5a3f69dd 100644 --- a/src/kernels/cpu/reference/reduce.cpp +++ b/src/kernels/cpu/reference/reduce.cpp @@ -71,6 +71,9 @@ template result reference::reduce(reduce_op_t op, float init_value, template result reference::reduce(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; +template result reference::reduce(reduce_op_t op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; + template result reference::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept diff --git a/src/kernels/cpu/reference/reduce_arg.cpp b/src/kernels/cpu/reference/reduce_arg.cpp index 400b7289ba..e24ee758f5 100644 --- a/src/kernels/cpu/reference/reduce_arg.cpp +++ b/src/kernels/cpu/reference/reduce_arg.cpp @@ -92,7 +92,7 @@ result reference::reduce_arg(reduce_arg_op_t op, const float *input, T *ou return reduce_arg_impl([](float a, float b) { return a < b; }, std::numeric_limits::max(), input, output, in_shape, out_shape, in_strides, out_strides, axes, keep_dims, select_last_idx, context); case reduce_arg_max: - return reduce_arg_impl([](float a, float b) { return a > b; }, std::numeric_limits::min(), + return reduce_arg_impl([](float a, float b) { return a > b; }, std::numeric_limits::lowest(), input, output, 
in_shape, out_shape, in_strides, out_strides, axes, keep_dims, select_last_idx, context); default: return err(std::errc::not_supported); diff --git a/src/kernels/cpu/reference/space_to_batch.cpp b/src/kernels/cpu/reference/space_to_batch.cpp new file mode 100644 index 0000000000..fc732ef578 --- /dev/null +++ b/src/kernels/cpu/reference/space_to_batch.cpp @@ -0,0 +1,112 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +namespace +{ +runtime_shape_t get_padded_shape(const runtime_shape_t &in_shape, const runtime_paddings_t &paddings) +{ + runtime_shape_t out_shape(in_shape.size()); + for (size_t i = 0; i < in_shape.size(); i++) + out_shape[i] = (size_t)((int32_t)in_shape[i] + paddings[i].sum() + (in_shape[i] - 1) * paddings[i].interior); + return out_shape; +} + +inline runtime_shape_t get_transposed_shape(const runtime_shape_t &input_shape, const runtime_shape_t &perm) +{ + runtime_shape_t new_shape(input_shape.size()); + for (size_t i = 0; i < new_shape.size(); i++) + new_shape[i] = input_shape[perm[i]]; + return new_shape; +} + +template +result space_to_batch_impl(datatype_t dt, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &block_shape, + const 
runtime_paddings_t &paddings, const runtime_shape_t &in_strides, [[maybe_unused]] const runtime_shape_t &out_strides, kernel_context &context) noexcept +{ + auto in_shape_size = in_shape.size(); + auto spatial_size = block_shape.size(); + auto new_paddings = runtime_paddings_t(in_shape_size, { 0, 0, 0 }); + for (size_t i = 0; i < spatial_size; ++i) + { + new_paddings[in_shape_size - spatial_size + i] = paddings[i]; + } + auto pad_out_shape = get_padded_shape(in_shape, new_paddings); + auto pad_output = std::make_unique(compute_size(pad_out_shape)); + auto pad_out_strides = get_default_strides(pad_out_shape); + scalar pad_value(0); + + try_(reference::pad(dt, reinterpret_cast(input), + reinterpret_cast(pad_output.get()), in_shape, in_strides, + pad_out_strides, new_paddings, + pad_mode_t::pad_constant, + pad_value, context)); + + runtime_shape_t new_shape; + new_shape.reserve(in_shape_size + spatial_size); + new_shape.assign(pad_out_shape.begin(), pad_out_shape.begin() + in_shape_size - spatial_size); + + runtime_shape_t perms(in_shape_size - spatial_size); + perms.reserve(in_shape_size + spatial_size); + std::iota(perms.begin(), perms.begin() + in_shape_size - spatial_size, 0); + + runtime_shape_t spatial_perms; + spatial_perms.reserve(spatial_size); + + for (size_t i = 0; i < spatial_size; i++) + { + size_t idx = in_shape_size - spatial_size + i; + perms.push_back(new_shape.size()); + new_shape.push_back(pad_out_shape[idx] / block_shape[i]); + + spatial_perms.push_back(new_shape.size()); + new_shape.push_back(block_shape[i]); + } + + perms.insert(perms.begin(), spatial_perms.begin(), spatial_perms.end()); + + auto tp_shape = get_transposed_shape(new_shape, perms); + auto tp_stride = get_default_strides(tp_shape); + try_(reference::transpose(dt, reinterpret_cast(pad_output.get()), reinterpret_cast(output), new_shape, perms, get_default_strides(new_shape), tp_stride, context)); + return ok(); +} +} + +#define SPACE_TO_BATCH_IMPL(size, type) \ + case size: \ + return 
space_to_batch_impl(dt, reinterpret_cast(input), reinterpret_cast(output), in_shape, block_shape, crops, in_strides, out_strides, context) + +result reference::space_to_batch(datatype_t dt, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept +{ + switch (runtime::get_bytes(dt)) + { + SPACE_TO_BATCH_IMPL(1, uint8_t); + SPACE_TO_BATCH_IMPL(2, uint16_t); + SPACE_TO_BATCH_IMPL(4, uint32_t); + SPACE_TO_BATCH_IMPL(8, uint64_t); + default: + return err(std::errc::not_supported); + } +} diff --git a/src/kernels/cpu/reference/ternary.cpp b/src/kernels/cpu/reference/ternary.cpp index c8966a4135..e2f341396c 100644 --- a/src/kernels/cpu/reference/ternary.cpp +++ b/src/kernels/cpu/reference/ternary.cpp @@ -27,6 +27,16 @@ template result reference::ternary(const float *input_a, const floa const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, const runtime_shape_t &out_strides) noexcept; +template result reference::ternary(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + +template result reference::ternary(const float *input_a, const int32_t *input_b, const int32_t *input_c, int32_t *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + template result reference::ternary(const float *input_a, const T 
*input_b, const T *input_c, T *output, const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, diff --git a/src/kernels/cpu/reference/tflite_detection_postprocess.cpp b/src/kernels/cpu/reference/tflite_detection_postprocess.cpp new file mode 100644 index 0000000000..05251d1d6a --- /dev/null +++ b/src/kernels/cpu/reference/tflite_detection_postprocess.cpp @@ -0,0 +1,376 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::kernels; +using namespace nncase::kernels::cpu; +using namespace nncase::kernels::cpu::reference; + +template result reference::tflite_detection_postprocess(const float *boxes, const float *scores, const float *anchors, float *output_locations, float *output_classes, float *output_scores, float *output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept; + +template +result reference::tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept +{ + struct CenterSizeEncoding + { + float y; + float x; + float h; + float w; + }; + struct BoxCornerEncoding + { + float ymin; + float xmin; + float ymax; + float xmax; + }; + struct BoxInfo + { + int index; + float score; + }; + + auto compute_iou = [&](const std::vector &box, const int &i, const int &j) { + auto &box_i = box[i]; + auto &box_j = box[j]; + const float area_i = (box_i.ymax - box_i.ymin) * (box_i.xmax - box_i.xmin); 
+ const float area_j = (box_j.ymax - box_j.ymin) * (box_j.xmax - box_j.xmin); + if (area_i <= 0 || area_j <= 0) + return 0.f; + const float intersection_y_min = std::max(box_i.ymin, box_j.ymin); + const float intersection_x_min = std::max(box_i.xmin, box_j.xmin); + const float intersection_y_max = std::min(box_i.ymax, box_j.ymax); + const float intersection_x_max = std::min(box_i.xmax, box_j.xmax); + const float intersection_area = std::max(intersection_y_max - intersection_y_min, 0.0) * std::max(intersection_x_max - intersection_x_min, 0.0); + return intersection_area / (area_i + area_j - intersection_area); + }; + + const auto num_boxes = (int)anchors_shape[0]; + const auto num_classes_with_background = (int)scores_shape[2]; // num_classes + background + const auto num_detections_per_class = std::min(detections_per_class, max_detections); + int label_offset = num_classes_with_background - num_classes; + // DecodeCenterSizeBoxes: get decoded_boxes + std::vector decoded_boxes(boxes_shape[1]); + { + CenterSizeEncoding box_center_size; + CenterSizeEncoding scale_values { y_scale, x_scale, h_scale, w_scale }; + CenterSizeEncoding anchor; + + for (int index = 0; index < num_boxes; index++) + { + const auto box_encoding_index = index * boxes_shape[2]; + box_center_size = *reinterpret_cast(boxes + box_encoding_index); + anchor = *reinterpret_cast(anchors + box_encoding_index); + + auto y_center = static_cast(static_cast(box_center_size.y) / static_cast(scale_values.y) * static_cast(anchor.h) + static_cast(anchor.y)); + auto x_center = static_cast(static_cast(box_center_size.x) / static_cast(scale_values.x) * static_cast(anchor.w) + static_cast(anchor.x)); + auto half_h = static_cast(0.5 * (std::exp(static_cast(box_center_size.h) / static_cast(scale_values.h))) * static_cast(anchor.h)); + auto half_w = static_cast(0.5 * (std::exp(static_cast(box_center_size.w) / static_cast(scale_values.w))) * static_cast(anchor.w)); + decoded_boxes[index].ymin = y_center - half_h; + 
decoded_boxes[index].xmin = x_center - half_w; + decoded_boxes[index].ymax = y_center + half_h; + decoded_boxes[index].xmax = x_center + half_w; + } + } + // NMS MultiClass + { + if (use_regular_non_max_suppression) + { + // NMS Regular + int sorted_indices_size = 0; + std::vector box_info_after_regular_nms(max_detections + num_detections_per_class); + std::vector num_selected(num_classes); + + // compute nms + std::vector class_scores(num_boxes); + std::vector selected; + selected.reserve(num_detections_per_class); + + for (auto col = 0; col < num_classes - 1; col++) + { + const float *scores_base = scores + col + label_offset; + for (int row = 0; row < num_boxes; row++) + { + // Get scores of boxes corresponding to all anchors for single class + class_scores[row] = *scores_base; + scores_base += num_classes_with_background; + } + // Perform non-maximal suppression on single class + selected.clear(); + + // NMS SingleClass + { + std::vector keep_indices; + std::vector keep_scores; + // select detection box score above score threshold + { + for (size_t i = 0; i < class_scores.size(); i++) + { + if (class_scores[i] >= nms_score_threshold) + { + keep_scores.emplace_back(class_scores[i]); + keep_indices.emplace_back(i); + } + } + } + + int num_scores_kept = (int)keep_scores.size(); + std::vector sorted_indices; + sorted_indices.resize(num_scores_kept); + // DecreasingArgSort + { + std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0); + std::stable_sort( + sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, + [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; }); + } + + const int output_size = std::min(num_scores_kept, max_detections); + selected.clear(); + int num_active_candidate = num_scores_kept; + std::vector active_box_candidate(num_scores_kept, 1); + for (int i = 0; i < num_scores_kept; ++i) + { + if (num_active_candidate == 0 || (int)selected.size() >= output_size) + break; + if 
(active_box_candidate[i] == 1) + { + selected.push_back(keep_indices[sorted_indices[i]]); + active_box_candidate[i] = 0; + num_active_candidate--; + } + else + { + continue; + } + for (int j = i + 1; j < num_scores_kept; ++j) + { + if (active_box_candidate[j] == 1) + { + + float iou = compute_iou( + decoded_boxes, keep_indices[sorted_indices[i]], + keep_indices[sorted_indices[j]]); + + if (iou > nms_iou_threshold) + { + active_box_candidate[j] = 0; + num_active_candidate--; + } + } + } + } + } + // end NMS SingleClass + + if (selected.empty()) + { + continue; + } + for (size_t i = 0; i < selected.size(); ++i) + { + box_info_after_regular_nms[sorted_indices_size + i].score = class_scores[selected[i]]; + box_info_after_regular_nms[sorted_indices_size + i].index = (selected[i] * num_classes_with_background + col + label_offset); + } + + // In-place merge the original boxes and new selected boxes which are both + // sorted by scores. + std::inplace_merge(box_info_after_regular_nms.begin(), box_info_after_regular_nms.begin() + sorted_indices_size, + box_info_after_regular_nms.begin() + sorted_indices_size + selected.size(), + [](const BoxInfo &a, const BoxInfo &b) { return a.score >= b.score; }); + + sorted_indices_size = std::min(sorted_indices_size + static_cast(selected.size()), max_detections); + } + // end compute nms result + + // Allocate output tensors + for (int output_box_index = 0; output_box_index < max_detections; output_box_index++) + { + if (output_box_index < sorted_indices_size) + { + const int anchor_index = floor( + box_info_after_regular_nms[output_box_index].index / num_classes_with_background); + const int class_index = box_info_after_regular_nms[output_box_index].index - anchor_index * num_classes_with_background - label_offset; + const float selected_score = box_info_after_regular_nms[output_box_index].score; + // detection_boxes + reinterpret_cast(output_locations)[output_box_index] = decoded_boxes[anchor_index]; + // detection_classes + 
output_classes[output_box_index] = class_index; + // detection_scores + output_scores[output_box_index] = selected_score; + } + else + { + // detection_boxes + reinterpret_cast(output_locations)[output_box_index] = { 0.0f, 0.0f, 0.0f, 0.0f }; + // detection_classes + output_classes[output_box_index] = 0.0f; + // detection_scores + output_scores[output_box_index] = 0.0f; + } + } + output_num_detections[0] = sorted_indices_size; + box_info_after_regular_nms.clear(); + } + else + { + // Fast NMS + + const int max_categories_per_anchor = max_classes_per_detection; + const int num_categories_per_anchor = std::min(max_categories_per_anchor, num_classes); + + std::vector max_scores; + max_scores.resize(num_boxes); + std::vector sorted_class_indices; + sorted_class_indices.resize(num_boxes * num_categories_per_anchor); + + for (int row = 0; row < num_boxes; row++) + { + const float *box_scores = scores + row * num_classes_with_background + label_offset; + int *class_indices = sorted_class_indices.data() + row * num_categories_per_anchor; + + // DecreasingPartialArgSort + if (num_categories_per_anchor == 1) + { + auto arg_max_vector = [&](const T *input_data, int size) { + T max_value = input_data[0]; + int max_index = 0; + for (int i = 1; i < size; ++i) + { + // const T curr_value = input_data[i]; + if (input_data[i] > max_value) + { + max_value = input_data[i]; + max_index = i; + } + } + return max_index; + }; + class_indices[0] = arg_max_vector(box_scores, num_classes); + } + else + { + std::iota(class_indices, class_indices + num_classes, 0); + std::partial_sort( + class_indices, class_indices + num_categories_per_anchor, class_indices + num_classes, + [&box_scores](const int i, const int j) { return box_scores[i] > box_scores[j]; }); + } + // end DecreasingPartialArgSort + + max_scores[row] = box_scores[class_indices[0]]; + } + std::vector selected; + // NMS SingleClass + { + std::vector keep_indices; + std::vector keep_scores; + // select detection box score above 
score threshold + { + for (size_t i = 0; i < max_scores.size(); i++) + { + if (max_scores[i] >= nms_score_threshold) + { + keep_scores.emplace_back(max_scores[i]); + keep_indices.emplace_back(i); + } + } + } + + int num_scores_kept = (int)keep_scores.size(); + std::vector sorted_indices; + sorted_indices.resize(num_scores_kept); + // DecreasingArgSort + { + std::iota(sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, 0); + std::stable_sort( + sorted_indices.begin(), sorted_indices.begin() + num_scores_kept, + [&keep_scores](const int i, const int j) { return keep_scores[i] > keep_scores[j]; }); + } + const int output_size = std::min(num_scores_kept, max_detections); + selected.clear(); + int num_active_candidate = num_scores_kept; + std::vector active_box_candidate(num_scores_kept, 1); + for (int i = 0; i < num_scores_kept; ++i) + { + if (num_active_candidate == 0 || (int)selected.size() >= output_size) + break; + if (active_box_candidate[i] == 1) + { + selected.push_back(keep_indices[sorted_indices[i]]); + active_box_candidate[i] = 0; + num_active_candidate--; + } + else + { + continue; + } + for (int j = i + 1; j < num_scores_kept; ++j) + { + if (active_box_candidate[j] == 1) + { + + float iou = compute_iou( + decoded_boxes, keep_indices[sorted_indices[i]], + keep_indices[sorted_indices[j]]); + if (iou > nms_iou_threshold) + { + active_box_candidate[j] = 0; + num_active_candidate--; + } + } + } + } + } + // end NMS SingleClass + + // Allocate output tensors + int output_box_index = 0; + for (const auto &selected_index : selected) + { + const float *box_scores = scores + selected_index * num_classes_with_background + label_offset; + const int *class_indices = sorted_class_indices.data() + selected_index * num_categories_per_anchor; + + for (int col = 0; col < num_categories_per_anchor; ++col) + { + int box_offset = max_categories_per_anchor * output_box_index + col; + // detection_boxes + reinterpret_cast(output_locations)[box_offset] = 
decoded_boxes[selected_index]; + // detection_classes + output_classes[box_offset] = class_indices[col]; + // detection_scores + output_scores[box_offset] = box_scores[class_indices[col]]; + } + output_box_index++; + } + output_num_detections[0] = output_box_index; + } + } + + return ok(); +} diff --git a/src/kernels/cpu/reference/unary.cpp b/src/kernels/cpu/reference/unary.cpp index a9c242d483..53c07b0aad 100644 --- a/src/kernels/cpu/reference/unary.cpp +++ b/src/kernels/cpu/reference/unary.cpp @@ -62,6 +62,7 @@ result reference::unary(unary_op_t op, const float *input, float *output, UNARY_IMPL(unary_sqrt, sqrtf); UNARY_IMPL(unary_square, [](float v) { return v * v; }); UNARY_IMPL(unary_tanh, tanhf); + UNARY_IMPL(unary_erf, erff); default: return err(std::errc::not_supported); } diff --git a/src/kernels/tensor_compute.cpp b/src/kernels/tensor_compute.cpp index e60decc722..9e29a234e7 100644 --- a/src/kernels/tensor_compute.cpp +++ b/src/kernels/tensor_compute.cpp @@ -226,7 +226,7 @@ result kernels::binary(binary_op_t op, const T *input_a, const T *input_b, result kernels::unary(unary_op_t op, const float *input, float *output, const runtime_shape_t &shape, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept { - if (is_contiguous(shape, in_strides) && is_contiguous(shape, out_strides) && is_optimized_unary_op(op)) + if (is_contiguous(shape, in_strides) && is_contiguous(shape, out_strides)) { // optimization return cpu::optimized::unary(op, input, output, shape, in_strides, out_strides, context); @@ -240,11 +240,15 @@ template result kernels::reduce(reduce_op_t op, float init_value, c template result kernels::reduce(reduce_op_t op, int32_t init_value, const int32_t *input, int32_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; +template result kernels::reduce(reduce_op_t 
op, int64_t init_value, const int64_t *input, int64_t *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, + const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept; + template result kernels::reduce(reduce_op_t op, T init_value, const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &axis, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, bool keep_dims, kernel_context &context) noexcept { - return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context); + // return cpu::reference::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context); + return cpu::optimized::reduce(op, init_value, input, output, in_shape, axis, in_strides, out_strides, keep_dims, context); } template result kernels::reduce_arg(reduce_arg_op_t op, const float *input, int32_t *output, const runtime_shape_t &in_shape, @@ -433,13 +437,18 @@ template result kernels::ternary(const float *input_a, const float const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, const runtime_shape_t &out_strides) noexcept; +template result kernels::ternary(const float *input_a, const int64_t *input_b, const int64_t *input_c, int64_t *output, + const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, + const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, + const runtime_shape_t &out_strides) noexcept; + template result kernels::ternary(const float *input_a, const T *input_b, const T *input_c, T *output, const runtime_shape_t &in_a_shape, const runtime_shape_t &in_a_strides, const runtime_shape_t &in_b_shape, const runtime_shape_t &in_b_strides, const runtime_shape_t &in_c_shape, const runtime_shape_t &in_c_strides, const 
runtime_shape_t &out_strides) noexcept { - return cpu::reference::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, + return cpu::optimized::ternary(input_a, input_b, input_c, output, in_a_shape, in_a_strides, in_b_shape, in_b_strides, in_c_shape, in_c_strides, out_strides); } @@ -467,3 +476,72 @@ result kernels::trilu(const T *input, T *output, const runtime_shape_t &in { return cpu::reference::trilu(input, output, in_shape, upper, k); } + +template result kernels::gru(const float *input, const float *w, const float *r, const float *b, float *initial_h, float *output, float *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept; + +template +result kernels::gru(const T *input, const T *w, const T *r, const T *b, T *initial_h, T *output, T *output_h, const runtime_shape_t &input_shape, const runtime_shape_t &w_shape, int mode, bool linear_before_reset) noexcept +{ + return cpu::reference::gru(input, w, r, b, initial_h, output, output_h, input_shape, w_shape, mode, linear_before_reset); +} + +template result kernels::tflite_detection_postprocess(const float *boxes, const float *scores, const float *anchors, float *output_locations, float *output_classes, float *output_scores, float *output_num_detections, + const runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept; + +template +result kernels::tflite_detection_postprocess(const T *boxes, const T *scores, const T *anchors, T *output_locations, T *output_classes, T *output_scores, T *output_num_detections, + const 
runtime_shape_t &boxes_shape, const runtime_shape_t &scores_shape, const runtime_shape_t &anchors_shape, + const int32_t max_detections, const int32_t max_classes_per_detection, const int32_t detections_per_class, + const bool use_regular_non_max_suppression, const float nms_score_threshold, const float nms_iou_threshold, + const int32_t num_classes, const float y_scale, const float x_scale, const float h_scale, const float w_scale) noexcept +{ + return cpu::reference::tflite_detection_postprocess(boxes, scores, anchors, output_locations, output_classes, output_scores, output_num_detections, + boxes_shape, scores_shape, anchors_shape, + max_detections, max_classes_per_detection, detections_per_class, + use_regular_non_max_suppression, nms_score_threshold, nms_iou_threshold, + num_classes, y_scale, x_scale, h_scale, w_scale); +} + +result kernels::space_to_batch(datatype_t type, const gsl::byte *input, gsl::byte *output, const runtime_shape_t &in_shape, + const runtime_shape_t &block_shape, const runtime_paddings_t &crops, const runtime_shape_t &in_strides, const runtime_shape_t &out_strides, kernel_context &context) noexcept +{ + return cpu::reference::space_to_batch(type, input, output, in_shape, block_shape, crops, in_strides, out_strides, context); +} + +template result kernels::gather_elements(const float *input, const int64_t *indices, float *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const int axis) noexcept; + +template +result kernels::gather_elements(const TI *input, const TK *indices, TI *output, const runtime_shape_t &in_shape, + const runtime_shape_t &indices_shape, const int axis) noexcept +{ + return cpu::reference::gather_elements(input, indices, output, in_shape, indices_shape, axis); +} + +template result kernels::instancenorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, float epsilon) noexcept; + +template +result kernels::instancenorm(const T *input, T 
*output, T *scale, T *bias, const runtime_shape_t &in_shape, float epsilon) noexcept +{ + return cpu::optimized::instancenorm(input, output, scale, bias, in_shape, epsilon); +} + +template result kernels::layernorm(const float *input, float *output, float *scale, float *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept; + +template +result kernels::layernorm(const T *input, T *output, T *scale, T *bias, const runtime_shape_t &in_shape, int32_t axis, float epsilon) noexcept +{ + // return cpu::reference::layernorm(input, output, scale, bias, in_shape, axis, epsilon); + return cpu::optimized::layernorm(input, output, scale, bias, in_shape, axis, epsilon); +} + +template result kernels::compress(const float *input, const uint8_t *condition, float *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept; + +template +result kernels::compress(const T *input, const uint8_t *condition, T *output, const runtime_shape_t &input_shape, const runtime_shape_t &condition_shape, const int axis) noexcept +{ + return cpu::reference::compress(input, condition, output, input_shape, condition_shape, axis); +} diff --git a/src/nncase/compiler.cpp b/src/nncase/compiler.cpp index c6c6e95466..d75a2c74ff 100644 --- a/src/nncase/compiler.cpp +++ b/src/nncase/compiler.cpp @@ -488,13 +488,16 @@ class compiler_impl : public compiler pmgr.add_pass(); pmgr.add_pass(); pmgr.add_pass(); + pmgr.add_pass(); transform_pass pass("optimize_copy"); pass.emplace(); pass.emplace(); pass.emplace(); pass.emplace(); - pmgr.add_pass(std::move(pass)); }); + pass.emplace(); + pmgr.add_pass(std::move(pass)); + }); } void optimize_target_dependent_after_buffer_fusion(ir::graph &graph) diff --git a/src/runtime/op_profile.cpp b/src/runtime/op_profile.cpp index 0cd8633951..6f717f6d51 100644 --- a/src/runtime/op_profile.cpp +++ b/src/runtime/op_profile.cpp @@ -32,13 +32,34 @@ void op_profile::print() [=](std::pair &a, std::pair &b) { 
return a.second > b.second; }); std::cout << "stackvm OPs profile" << std::endl; - std::cout << std::setw(24) << std::left << "stackvm tensor op" << std::setw(12) << std::left << "timing(ms)" - << std::setw(12) << std::left << "percent(%)" << std::endl; + std::cout << "|" << std::setw(30) << std::left << "stackvm tensor op" + << "|" << std::setw(12) << std::left << "timing(ms)" + << "|" << std::setw(12) << std::left << "percent(%)" + << "|" << std::endl; + + std::cout << "|" << std::setw(30) << std::left << "---" + << "|" << std::setw(12) << std::left << "---" + << "|" << std::setw(12) << std::left << "---" + << "|" << std::endl; +#if !defined(__riscv) + double convert_number = 1.0f; +#else + double convert_number = RISCVFREQUENCY / 1000.0f; +#endif + for (auto e : v) { - std::cout << std::setw(24) << std::left << e.first << std::setw(12) << std::left << e.second - << std::setw(12) << std::left << e.second / total * 100 << std::endl; + std::cout << "|" << std::setw(30) << std::left << e.first + << "|" << std::setw(12) << std::left << e.second / convert_number + << "|" << std::setw(12) << std::left << e.second / total * 100 + << "|" << std::endl; } - std::cout << std::setw(24) << std::left << "total" << std::setw(12) << std::left << total - << std::setw(12) << std::left << total / total * 100 << std::endl; + + std::cout << "|" << std::setw(30) << std::left << "total" + << "|" << std::setw(12) << std::left << total / convert_number + << "|" << std::setw(12) << std::left << total / total * 100 + << "|" << std::endl + << std::endl; + + op_timing_.clear(); } \ No newline at end of file diff --git a/src/runtime/stackvm/CMakeLists.txt b/src/runtime/stackvm/CMakeLists.txt index a26e7675c8..7aaeaf3755 100644 --- a/src/runtime/stackvm/CMakeLists.txt +++ b/src/runtime/stackvm/CMakeLists.txt @@ -1,48 +1,55 @@ -cmake_minimum_required (VERSION 3.13) +cmake_minimum_required(VERSION 3.13) set(SRCS runtime_module.cpp - runtime_function.cpp - op_reader.cpp - evaluate_stack.cpp - 
ops/control.cpp - ops/loadstore.cpp - ops/stack.cpp - ops/scalar.cpp - ops/conversion.cpp - ops/tensor.batch_to_space.cpp - ops/tensor.binary.cpp - ops/tensor.broadcast.cpp - ops/tensor.call.cpp - ops/tensor.compare.cpp - ops/tensor.conv2d.cpp - ops/tensor.convert.cpp - ops/tensor.copy.cpp - ops/tensor.cumsum.cpp - ops/tensor.dequantize.cpp - ops/tensor.gather.cpp - ops/tensor.gather_nd.cpp - ops/tensor.hardmax.cpp - ops/tensor.lut1d.cpp - ops/tensor.matmul.cpp - ops/tensor.onehot.cpp - ops/tensor.pad.cpp - ops/tensor.quantize.cpp - ops/tensor.random_normal.cpp - ops/tensor.random_uniform.cpp - ops/tensor.reduce.cpp - ops/tensor.reduce_arg.cpp - ops/tensor.reduce_prod.cpp - ops/tensor.reduce_window2d.cpp - ops/tensor.resize_image.cpp - ops/tensor.roi_align.cpp - ops/tensor.sigmoid.cpp - ops/tensor.slice.cpp - ops/tensor.softmax.cpp - ops/tersor.ternary.cpp - ops/tensor.topk.cpp - ops/tensor.transpose.cpp - ops/tensor.trilu.cpp - ops/tensor.unary.cpp) + runtime_function.cpp + op_reader.cpp + evaluate_stack.cpp + ops/control.cpp + ops/loadstore.cpp + ops/stack.cpp + ops/scalar.cpp + ops/conversion.cpp + ops/tensor.batch_to_space.cpp + ops/tensor.binary.cpp + ops/tensor.broadcast.cpp + ops/tensor.call.cpp + ops/tensor.compare.cpp + ops/tensor.compress.cpp + ops/tensor.conv2d.cpp + ops/tensor.convert.cpp + ops/tensor.copy.cpp + ops/tensor.cumsum.cpp + ops/tensor.dequantize.cpp + ops/tensor.gather.cpp + ops/tensor.gather_elements.cpp + ops/tensor.gather_nd.cpp + ops/tensor.gru.cpp + ops/tensor.hardmax.cpp + ops/tensor.lut1d.cpp + ops/tensor.matmul.cpp + ops/tensor.onehot.cpp + ops/tensor.pad.cpp + ops/tensor.quantize.cpp + ops/tensor.random_normal.cpp + ops/tensor.random_uniform.cpp + ops/tensor.reduce.cpp + ops/tensor.reduce_arg.cpp + ops/tensor.reduce_prod.cpp + ops/tensor.reduce_window2d.cpp + ops/tensor.resize_image.cpp + ops/tensor.roi_align.cpp + ops/tensor.sigmoid.cpp + ops/tensor.slice.cpp + ops/tensor.softmax.cpp + ops/tensor.space_to_batch.cpp + 
ops/tersor.ternary.cpp + ops/tensor.topk.cpp + ops/tensor.transpose.cpp + ops/tensor.trilu.cpp + ops/tensor.tflite_detection_postprocess.cpp + ops/tensor.unary.cpp + ops/tensor.layernorm.cpp + ops/tensor.instancenorm.cpp) if (BUILDING_RUNTIME) add_library(runtime_stackvm OBJECT ${SRCS}) @@ -50,9 +57,9 @@ if (BUILDING_RUNTIME) target_link_libraries(runtime_stackvm PRIVATE kernels) set_property(TARGET runtime_stackvm PROPERTY POSITION_INDEPENDENT_CODE ON) install(TARGETS runtime_stackvm EXPORT nncaseruntimeTargets) -else() +else () add_library(simulator_stackvm OBJECT ${SRCS}) target_link_libraries(simulator_stackvm PUBLIC simulator) target_link_libraries(simulator_stackvm PRIVATE kernels) set_property(TARGET simulator_stackvm PROPERTY POSITION_INDEPENDENT_CODE ON) -endif() +endif () diff --git a/src/runtime/stackvm/evaluate_stack.h b/src/runtime/stackvm/evaluate_stack.h index e408c2bc2a..c9a7f738c9 100644 --- a/src/runtime/stackvm/evaluate_stack.h +++ b/src/runtime/stackvm/evaluate_stack.h @@ -85,6 +85,7 @@ class stack_entry int8_t as_i1() const noexcept { return (int8_t)i_; } int16_t as_i2() const noexcept { return (int16_t)i_; } int32_t as_i4() const noexcept { return (int32_t)i_; } + int64_t as_i8() const noexcept { return (int64_t)i_; } uintptr_t as_u() const noexcept { return (uintptr_t)i_; } intptr_t as_i() const noexcept { return i_; } diff --git a/src/runtime/stackvm/op_reader.cpp b/src/runtime/stackvm/op_reader.cpp index c7509e4480..e36fcabf11 100644 --- a/src/runtime/stackvm/op_reader.cpp +++ b/src/runtime/stackvm/op_reader.cpp @@ -1,4 +1,4 @@ -/* This file is generated by tools/stackvm_gen/IsaGen at 4/25/2022 3:29:27 PM +08:00. +/* This file is generated by tools/stackvm_gen/IsaGen at 2023/5/9 下午5:18:43 +08:00. * * Copyright 2019-2021 Canaan Inc. 
* @@ -232,6 +232,13 @@ result op_visitor::next() noexcept #endif return visit(op_reader()(reader_)); } + case tensor_function_t::SPACE_TO_BATCH: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_space_to_batch"); +#endif + return visit(op_reader()(reader_)); + } case tensor_function_t::TERNARY: { #if defined ENABLE_OP_PROFILE @@ -267,6 +274,48 @@ result op_visitor::next() noexcept #endif return visit(op_reader()(reader_)); } + case tensor_function_t::GRU: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_gru"); +#endif + return visit(op_reader()(reader_)); + } + case tensor_function_t::TFLITE_DETECTION_POSTPROCESS: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_tflite_detection_postprocess"); +#endif + return visit(op_reader()(reader_)); + } + case tensor_function_t::LAYER_NORMALIZATION: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_layer_normalization"); +#endif + return visit(op_reader()(reader_)); + } + case tensor_function_t::COMPRESS: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_compress"); +#endif + return visit(op_reader()(reader_)); + } + case tensor_function_t::GATHER_ELEMENTS: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_gather_elements"); +#endif + return visit(op_reader()(reader_)); + } + case tensor_function_t::INSTANCE_NORMALIZATION: + { +#if defined ENABLE_OP_PROFILE + op_profile st("tensor_instance_normalization"); +#endif + return visit(op_reader()(reader_)); + } default: break; } @@ -480,8 +529,7 @@ result op_visitor::visit(gsl::span text) noexcept try_(next()); #ifdef ENABLE_OP_PROFILE - op_profile profile_time; - profile_time.print(); + op_profile::print(); #endif return ok(); diff --git a/src/runtime/stackvm/ops/tensor.compress.cpp b/src/runtime/stackvm/ops/tensor.compress.cpp new file mode 100644 index 0000000000..0e1a9688ed --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.compress.cpp @@ -0,0 +1,34 @@ +/* Copyright 2019-2021 Canaan Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_compress_op_t &op) noexcept +{ + try_var(output, pop_addr()); + try_var(condition, pop_addr()); + try_var(input, pop_addr()); + try_var(input_shape, module().shape_reg(op.input_shape_src)); + try_var(condition_shape, module().shape_reg(op.condition_shape_src)); + + return kernels::compress(reinterpret_cast(input), reinterpret_cast(condition), + reinterpret_cast(output), input_shape, condition_shape, op.axis); +} diff --git a/src/runtime/stackvm/ops/tensor.gather_elements.cpp b/src/runtime/stackvm/ops/tensor.gather_elements.cpp new file mode 100644 index 0000000000..b668a35911 --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.gather_elements.cpp @@ -0,0 +1,35 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_gather_elements_op_t &op) noexcept +{ + try_var(output, pop_addr()); + try_var(indices, pop_addr()); + try_var(input, pop_addr()); + + try_var(in_shape, module().shape_reg(op.input_shape_src)); + try_var(indices_shape, module().shape_reg(op.indices_shape_src)); + + return kernels::gather_elements(reinterpret_cast(input), reinterpret_cast(indices), + reinterpret_cast(output), in_shape, indices_shape, op.axis); +} \ No newline at end of file diff --git a/src/runtime/stackvm/ops/tensor.gru.cpp b/src/runtime/stackvm/ops/tensor.gru.cpp new file mode 100644 index 0000000000..e80c5e44ae --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.gru.cpp @@ -0,0 +1,41 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_gru_op_t &op) noexcept +{/* Executes the GRU op. Pops seven addresses with pop_addr() in the order output_h, output, initial_h, b, r, w, input; reads the input and w shapes from the module shape registers selected by op.input_shape_src and op.w_shape_src; returns whatever kernels::gru returns, passing op.direction and op.linear_before_reset through. try_var presumably returns early with the error when its expression fails - confirm against the macro. NOTE(review): the include targets and the template arguments of result and reinterpret_cast are absent in this copy of the patch (angle-bracket text looks stripped); restore them from the upstream file. */ + try_var(output_h, pop_addr()); + try_var(output, pop_addr()); + try_var(initial_h, pop_addr()); + try_var(b, pop_addr()); + try_var(r, pop_addr()); + try_var(w, pop_addr()); + try_var(input, pop_addr()); + + try_var(in_shape, module().shape_reg(op.input_shape_src)); + try_var(w_shape, module().shape_reg(op.w_shape_src)); + + return kernels::gru(reinterpret_cast(input), reinterpret_cast(w), + reinterpret_cast(r), reinterpret_cast(b), + reinterpret_cast(initial_h), reinterpret_cast(output), + reinterpret_cast(output_h), in_shape, w_shape, op.direction, op.linear_before_reset); +} \ No newline at end of file diff --git a/src/runtime/stackvm/ops/tensor.instancenorm.cpp b/src/runtime/stackvm/ops/tensor.instancenorm.cpp new file mode 100644 index 0000000000..ebcaa1ced7 --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.instancenorm.cpp @@ -0,0 +1,42 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_instance_normalization_op_t &op) noexcept +{ + try_var(output, pop_addr()); + try_var(bias, pop_addr()); + try_var(scale, pop_addr()); + try_var(input, pop_addr()); + try_var(in_shape, module().shape_reg(op.input_shape)); + + switch (op.datatype) + { + case dt_float32: + return kernels::instancenorm(reinterpret_cast(input), reinterpret_cast(output), + reinterpret_cast(scale), reinterpret_cast(bias), in_shape, op.epsilon); + break; + default: + std::cerr << "unsupported dtype for instancenorm: " + std::string(datatype_names(op.datatype)); + return err(std::errc::invalid_argument); + } +} diff --git a/src/runtime/stackvm/ops/tensor.layernorm.cpp b/src/runtime/stackvm/ops/tensor.layernorm.cpp new file mode 100644 index 0000000000..547edd658c --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.layernorm.cpp @@ -0,0 +1,42 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_layer_normalization_op_t &op) noexcept +{ + try_var(output, pop_addr()); + try_var(bias, pop_addr()); + try_var(scale, pop_addr()); + try_var(input, pop_addr()); + try_var(in_shape, module().shape_reg(op.input_shape)); + + switch (op.datatype) + { + case dt_float32: + return kernels::layernorm(reinterpret_cast(input), reinterpret_cast(output), + reinterpret_cast(scale), reinterpret_cast(bias), in_shape, op.axis, op.epsilon); + break; + default: + std::cerr << "unsupported dtype for layernorm: " + std::string(datatype_names(op.datatype)); + return err(std::errc::invalid_argument); + } +} diff --git a/src/runtime/stackvm/ops/tensor.reduce.cpp b/src/runtime/stackvm/ops/tensor.reduce.cpp index 19d1f71a17..10b9b18a68 100644 --- a/src/runtime/stackvm/ops/tensor.reduce.cpp +++ b/src/runtime/stackvm/ops/tensor.reduce.cpp @@ -41,6 +41,10 @@ result stackvm_runtime_function::visit(const tensor_reduce_op_t &op) noexc return kernels::reduce(op.reduce_op, init_value.as_i4(), reinterpret_cast(input), reinterpret_cast(output), in_shape, axis, in_strides, out_strides, op.keep_dims, module().kernel_context()); break; + case dt_int64: + return kernels::reduce(op.reduce_op, init_value.as_i8(), reinterpret_cast(input), + reinterpret_cast(output), in_shape, axis, in_strides, out_strides, op.keep_dims, module().kernel_context()); + break; default: std::cerr << "unsupported dtype for reduce: " + std::string(datatype_names(op.datatype)) << std::endl; return err(std::errc::invalid_argument); diff --git a/src/runtime/stackvm/ops/tensor.space_to_batch.cpp b/src/runtime/stackvm/ops/tensor.space_to_batch.cpp new file mode 100644 index 0000000000..b8041042a2 --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.space_to_batch.cpp @@ -0,0 +1,34 @@ +/* Copyright 2019-2021 
Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../runtime_function.h" +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_space_to_batch_op_t &op) noexcept +{/* Executes the space_to_batch op. Pops two addresses with pop_addr() in the order output, input; reads the input shape, block shape, input strides and output strides from shape registers (op.rshape_src, op.rshape_block, op.rstride_src, op.rstride_dest) and a paddings register (op.rpad_crops, held in the local named crops); returns whatever kernels::space_to_batch returns, passing op.datatype and module().kernel_context(). try_var presumably returns early with the error when its expression fails - confirm against the macro. NOTE(review): the include target and the template arguments of result and reinterpret_cast are absent in this copy of the patch (angle-bracket text looks stripped); restore them from the upstream file. */ + try_var(output, pop_addr()); + try_var(input, pop_addr()); + try_var(in_shape, module().shape_reg(op.rshape_src)); + try_var(block_shape, module().shape_reg(op.rshape_block)); + try_var(crops, module().paddings_reg(op.rpad_crops)); + try_var(in_strides, module().shape_reg(op.rstride_src)); + try_var(out_strides, module().shape_reg(op.rstride_dest)); + + return kernels::space_to_batch(op.datatype, reinterpret_cast(input), reinterpret_cast(output), + in_shape, block_shape, crops, in_strides, out_strides, module().kernel_context()); +} diff --git a/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp b/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp new file mode 100644 index 0000000000..633b6305fa --- /dev/null +++ b/src/runtime/stackvm/ops/tensor.tflite_detection_postprocess.cpp @@ -0,0 +1,44 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "../runtime_function.h" +#include +#include +#include + +using namespace nncase; +using namespace nncase::runtime; +using namespace nncase::runtime::stackvm; + +result stackvm_runtime_function::visit(const tensor_tflite_detection_postprocess_op_t &op) noexcept +{/* Executes the tflite detection-postprocess op. Pops seven addresses with pop_addr() in the order output_num_detections, output_scores, output_classes, output_locations, anchor, score, box; reads the box, score and anchor shapes from the shape registers selected by op.box_shape_src, op.score_shape_src and op.anchor_shape_src; returns whatever kernels::tflite_detection_postprocess returns, passing the remaining op fields (detection limits, NMS thresholds, class count and the y/x/h/w scales) through unchanged. try_var presumably returns early with the error when its expression fails - confirm against the macro. NOTE(review): the include targets and the template arguments of result and reinterpret_cast are absent in this copy of the patch (angle-bracket text looks stripped); restore them from the upstream file. */ + try_var(output_num_detections, pop_addr()); + try_var(output_scores, pop_addr()); + try_var(output_classes, pop_addr()); + try_var(output_locations, pop_addr()); + try_var(anchor, pop_addr()); + try_var(score, pop_addr()); + try_var(box, pop_addr()); + + try_var(box_shape, module().shape_reg(op.box_shape_src)); + try_var(score_shape, module().shape_reg(op.score_shape_src)); + try_var(anchor_shape, module().shape_reg(op.anchor_shape_src)); + + return kernels::tflite_detection_postprocess(reinterpret_cast(box), reinterpret_cast(score), + reinterpret_cast(anchor), reinterpret_cast(output_locations), + reinterpret_cast(output_classes), reinterpret_cast(output_scores), + reinterpret_cast(output_num_detections), box_shape, score_shape, anchor_shape, op.max_detections, op.max_classes_per_detection, op.detections_per_class, + op.use_regular_non_max_suppression, op.nms_score_threshold, op.nms_iou_threshold, + op.num_classes, op.y_scale, op.x_scale, op.h_scale, op.w_scale); +} diff --git a/src/runtime/stackvm/runtime_function.cpp b/src/runtime/stackvm/runtime_function.cpp index 7942eec925..bd58bf0702 100644 --- a/src/runtime/stackvm/runtime_function.cpp +++ b/src/runtime/stackvm/runtime_function.cpp @@ -143,13 +143,19 @@ result stackvm_runtime_function::create_tensor(uintptr_t addr, 
d { hrt::memory_pool_t pool; uintptr_t physical_address = 0; - if (addr >= reinterpret_cast(module().data().begin()) - && addr < reinterpret_cast(module().data().end())) + auto data_span = module().data(); + auto rdata_span = module().rdata(); + + if (addr >= reinterpret_cast(data_span.begin()) + && addr < reinterpret_cast(data_span.end())) { - pool = hrt::pool_cpu_only; + auto &tensor = module().data_tensor(); + auto &block = static_cast(tensor.impl())->memory_block(); + pool = block.pool; + physical_address = block.physical_block.physical_address + (addr - block.virtual_address); } - else if (addr >= reinterpret_cast(module().rdata().begin()) - && addr < reinterpret_cast(module().rdata().end())) + else if (addr >= reinterpret_cast(rdata_span.begin()) + && addr < reinterpret_cast(rdata_span.end())) { pool = hrt::pool_cpu_only; } diff --git a/src/runtime/stackvm/runtime_function.h b/src/runtime/stackvm/runtime_function.h index 6b4ca2cce0..0e10304d41 100644 --- a/src/runtime/stackvm/runtime_function.h +++ b/src/runtime/stackvm/runtime_function.h @@ -142,14 +142,17 @@ class stackvm_runtime_function : public runtime_function, private op_visitor result visit(const tensor_broadcast_op_t &op) noexcept override; result visit(const tensor_call_op_t &op) noexcept override; result visit(const tensor_compare_op_t &op) noexcept override; + result visit(const tensor_compress_op_t &op) noexcept override; result visit(const tensor_conv2d_op_t &op) noexcept override; result visit(const tensor_convert_op_t &op) noexcept override; result visit(const tensor_copy_op_t &op) noexcept override; result visit(const tensor_cumsum_op_t &op) noexcept override; result visit(const tensor_dequantize_op_t &op) noexcept override; - result visit(const tensor_gather_op_t &op) noexcept override; result visit(const tensor_hardmax_op_t &op) noexcept override; + result visit(const tensor_gather_op_t &op) noexcept override; + result visit(const tensor_gather_elements_op_t &op) noexcept override; result 
visit(const tensor_gather_nd_op_t &op) noexcept override; + result visit(const tensor_gru_op_t &op) noexcept override; result visit(const tensor_lut1d_op_t &op) noexcept override; result visit(const tensor_matmul_op_t &op) noexcept override; result visit(const tensor_onehot_op_t &op) noexcept override; @@ -166,11 +169,15 @@ class stackvm_runtime_function : public runtime_function, private op_visitor result visit(const tensor_sigmoid_op_t &op) noexcept override; result visit(const tensor_slice_op_t &op) noexcept override; result visit(const tensor_softmax_op_t &op) noexcept override; + result visit(const tensor_space_to_batch_op_t &op) noexcept override; result visit(const tensor_ternary_op_t &op) noexcept override; result visit(const tensor_topk_op_t &op) noexcept override; result visit(const tensor_transpose_op_t &op) noexcept override; result visit(const tensor_trilu_op_t &op) noexcept override; + result visit(const tensor_tflite_detection_postprocess_op_t &op) noexcept override; result visit(const tensor_unary_op_t &op) noexcept override; + result visit(const tensor_layer_normalization_op_t &op) noexcept override; + result visit(const tensor_instance_normalization_op_t &op) noexcept override; private: uintptr_t pc() const noexcept; diff --git a/src/runtime/stackvm/runtime_module.cpp b/src/runtime/stackvm/runtime_module.cpp index ca5a75b2f6..4805b93f76 100644 --- a/src/runtime/stackvm/runtime_module.cpp +++ b/src/runtime/stackvm/runtime_module.cpp @@ -24,7 +24,13 @@ using namespace nncase::runtime::stackvm; gsl::span stackvm_runtime_module::data() const noexcept { - return { data_.get(), mempool(mem_data).size }; + if (!data_.empty()) + { + auto &block = static_cast(data_tensor().impl())->memory_block(); + return block.virtual_buffer(); + } + + return {}; } gsl::span stackvm_runtime_module::rdata() const noexcept @@ -32,15 +38,18 @@ gsl::span stackvm_runtime_module::rdata() const noexcept return rdata_; } +const runtime_tensor 
&stackvm_runtime_module::data_tensor() const noexcept +{ + return data_; +} + result stackvm_runtime_module::initialize_before_functions(runtime_module_init_context &context) noexcept { assert(context.is_section_pinned()); auto data_pool = mempool(mem_data); if (data_pool.size) { - data_.reset(new (std::nothrow) gsl::byte[data_pool.size]); - if (!data_) - return err(std::errc::not_enough_memory); + try_set(data_, hrt::create(dt_uint8, { data_pool.size }, hrt::pool_shared)); } rdata_ = context.section(".rdata"); diff --git a/src/runtime/stackvm/runtime_module.h b/src/runtime/stackvm/runtime_module.h index 33bca0d454..26e2d9be3b 100644 --- a/src/runtime/stackvm/runtime_module.h +++ b/src/runtime/stackvm/runtime_module.h @@ -29,6 +29,8 @@ class stackvm_runtime_module : public runtime_module gsl::span data() const noexcept; gsl::span rdata() const noexcept; + const runtime_tensor &data_tensor() const noexcept; + result reg(size_t id) const noexcept; result reg(size_t id, uintptr_t value) noexcept; @@ -43,7 +45,7 @@ class stackvm_runtime_module : public runtime_module result> create_function() noexcept override; private: - std::unique_ptr data_; + runtime_tensor data_; gsl::span rdata_; std::array regs_; std::vector shape_regs_; diff --git a/src/targets/neutral_target.cpp b/src/targets/neutral_target.cpp index fecb71452b..e24a058528 100644 --- a/src/targets/neutral_target.cpp +++ b/src/targets/neutral_target.cpp @@ -21,12 +21,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include #include #include #include @@ -95,7 +98,7 @@ void neutral_target::add_default_transforms(ir::transforms::transform_pass &pass // pass.emplace(); pass.emplace(); pass.emplace(); - pass.emplace(); + // pass.emplace(); // pass.emplace(); pass.emplace(); // pass.emplace(); @@ -110,6 +113,10 @@ void neutral_target::add_default_transforms(ir::transforms::transform_pass &pass pass.emplace(); pass.emplace(); + pass.emplace(); + 
pass.emplace(); + pass.emplace(); + pass.emplace(); pass.emplace(); @@ -185,6 +192,31 @@ void neutral_target::register_target_independent_passes(const module_type_t &typ using namespace nncase::ir; using namespace nncase::ir::transforms; + { + transform_pass p("fold_instancenorm"); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } + // fix tflite_detection_postprocess shape error in tflite + { + transform_pass p("fix_shape_tdp"); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } + + // fold quant node in source model + { + transform_pass p("fold_quantize_in_source_model"); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } + // split to slice + { + transform_pass p("split_to_slice"); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } + if (type == runtime::stackvm::stackvm_module_type) { // fold_pad_conv @@ -223,16 +255,6 @@ void neutral_target::register_target_dependent_passes([[maybe_unused]] const mod void neutral_target::register_quantize_annotation_passes([[maybe_unused]] const module_type_t &type, ir::transforms::pass_manager &pass_mgr) { - { - transform_pass p("fuse_unary"); - p.emplace(); - p.emplace(); - p.emplace(); - p.emplace(); - p.emplace(); - pass_mgr.add_pass(std::move(p)); - } - { transform_pass p("annotate_neutral_quantize"); p.emplace(std::in_place, ir::op_fused_unary, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_output_node); diff --git a/src/transforms/neutral/CMakeLists.txt b/src/transforms/neutral/CMakeLists.txt index 50bcb13d51..a6950b78c3 100644 --- a/src/transforms/neutral/CMakeLists.txt +++ b/src/transforms/neutral/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.13) +cmake_minimum_required(VERSION 3.13) target_sources(transforms PRIVATE add_quant_checkpoints.cpp @@ -47,4 +47,8 @@ target_sources(transforms PRIVATE pad_conv.cpp merge_binary_before_conv.cpp fold_matmul_add.cpp - ) + squeeze_dims.cpp + fix_output_shape.cpp + fold_layernorm.cpp + fold_instancenorm.cpp +) diff --git 
a/src/transforms/neutral/binary_motion.cpp b/src/transforms/neutral/binary_motion.cpp index 6fe90106f3..1c429cd90b 100644 --- a/src/transforms/neutral/binary_motion.cpp +++ b/src/transforms/neutral/binary_motion.cpp @@ -81,6 +81,7 @@ void binary_reduce_window2d_motion_up_transform::process(transform_context &cont auto &old_b = static_cast(*context.matched_nodes[3]); auto b = context.graph.emplace(old_b.binary_op(), conv.output().type(), conv.output().shape(), c.output().shape(), old_b.fused_activation()); + b->attributes(old_b.attributes()); b->name(old_b.name()); b->input_a().connect(conv.output()); b->input_b().connect(c.output()); diff --git a/src/transforms/neutral/fix_output_shape.cpp b/src/transforms/neutral/fix_output_shape.cpp new file mode 100644 index 0000000000..6df7642621 --- /dev/null +++ b/src/transforms/neutral/fix_output_shape.cpp @@ -0,0 +1,93 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; +using namespace nncase::ir::transforms; + +bool tflite_detection_postprocess_transform::on_try_match(node &node, transform_context &context) +{/* Matches a node accepted by node_cast (target type missing in this copy of the patch - presumably the tflite detection-postprocess node) whose output_locations shape is not already { 1, max_detections, 4 }; nodes that already have that shape are rejected. On a match it records the three inputs (boxes, scores, anchors), the four outputs (locations, classes, scores, num_detections) and the node itself in the context, in the order process() indexes them. */ + if (auto tdp = node_cast(node)) + { + if (tdp->output_locations().shape() == shape_t { 1, (size_t)tdp->max_detections(), 4 }) + return false; + context.inputs.emplace_back(&tdp->boxes()); + context.inputs.emplace_back(&tdp->scores()); + context.inputs.emplace_back(&tdp->anchors()); + + context.outputs.emplace_back(&tdp->output_locations()); + context.outputs.emplace_back(&tdp->output_classes()); + context.outputs.emplace_back(&tdp->output_scores()); + context.outputs.emplace_back(&tdp->output_num_detections()); + + context.matched_nodes.emplace_back(tdp); + return true; + } + + return false; +} + +void tflite_detection_postprocess_transform::process(transform_context &context) +{/* Rebuilds the matched node with output shapes derived from max_detections ({ 1, N, 4 }, { 1, N }, { 1, N }, { 1 }), creates four new consumer nodes named output_locations, output_classes, output_scores and output_num_detections, clears the input connection of every node in context.graph.outputs(), wires the new node to the original box/score/anchor producers and to the new consumers, and finally calls context.graph.dce(). NOTE(review): the loop over context.graph.outputs() detaches every graph output, not only the consumers of the old node - confirm that graphs with unrelated outputs never reach this transform. NOTE(review): template arguments of static_cast and emplace are absent in this copy of the patch; restore them from the upstream file. */ + auto &box = *context.inputs[0]->connection(); + auto &score = *context.inputs[1]->connection(); + auto &anchor = *context.inputs[2]->connection(); + auto output_locations = context.outputs[0]->connections(); + auto output_classes = context.outputs[1]->connections(); + auto output_scores = context.outputs[2]->connections(); + auto output_num_detections = context.outputs[3]->connections(); + + auto &old_tdp = static_cast(*context.matched_nodes[0]); + shape_t new_output_shape_0 { 1, (size_t)old_tdp.max_detections(), 4 }; + shape_t new_output_shape_1 { 1, (size_t)old_tdp.max_detections() }; + shape_t new_output_shape_2 { 1, (size_t)old_tdp.max_detections() }; + shape_t new_output_shape_3 { 1 }; + + /* Each new consumer node takes its element type from the first existing consumer of the matching old output. NOTE(review): indexing [0] assumes every old output has at least one consumer - confirm or guard. A discarded call to context.graph.outputs() that stood here had no effect and was dropped. */ + auto new_output_node_0 = context.graph.emplace(output_locations[0]->type(), new_output_shape_0); + auto new_output_node_1 = context.graph.emplace(output_classes[0]->type(), new_output_shape_1); + auto new_output_node_2 = context.graph.emplace(output_scores[0]->type(), new_output_shape_2); + auto new_output_node_3 = 
context.graph.emplace(output_num_detections[0]->type(), new_output_shape_3); + new_output_node_0->name("output_locations"); + new_output_node_1->name("output_classes"); + new_output_node_2->name("output_scores"); + new_output_node_3->name("output_num_detections"); + + auto new_tdp = context.graph.emplace(old_tdp.boxes().shape(), old_tdp.scores().shape(), old_tdp.anchors().shape(), + new_output_shape_0, new_output_shape_1, new_output_shape_2, new_output_shape_3, old_tdp.max_detections(), old_tdp.max_classes_per_detection(), + old_tdp.detections_per_class(), old_tdp.use_regular_non_max_suppression(), old_tdp.nms_score_threshold(), old_tdp.nms_iou_threshold(), + old_tdp.num_classes(), old_tdp.y_scale(), old_tdp.x_scale(), old_tdp.h_scale(), old_tdp.w_scale()); + new_tdp->name(old_tdp.name()); + + for (auto &i : context.graph.outputs()) + { + i->input().clear_connection(); + } + + new_tdp->boxes().connect(box); + new_tdp->scores().connect(score); + new_tdp->anchors().connect(anchor); + + new_output_node_0->input().connect(new_tdp->output_locations()); + new_output_node_1->input().connect(new_tdp->output_classes()); + new_output_node_2->input().connect(new_tdp->output_scores()); + new_output_node_3->input().connect(new_tdp->output_num_detections()); + + context.graph.dce(); +} \ No newline at end of file diff --git a/src/transforms/neutral/fold_instancenorm.cpp b/src/transforms/neutral/fold_instancenorm.cpp new file mode 100644 index 0000000000..f7e7e5556a --- /dev/null +++ b/src/transforms/neutral/fold_instancenorm.cpp @@ -0,0 +1,73 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; +using namespace nncase::ir::transforms; + +bool fold_instancenorm_transform::on_try_match(node &node, transform_context &context) +{/* Matches, starting from the final add and walking towards the producers: add_bias (add, with a constant bias) whose parent is div (div); div has parents mul_scale (mul, with a constant scale) and u_sqrt (sqrt); u_sqrt reads add_e (add, with a constant eps) which reads reduce_mean0 (reduce_mean) over u_square (square) over sub_mean (sub); the same sub_mean must also be a parent of mul_scale, and sub_mean must have a reduce_mean parent (reduce_mean1). On a match it records reduce_mean1's input, add_bias's output and the scale, bias and eps constants (matched_nodes 0, 1, 2). NOTE(review): the conditions only compare operator kinds; the reduce axes and the shapes of scale and bias are not inspected here - confirm that is sufficient to identify instance normalization. NOTE(review): the template arguments of node_cast and try_get_direct_parent are absent in this copy of the patch; restore them from the upstream file. */ + binary *add_bias = nullptr, *mul_scale = nullptr, *div = nullptr, *add_e = nullptr, *sub_mean = nullptr, *sub_mean_cmp = nullptr; + unary *u_sqrt = nullptr, *u_square = nullptr; + reduce *reduce_mean0 = nullptr, *reduce_mean1 = nullptr; + constant *scale = nullptr, *bias = nullptr, *eps = nullptr; + if (((add_bias = node_cast(node)) && (bias = try_get_direct_parent(*add_bias))) && add_bias->binary_op() == binary_add + && (div = try_get_direct_parent(*add_bias)) && div->binary_op() == binary_div + && (mul_scale = try_get_direct_parent(*div)) && (scale = try_get_direct_parent(*mul_scale)) && mul_scale->binary_op() == binary_mul + && (u_sqrt = try_get_direct_parent(*div)) && u_sqrt->unary_op() == unary_sqrt + && (add_e = try_get_direct_parent(*u_sqrt)) && (eps = try_get_direct_parent(*add_e)) && add_e->binary_op() == binary_add + && (reduce_mean0 = try_get_direct_parent(*add_e)) && reduce_mean0->reduce_op() == reduce_mean + && (u_square = try_get_direct_parent(*reduce_mean0)) && u_square->unary_op() == unary_square + && ((sub_mean = try_get_direct_parent(*u_square)) && (sub_mean_cmp = try_get_direct_parent(*mul_scale)) + && (sub_mean == sub_mean_cmp) && sub_mean->binary_op() == binary_sub) + && (reduce_mean1 = try_get_direct_parent(*sub_mean)) && reduce_mean1->reduce_op() == 
reduce_mean) + { + context.inputs.emplace_back(&reduce_mean1->input()); + context.outputs.emplace_back(&add_bias->output()); + context.matched_nodes.emplace_back(scale); + context.matched_nodes.emplace_back(bias); + context.matched_nodes.emplace_back(eps); + return true; + } + + return false; +} + +void fold_instancenorm_transform::process(transform_context &context) +{/* Replaces the matched subgraph with one new node created by context.graph.emplace (node type missing in this copy of the patch - presumably the instance-normalization node). Here output is the connector feeding the matched subgraph and inputs are the consumers of its result. The new node is named after the scale constant, reads output, takes the matched scale and bias constants, and every former consumer is reconnected to its output. NOTE(review): epsilon is read by reinterpreting the first bytes of the eps constant's data; this looks like it assumes a float32 constant with at least one element - confirm, since on_try_match does not check the constant's type or size. */ + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + + auto scale = node_cast(*context.matched_nodes[0]); + auto bias = node_cast(*context.matched_nodes[1]); + auto eps = node_cast(*context.matched_nodes[2]); + + auto instancenorm_ = context.graph.emplace(output.type(), output.shape(), *reinterpret_cast(eps->data().data())); + instancenorm_->name(scale->name()); + instancenorm_->input().connect(output); + instancenorm_->scale().connect(scale->output()); + instancenorm_->bias().connect(bias->output()); + + for (auto &in : dup(inputs)) + in->connect(instancenorm_->output()); +} \ No newline at end of file diff --git a/src/transforms/neutral/fold_layernorm.cpp b/src/transforms/neutral/fold_layernorm.cpp new file mode 100644 index 0000000000..521512e8db --- /dev/null +++ b/src/transforms/neutral/fold_layernorm.cpp @@ -0,0 +1,212 @@ +/* Copyright 2020 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; +using namespace nncase::ir::transforms; + +bool fold_layernorm_pattern1_transform::on_try_match(node &node, transform_context &context) +{/* Pattern 1. Walks from the final add towards the producers: add_beta (add), mul (mul), rshape2 (bitcast), div (div), sqrt (sqrt), add_eps (add), rd2 (reduce_mean), pow (pow), sub (sub), rd1 (reduce_mean), rshape1 (bitcast). It also requires that one input of sub is the very connection rd1 reduces, and that the parent found for div equals sub. On a match it records rshape1's input, add_beta's output and the eleven nodes in the order rshape1, rd1, sub, pow, rd2, add_eps, sqrt, div, rshape2, mul, add_beta. NOTE(review): the template arguments of node_cast and try_get_direct_parent are absent in this copy of the patch; restore them from the upstream file. */ + reduce *rd1 = nullptr, *rd2 = nullptr; + binary *sub = nullptr, *pow = nullptr, *add_eps = nullptr, *div = nullptr, *mul = nullptr, *add_beta = nullptr; + unary *sqrt = nullptr; + bitcast *rshape1 = nullptr, *rshape2 = nullptr; + + if ((add_beta = node_cast(node)) and add_beta->binary_op() == binary_op_t::binary_add + and (mul = try_get_direct_parent(*add_beta)) and mul->binary_op() == binary_op_t::binary_mul + and (rshape2 = try_get_direct_parent(*mul)) + and (div = try_get_direct_parent(*rshape2)) and div->binary_op() == binary_op_t::binary_div + and (sqrt = try_get_direct_parent(*div)) and sqrt->unary_op() == unary_op_t::unary_sqrt + and (add_eps = try_get_direct_parent(*sqrt)) and add_eps->binary_op() == binary_op_t::binary_add + and (rd2 = try_get_direct_parent(*add_eps)) and rd2->reduce_op() == reduce_op_t::reduce_mean + and (pow = try_get_direct_parent(*rd2)) and pow->binary_op() == binary_op_t::binary_pow + and (sub = try_get_direct_parent(*pow)) and sub->binary_op() == binary_op_t::binary_sub + and (rd1 = try_get_direct_parent(*sub)) and rd1->reduce_op() == reduce_op_t::reduce_mean + and (rshape1 = try_get_direct_parent(*rd1)) + and (sub->input_a().connection() == rd1->input().connection() or sub->input_b().connection() == rd1->input().connection()) + and try_get_direct_parent(*div) == sub) + { + context.inputs.emplace_back(&rshape1->input()); + context.outputs.emplace_back(&add_beta->output()); + + context.matched_nodes.emplace_back(rshape1); + context.matched_nodes.emplace_back(rd1); + context.matched_nodes.emplace_back(sub); + context.matched_nodes.emplace_back(pow); + context.matched_nodes.emplace_back(rd2); + context.matched_nodes.emplace_back(add_eps); + 
context.matched_nodes.emplace_back(sqrt); + context.matched_nodes.emplace_back(div); + context.matched_nodes.emplace_back(rshape2); + context.matched_nodes.emplace_back(mul); + context.matched_nodes.emplace_back(add_beta); + + return true; + } + + return false; +} + +void fold_layernorm_pattern1_transform::process(transform_context &context) +{/* Replaces the pattern-1 subgraph with one new node (type missing in this copy of the patch - presumably the layer-normalization node). eps, gamma and beta are taken from the owner of input 1 of matched_nodes[5] (add_eps), [9] (mul) and [10] (add_beta). axis is the input rank minus gamma's rank; epsilon is read by reinterpreting the first bytes of eps's data. The new node is named after the input connector plus "/layernorm", and every former consumer of add_beta is reconnected to it. NOTE(review): on_try_match never checks that input 1 of those three nodes is a constant, and the node_cast results are dereferenced without a null test, although node_cast is null-tested elsewhere in this file - confirm the operand order is guaranteed. NOTE(review): the axis subtraction is on sizes and would wrap if gamma had higher rank than the input - confirm that cannot happen. */ + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + + auto eps = node_cast(context.matched_nodes[5]->input_at(1).connection()->owner()); + auto gamma = node_cast(context.matched_nodes[9]->input_at(1).connection()->owner()); + auto beta = node_cast(context.matched_nodes[10]->input_at(1).connection()->owner()); + + auto axis = output.shape().size() - gamma->output().shape().size(); + auto ln = context.graph.emplace(output.type(), output.shape(), axis, *reinterpret_cast(eps->data().data())); + ln->name(output.name() + "/layernorm"); + + ln->input().connect(output); + ln->scale().connect(gamma->output()); + ln->bias().connect(beta->output()); + + for (auto &in : dup(inputs)) + in->connect(ln->output()); +} + +bool fold_layernorm_pattern2_transform::on_try_match(node &node, transform_context &context) +{/* Pattern 2. Same chain as pattern 1 without the two bitcast nodes: add_beta (add), mul (mul), div (div), sqrt (sqrt), add_eps (add), rd2 (reduce_mean), pow (pow), sub (sub, looked up at input 0 of pow and then at input 1), rd1 (reduce_mean). It also requires that one input of sub is the very connection rd1 reduces, and that the parent found for div equals sub. On a match it records rd1's input, add_beta's output and the nine nodes in the order rd1, sub, pow, rd2, add_eps, sqrt, div, mul, add_beta. */ + reduce *rd1 = nullptr, *rd2 = nullptr; + binary *sub = nullptr, *pow = nullptr, *add_eps = nullptr, *div = nullptr, *mul = nullptr, *add_beta = nullptr; + unary *sqrt = nullptr; + + if ((add_beta = node_cast(node)) and add_beta->binary_op() == binary_op_t::binary_add + and (mul = try_get_direct_parent(*add_beta)) and mul->binary_op() == binary_op_t::binary_mul + and (div = try_get_direct_parent(*mul)) and div->binary_op() == binary_op_t::binary_div + and (sqrt = try_get_direct_parent(*div)) and sqrt->unary_op() == unary_op_t::unary_sqrt + and (add_eps = try_get_direct_parent(*sqrt)) and add_eps->binary_op() == binary_op_t::binary_add + and (rd2 = try_get_direct_parent(*add_eps)) and rd2->reduce_op() == reduce_op_t::reduce_mean + and (pow = try_get_direct_parent(*rd2)) and 
pow->binary_op() == binary_op_t::binary_pow + and ((sub = try_get_direct_parent(*pow, 0)) or (sub = try_get_direct_parent(*pow, 1))) and sub->binary_op() == binary_op_t::binary_sub + and (rd1 = try_get_direct_parent(*sub)) and rd1->reduce_op() == reduce_op_t::reduce_mean + and (sub->input_a().connection() == rd1->input().connection() or sub->input_b().connection() == rd1->input().connection()) + and try_get_direct_parent(*div) == sub) + { + context.inputs.emplace_back(&rd1->input()); + context.outputs.emplace_back(&add_beta->output()); + + context.matched_nodes.emplace_back(rd1); + context.matched_nodes.emplace_back(sub); + context.matched_nodes.emplace_back(pow); + context.matched_nodes.emplace_back(rd2); + context.matched_nodes.emplace_back(add_eps); + context.matched_nodes.emplace_back(sqrt); + context.matched_nodes.emplace_back(div); + context.matched_nodes.emplace_back(mul); + context.matched_nodes.emplace_back(add_beta); + + return true; + } + + return false; +} + +void fold_layernorm_pattern2_transform::process(transform_context &context) +{/* Replaces the pattern-2 subgraph with one new node (type missing in this copy of the patch - presumably the layer-normalization node). eps, gamma and beta are taken from the owner of input 1 of matched_nodes[4] (add_eps), [7] (mul) and [8] (add_beta); axis is the input rank minus gamma's rank; epsilon is read by reinterpreting the first bytes of eps's data. Every former consumer of add_beta is reconnected to the new node. NOTE(review): as in pattern 1, the node_cast results are dereferenced without a null test and on_try_match does not verify that input 1 of those nodes is a constant - confirm. */ + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + + auto eps = node_cast(context.matched_nodes[4]->input_at(1).connection()->owner()); + auto gamma = node_cast(context.matched_nodes[7]->input_at(1).connection()->owner()); + auto beta = node_cast(context.matched_nodes[8]->input_at(1).connection()->owner()); + + auto axis = output.shape().size() - gamma->output().shape().size(); + auto ln = context.graph.emplace(output.type(), output.shape(), axis, *reinterpret_cast(eps->data().data())); + ln->name(output.name() + "/layernorm"); + + ln->input().connect(output); + ln->scale().connect(gamma->output()); + ln->bias().connect(beta->output()); + + for (auto &in : dup(inputs)) + in->connect(ln->output()); +} + +bool fold_layernorm_pattern3_transform::on_try_match(node &node, transform_context &context) +{/* Pattern 3. Matches add_all (add) whose input 0 is mul_x (mul) and input 1 is sub_beta (sub). mul_x's input 1 is mul_gamma (mul), whose input 0 is rsqrt (rsqrt) over add_eps (add), whose input 0 is rd_var (reduce_mean) over square (square) over sub_mu (sub), whose input 1 is rd_mu (reduce_mean). sub_beta's input 1 is mul_mu (mul). Four connection equalities tie the branches together: mul_mu.a is sub_mu.b, mul_mu.b is mul_x.b, mul_x.a is sub_mu.a, and mul_x.a is rd_mu's input. On a match it records rd_mu's input, add_all's output and the eleven nodes in the order rd_mu, sub_mu, square, rd_var, add_eps, rsqrt, mul_gamma, mul_x, mul_mu, sub_beta, add_all. */ + reduce *rd_mu = nullptr, *rd_var = nullptr; + binary *sub_mu = 
nullptr, *add_eps = nullptr, *mul_gamma = nullptr, *mul_x = nullptr, *mul_mu = nullptr, *sub_beta = nullptr, *add_all = nullptr; + unary *rsqrt = nullptr, *square = nullptr; + + if ((add_all = node_cast(node)) and add_all->binary_op() == binary_op_t::binary_add + and (mul_x = try_get_direct_parent(*add_all, 0)) and mul_x->binary_op() == binary_op_t::binary_mul + and (sub_beta = try_get_direct_parent(*add_all, 1)) and sub_beta->binary_op() == binary_op_t::binary_sub + and (mul_gamma = try_get_direct_parent(*mul_x, 1)) and mul_gamma->binary_op() == binary_op_t::binary_mul + and (rsqrt = try_get_direct_parent(*mul_gamma, 0)) and rsqrt->unary_op() == unary_op_t::unary_rsqrt + and (add_eps = try_get_direct_parent(*rsqrt)) and add_eps->binary_op() == binary_op_t::binary_add + and (rd_var = try_get_direct_parent(*add_eps, 0)) and rd_var->reduce_op() == reduce_op_t::reduce_mean + and (square = try_get_direct_parent(*rd_var)) and square->unary_op() == unary_op_t::unary_square + and (sub_mu = try_get_direct_parent(*square)) and sub_mu->binary_op() == binary_op_t::binary_sub + and (rd_mu = try_get_direct_parent(*sub_mu, 1)) and rd_mu->reduce_op() == reduce_op_t::reduce_mean + and (mul_mu = try_get_direct_parent(*sub_beta, 1)) and mul_mu->binary_op() == binary_op_t::binary_mul + and (mul_mu->input_a().connection() == sub_mu->input_b().connection()) + and (mul_mu->input_b().connection() == mul_x->input_b().connection()) + and (mul_x->input_a().connection() == sub_mu->input_a().connection()) + and (mul_x->input_a().connection() == rd_mu->input().connection())) + { + context.inputs.emplace_back(&rd_mu->input()); + context.outputs.emplace_back(&add_all->output()); + + context.matched_nodes.emplace_back(rd_mu); + context.matched_nodes.emplace_back(sub_mu); + context.matched_nodes.emplace_back(square); + context.matched_nodes.emplace_back(rd_var); + context.matched_nodes.emplace_back(add_eps); + context.matched_nodes.emplace_back(rsqrt); + 
context.matched_nodes.emplace_back(mul_gamma); + context.matched_nodes.emplace_back(mul_x); + context.matched_nodes.emplace_back(mul_mu); + context.matched_nodes.emplace_back(sub_beta); + context.matched_nodes.emplace_back(add_all); + + return true; + } + + return false; +} + +void fold_layernorm_pattern3_transform::process(transform_context &context) +{/* Replaces the pattern-3 subgraph with one new node (type missing in this copy of the patch - presumably the layer-normalization node). eps is the owner of input 1 of matched_nodes[4] (add_eps), gamma the owner of input 1 of matched_nodes[6] (mul_gamma) and beta the owner of input 0 of matched_nodes[9] (sub_beta); these are the operand slots on_try_match did not bind to another node. axis is the input rank minus gamma's rank; epsilon is read by reinterpreting the first bytes of eps's data. Every former consumer of add_all is reconnected to the new node. NOTE(review): the node_cast results are dereferenced without a null test; on_try_match fixes the operand positions but does not verify that those operands are constants - confirm. */ + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + + auto eps = node_cast(context.matched_nodes[4]->input_at(1).connection()->owner()); + auto gamma = node_cast(context.matched_nodes[6]->input_at(1).connection()->owner()); + auto beta = node_cast(context.matched_nodes[9]->input_at(0).connection()->owner()); + + auto axis = output.shape().size() - gamma->output().shape().size(); + auto ln = context.graph.emplace(output.type(), output.shape(), axis, *reinterpret_cast(eps->data().data())); + ln->name(output.name() + "/layernorm"); + + ln->input().connect(output); + ln->scale().connect(gamma->output()); + ln->bias().connect(beta->output()); + + for (auto &in : dup(inputs)) + in->connect(ln->output()); +} \ No newline at end of file diff --git a/src/transforms/neutral/fold_quantize.cpp b/src/transforms/neutral/fold_quantize.cpp index ead23f7870..bba88f5173 100644 --- a/src/transforms/neutral/fold_quantize.cpp +++ b/src/transforms/neutral/fold_quantize.cpp @@ -44,11 +44,11 @@ bool fold_quantize_transform::on_try_match(node &node, transform_context &contex context.matched_nodes.emplace_back(&q); context.matched_nodes.emplace_back(&deq); - if ((try_get_direct_parent(q) && try_get_direct_child(deq)) - || (try_get_direct_parent(q) && try_get_direct_child(deq))) - { - return true; - } + // if ((try_get_direct_parent(q) && try_get_direct_child(deq)) + // || (try_get_direct_parent(q) && try_get_direct_child(deq))) + // { + return true; + // } } } } diff --git a/src/transforms/neutral/optimize_allocation.cpp b/src/transforms/neutral/optimize_allocation.cpp index 
cd25b67c41..4de646be69 100644 --- a/src/transforms/neutral/optimize_allocation.cpp +++ b/src/transforms/neutral/optimize_allocation.cpp @@ -140,6 +140,25 @@ void add_copy_to_output_pass::run_core(graph &graph, [[maybe_unused]] nncase::ta alias_visitor.visit(graph); } +void add_copy_to_bitcast_pass::run_core(graph &graph, [[maybe_unused]] nncase::target &target, [[maybe_unused]] const run_pass_options &options) +{ + auto alias_visitor = make_relay_ir_visitor([&](node &node) { + if (auto b = node_cast(node)) + { + auto &out = *b->input().connection(); + if (out.owner().runtime_opcode() != op_copy) + { + auto cp = graph.emplace(out.type(), out.shape()); + cp->module_type(graph.module_type()); + cp->name(out.owner().name() + "/copy"); + cp->input().connect(out); + b->input().connect(cp->output()); + } + } + }); + alias_visitor.visit(graph); +} + // x@data x@output // | | // copy | @@ -173,8 +192,10 @@ void remove_exclusive_copy_to_output_transform::process(transform_context &conte { auto &output = *context.inputs[0]->connection(); auto &old_out = static_cast(*context.matched_nodes[1]); - - output.memory_location(mem_output); + if (output.connections().size() == 1) + output.memory_location(mem_output); + else + output.memory_location(mem_shared_data); output.attributes(output.attributes() | cnctr_attr_no_layout_strides); old_out.input().connect(output); } @@ -188,7 +209,7 @@ void remove_exclusive_copy_to_output_transform::process(transform_context &conte bool remove_exclusive_copy_to_concat_transform::on_try_match(node &node, transform_context &context) { copy *cp; - concat *c; + concat *c, *pre_c; if ((cp = node_cast(node)) && (c = try_get_direct_child(*cp))) @@ -201,6 +222,8 @@ bool remove_exclusive_copy_to_concat_transform::on_try_match(node &node, transfo && ((input->attributes() & (cnctr_attr_no_buffer_fusion | cnctr_attr_buffer_slice)) == 0) && (is_simple_concat || (input->attributes() & (cnctr_attr_no_layout_strides)) == 0)) { + if ((pre_c = 
try_get_direct_parent(*cp)) && pre_c->axis() != c->axis()) + return false; context.inputs.emplace_back(&cp->input()); context.outputs.emplace_back(&cp->output()); @@ -222,6 +245,39 @@ void remove_exclusive_copy_to_concat_transform::process(transform_context &conte in->connect(output); } +bool remove_exclusive_copy_to_bitcast_transform::on_try_match(node &node, transform_context &context) +{ + copy *cp; + bitcast *b; + + if ((cp = node_cast(node)) + && (b = try_get_direct_child(*cp))) + { + auto input = cp->input().connection(); + if ((input->memory_location() == mem_data || (input->memory_location() == mem_input && !try_get_direct_child(*b))) + && ((input->attributes() & cnctr_attr_no_buffer_fusion) == 0)) + { + context.inputs.emplace_back(&cp->input()); + context.outputs.emplace_back(&cp->output()); + + context.matched_nodes.emplace_back(cp); + return true; + } + } + + return false; +} + +void remove_exclusive_copy_to_bitcast_transform::process(transform_context &context) +{ + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + + output.attributes(output.attributes() | cnctr_attr_no_buffer_fusion); + for (auto &in : dup(inputs)) + in->connect(output); +} + // x x // | | // slice | diff --git a/src/transforms/neutral/pre_process_setting.cpp b/src/transforms/neutral/pre_process_setting.cpp index 9fd46cfbce..66ade2128d 100644 --- a/src/transforms/neutral/pre_process_setting.cpp +++ b/src/transforms/neutral/pre_process_setting.cpp @@ -54,23 +54,6 @@ void pre_process_transform::run_core(graph &graph, [[maybe_unused]] nncase::targ mid_ptr = &new_input->output(); - //dequantize: input_range_ - if (mid_ptr->type() != dt_float32) - { - std::cout << " |Dequantize:" << std::endl; - value_range range = { input_range_[0], input_range_[1] }; - - auto Q_max = 255; - auto Q_min = 0; - auto scale = (range.max - range.min) / (Q_max - Q_min); - auto bias = std::round((range.max * Q_min - range.min * Q_max) / (range.max - range.min)); 
- quant_param_t deq_params { static_cast(bias), scale }; - auto deq_input = graph.emplace(mid_ptr->type(), mid_ptr->shape(), dt_float32, deq_params); - deq_input->name("dequantize_input"); - deq_input->input().connect(*mid_ptr); - mid_ptr = &deq_input->output(); - } - if (input_layout_ == "NHWC") { auto transpose_pre = graph.emplace(mid_ptr->type(), mid_ptr->shape(), axis_t { 0, 3, 1, 2 }); @@ -99,6 +82,23 @@ void pre_process_transform::run_core(graph &graph, [[maybe_unused]] nncase::targ mid_ptr = &concat_slice->output(); } + //dequantize: input_range_ + if (mid_ptr->type() != dt_float32) + { + std::cout << " |Dequantize:" << std::endl; + value_range range = { input_range_[0], input_range_[1] }; + + auto Q_max = 255; + auto Q_min = 0; + auto scale = (range.max - range.min) / (Q_max - Q_min); + auto bias = std::round((range.max * Q_min - range.min * Q_max) / (range.max - range.min)); + quant_param_t deq_params { static_cast(bias), scale }; + auto deq_input = graph.emplace(mid_ptr->type(), mid_ptr->shape(), dt_float32, deq_params); + deq_input->name("dequantize_input"); + deq_input->input().connect(*mid_ptr); + mid_ptr = &deq_input->output(); + } + // letterbox : /** * input_layout: HW have different axis diff --git a/src/transforms/neutral/split_softmax.cpp b/src/transforms/neutral/split_softmax.cpp index 1cdf9019c3..b9008b520b 100644 --- a/src/transforms/neutral/split_softmax.cpp +++ b/src/transforms/neutral/split_softmax.cpp @@ -48,24 +48,30 @@ void split_softmax_transform::process(transform_context &context) auto input_shape = output.shape(); axis_t axes { sm.axis() }; auto rmax = context.graph.emplace(reduce_max, input_type, input_shape, axes, std::numeric_limits::lowest(), true); + rmax->attributes(rmax->attributes() | node_attributes::node_attr_skip_quantize); rmax->name(sm.name() + ".rmax"); auto sub = context.graph.emplace(binary_sub, input_type, input_shape, rmax->output().shape(), value_range::full()); + sub->attributes(sub->attributes() | 
node_attributes::node_attr_skip_quantize); sub->name(sm.name() + ".sub"); auto beta = context.graph.emplace(sm.beta()); beta->name(sm.name() + ".beta"); auto mul = context.graph.emplace(binary_mul, input_type, sub->output().shape(), beta->output().shape(), value_range::full()); + mul->attributes(mul->attributes() | node_attributes::node_attr_skip_quantize); mul->name(sm.name() + ".mul"); auto exp = context.graph.emplace(unary_exp, sub->output().shape()); + exp->attributes(exp->attributes() | node_attributes::node_attr_skip_quantize); exp->name(sm.name() + ".exp"); auto rsum = context.graph.emplace(reduce_sum, input_type, exp->output().shape(), axes, 0.f, true); + rsum->attributes(rsum->attributes() | node_attributes::node_attr_skip_quantize); rsum->name(sm.name() + ".rsum"); auto div = context.graph.emplace(binary_div, input_type, exp->output().shape(), rsum->output().shape(), value_range::full()); + div->attributes(div->attributes() | node_attributes::node_attr_skip_quantize); div->name(sm.name() + ".div"); rmax->input().connect(output); diff --git a/src/transforms/neutral/squeeze_dims.cpp b/src/transforms/neutral/squeeze_dims.cpp new file mode 100644 index 0000000000..309b335415 --- /dev/null +++ b/src/transforms/neutral/squeeze_dims.cpp @@ -0,0 +1,400 @@ +/* Copyright 2019-2021 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace nncase; +using namespace nncase::ir; +using namespace nncase::ir::transforms; + +shape_t squeeze_shape(shape_t old_shape) +{ + shape_t new_shape { 1, 1, 1, 1 }; + for (int i = old_shape.size() - 1, k = 3; i >= 0; i--) + { + new_shape[k] *= old_shape[i]; + if (k > 0) + k--; + } + return new_shape; +} + +auto squeeze_binary_shape(shape_t old_a_shape, shape_t old_b_shape) +{ + auto a_size = old_a_shape.size(); + auto b_size = old_b_shape.size(); + auto squeeze_times = std::max(a_size > 4 ? a_size - 4 : 0, b_size > 4 ? b_size - 4 : 0); + if (squeeze_times <= 0) + return std::tuple(false, old_a_shape, old_b_shape); + shape_t new_a_shape, new_b_shape; + + if (a_size == b_size) + { + /* + 1. a.shape == b.shape + 2. a.shape : [s1, s2, s3, s4, s5] + b.shape : [1, 1, s3, 1, 1] ||[1, 1, 1, s4, 1] ||... + */ + // 1. a.shape == b.shape + if (old_a_shape == old_b_shape) + { + new_a_shape = squeeze_shape(old_a_shape); + new_b_shape = squeeze_shape(old_b_shape); + } + // 2. a.shape : [s1, s2, s3, s4, s5] + // b.shape : [1, 1, s3, 1, 1] ||[1, 1, 1, s4, 1] ||... 
+ else + { + new_a_shape = old_a_shape; + new_b_shape = old_b_shape; + + // inquiry which dim can be fold + std::vector can_fold_index_list(a_size, true); + std::vector> fold_index_couple; + for (size_t i = 0; i < a_size; i++) + { + if (old_a_shape[i] == old_b_shape[i]) + can_fold_index_list[i] = false; + } + for (size_t i = a_size - 1; i > 0; i--) + { + if (can_fold_index_list[i] && can_fold_index_list[i - 1]) + fold_index_couple.emplace_back(std::make_tuple(i - 1, i)); + } + + while (squeeze_times && !fold_index_couple.empty()) + { + auto it = fold_index_couple.back(); + auto front = std::get<0>(it); + auto back = std::get<1>(it); + new_a_shape[front] *= new_a_shape[back]; + new_b_shape[front] *= new_b_shape[back]; + new_a_shape.erase(std::begin(new_a_shape) + back); + new_b_shape.erase(std::begin(new_b_shape) + back); + fold_index_couple.pop_back(); + squeeze_times--; + } + + if (new_a_shape.size() > 4) + { + // remove shape.front() == 1 || shape.back() == 1 + if (new_a_shape.front() == 1 && new_b_shape.front() == 1) + { + new_a_shape.erase(std::begin(new_a_shape)); + new_b_shape.erase(std::begin(new_b_shape)); + } + else if (new_a_shape.back() == 1 && new_b_shape.back() == 1) + { + new_a_shape.erase(std::end(new_a_shape) - 1); + new_b_shape.erase(std::end(new_b_shape) - 1); + } + } + + new_a_shape.shrink_to_fit(); + new_b_shape.shrink_to_fit(); + if (new_a_shape.size() > 4) + return std::make_tuple(false, new_a_shape, new_b_shape); + } + } + else + { + if (a_size != 1) + new_a_shape = squeeze_shape(old_a_shape); + else + new_a_shape = old_a_shape; + if (b_size != 1) + new_b_shape = squeeze_shape(old_b_shape); + else + new_b_shape = old_b_shape; + } + return std::make_tuple(true, new_a_shape, new_b_shape); +} + +auto squeeze_transpose_shape(shape_t old_shape, axis_t old_axis) +{ + if (old_shape.size() <= 4) + return std::make_tuple(false, old_axis, old_shape); + + axis_t new_axis = old_axis; + shape_t new_shape = old_shape; + int squeeze_times = old_shape.size() 
- 4; + + std::vector> fold_index_couple; + for (size_t i = old_shape.size() - 1; i > 0; i--) + { + if (old_axis[i - 1] + 1 == old_axis[i]) + fold_index_couple.emplace_back(std::make_tuple(i - 1, i)); + } + if (fold_index_couple.size() < squeeze_times) + return std::make_tuple(false, new_axis, new_shape); + + while (squeeze_times && !fold_index_couple.empty()) + { + auto it = fold_index_couple.back(); + auto front = std::get<0>(it); + auto back = std::get<1>(it); + new_shape[front] *= new_shape[back]; + new_shape.erase(std::begin(new_shape) + back); + new_axis.erase(std::begin(new_axis) + back); + fold_index_couple.pop_back(); + squeeze_times--; + } + + // fix axis + for (int i = 0, j = 0; j < 4; i++) + { + auto find_index = std::find(new_axis.begin(), new_axis.end(), i); + if (find_index != new_axis.end()) + { + *find_index = j; + j++; + } + } + + return std::make_tuple(true, new_axis, new_shape); +} + +auto squeeze_concat_shape(std::vector &old_shape, int concat_axis) +{ + int new_axis = 0; + for (int index = 0; index < old_shape.size(); index++) + { + auto tmp_axis = concat_axis; + auto squeeze_times = old_shape[index].size() - 4; + shape_t new_shape { 1, 1, 1, 1 }; + for (int i = 0, j = 0; i < 4; i++, j++) + { + if (concat_axis > old_shape[index].size() - 4 - 1 && squeeze_times != 0) + { + new_shape[i] = old_shape[index][j] * old_shape[index][j + 1]; + squeeze_times--; + j++; + tmp_axis--; + } + else + { + new_shape[i] = old_shape[index][j]; + } + } + old_shape[index] = new_shape; + new_axis = tmp_axis; + } + + return new_axis; +} + +bool check_op(node_opcode op) +{ + if (op == op_binary || op == op_sigmoid || op == op_transpose || op == op_concat) + return true; + return false; +} + +bool squeeze_dims_transform::on_try_match(node &node, transform_context &context) +{ + if (check_op(node.runtime_opcode())) + { + bool need_squeeze = false; + for (auto &it : node.inputs()) + { + if (need_squeeze || it->shape().size() > 4) + { + need_squeeze = true; + 
context.inputs.emplace_back(it); + } + } + + // double check all input emplaced if need squeeze + if (need_squeeze) + { + for (auto &it : node.inputs()) + { + if (std::find(context.inputs.begin(), context.inputs.end(), it) == context.inputs.end()) + context.inputs.emplace_back(it); + } + } + + for (auto &it : node.outputs()) + { + if (need_squeeze || it->shape().size() > 4) + { + need_squeeze = true; + context.outputs.emplace_back(it); + } + } + if (need_squeeze) + { + context.matched_nodes.emplace_back(&node); + bool can_squeeze = true; + NNCASE_UNUSED shape_t a_shape, b_shape; + NNCASE_UNUSED axis_t new_axis; + if (node.runtime_opcode() == op_binary) + std::tie(can_squeeze, a_shape, b_shape) = squeeze_binary_shape(context.inputs[0]->shape(), context.inputs[1]->shape()); + else if (node.runtime_opcode() == op_transpose) + std::tie(can_squeeze, new_axis, b_shape) = squeeze_transpose_shape(node_cast(node)->input().shape(), node_cast(node)->perm()); + + return can_squeeze; + } + } + + return false; +} + +void squeeze_dims_transform::process(transform_context &context) +{ + if (context.matched_nodes[0]->runtime_opcode() == op_binary) + { + auto &output_a = *context.inputs[0]->connection(); + auto &output_b = *context.inputs[1]->connection(); + auto inputs = context.outputs[0]->connections(); + auto &old_binary = static_cast(*context.matched_nodes[0]); + + bitcast *in_a_bitc, *in_b_bitc, *out_bitc; + auto [_, new_a_shape, new_b_shape] = squeeze_binary_shape(output_a.shape(), output_b.shape()); + if (output_a.shape().size() > 4) + in_a_bitc = context.graph.emplace(output_a.type(), output_a.shape(), new_a_shape); + else + in_a_bitc = context.graph.emplace(output_a.type(), output_a.shape(), output_a.shape()); + + if (output_b.shape().size() > 4) + in_b_bitc = context.graph.emplace(output_b.type(), output_b.shape(), new_b_shape); + else + in_b_bitc = context.graph.emplace(output_b.type(), output_b.shape(), output_b.shape()); + + auto new_binary = 
context.graph.emplace(old_binary.binary_op(), in_a_bitc->output().type(), in_a_bitc->output().shape(), in_b_bitc->output().shape(), + old_binary.fused_activation()); + if (old_binary.output_at(0).shape().size() > 4) + out_bitc = context.graph.emplace(new_binary->output().type(), new_binary->output().shape(), old_binary.output_at(0).shape()); + else + out_bitc = context.graph.emplace(new_binary->output().type(), new_binary->output().shape(), new_binary->output().shape()); + + in_a_bitc->name(old_binary.name() + "_in_a_bitc"); + in_b_bitc->name(old_binary.name() + "_in_b_bitc"); + new_binary->name(old_binary.name()); + out_bitc->name(old_binary.name() + "_out_bitc"); + + new_binary->input_a().connect(in_a_bitc->output()); + new_binary->input_b().connect(in_b_bitc->output()); + out_bitc->input().connect(new_binary->output()); + + in_a_bitc->input().connect(output_a); + in_b_bitc->input().connect(output_b); + for (auto &in : dup(inputs)) + in->connect(out_bitc->output()); + } + else if (context.matched_nodes[0]->runtime_opcode() == op_sigmoid) + { + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + auto &old_sigmoid = static_cast(*context.matched_nodes[0]); + + bitcast *in_bitc, *out_bitc; + if (output.shape().size() > 4) + in_bitc = context.graph.emplace(output.type(), output.shape(), squeeze_shape(output.shape())); + else + in_bitc = context.graph.emplace(output.type(), output.shape(), output.shape()); + + auto new_sigmoid = context.graph.emplace(in_bitc->output().type(), in_bitc->output().shape()); + if (old_sigmoid.output_at(0).shape().size() > 4) + out_bitc = context.graph.emplace(new_sigmoid->output().type(), new_sigmoid->output().shape(), old_sigmoid.output_at(0).shape()); + else + out_bitc = context.graph.emplace(new_sigmoid->output().type(), new_sigmoid->output().shape(), new_sigmoid->output().shape()); + + in_bitc->name(old_sigmoid.name() + "_in_bitc"); + new_sigmoid->name(old_sigmoid.name()); + 
out_bitc->name(old_sigmoid.name() + "_out_bitc"); + + new_sigmoid->input().connect(in_bitc->output()); + out_bitc->input().connect(new_sigmoid->output()); + + in_bitc->input().connect(output); + for (auto &in : dup(inputs)) + in->connect(out_bitc->output()); + } + else if (context.matched_nodes[0]->runtime_opcode() == op_transpose) + { + auto &output = *context.inputs[0]->connection(); + auto inputs = context.outputs[0]->connections(); + auto &old_transpose = static_cast(*context.matched_nodes[0]); + + auto [_, new_axis, new_shape] = squeeze_transpose_shape(output.shape(), old_transpose.perm()); + + bitcast *in_bitc, *out_bitc; + if (output.shape().size() > 4) + in_bitc = context.graph.emplace(output.type(), output.shape(), new_shape); + else + in_bitc = context.graph.emplace(output.type(), output.shape(), output.shape()); + + auto new_transpose = context.graph.emplace(in_bitc->output().type(), in_bitc->output().shape(), new_axis); + if (old_transpose.output_at(0).shape().size() > 4) + out_bitc = context.graph.emplace(new_transpose->output().type(), new_transpose->output().shape(), old_transpose.output_at(0).shape()); + else + out_bitc = context.graph.emplace(new_transpose->output().type(), new_transpose->output().shape(), new_transpose->output().shape()); + + in_bitc->name(old_transpose.name() + "_in_bitc"); + new_transpose->name(old_transpose.name()); + out_bitc->name(old_transpose.name() + "_out_bitc"); + + new_transpose->input().connect(in_bitc->output()); + out_bitc->input().connect(new_transpose->output()); + + in_bitc->input().connect(output); + for (auto &in : dup(inputs)) + in->connect(out_bitc->output()); + } + else if (context.matched_nodes[0]->runtime_opcode() == op_concat) + { + auto inputs = context.outputs[0]->connections(); + auto &old_concat = static_cast(*context.matched_nodes[0]); + + std::vector concat_shape; + std::vector concat_inputs; + + for (auto &it : context.inputs) + { + concat_shape.emplace_back(it->shape()); + } + auto new_axis = 
squeeze_concat_shape(concat_shape, old_concat.axis()); + auto new_concat = context.graph.emplace(old_concat.output().type(), concat_shape, new_axis); + new_concat->name(old_concat.name()); + + for (size_t i = 0; i < context.inputs.size(); i++) + { + auto in_bitc = context.graph.emplace(context.inputs[i]->connection()->type(), context.inputs[i]->connection()->shape(), concat_shape[i]); + + in_bitc->input().connect(*context.inputs[i]->connection()); + in_bitc->name(old_concat.name() + "_in_bitc_" + std::to_string(i)); + new_concat->input_at(i).connect(in_bitc->output()); + } + bitcast *out_bitc; + if (old_concat.output_at(0).shape().size() > 4) + out_bitc = context.graph.emplace(new_concat->output().type(), new_concat->output().shape(), old_concat.output_at(0).shape()); + else + out_bitc = context.graph.emplace(new_concat->output().type(), new_concat->output().shape(), new_concat->output().shape()); + + out_bitc->name(old_concat.name() + "_out_bitc"); + + out_bitc->input().connect(new_concat->output()); + + for (auto &in : dup(inputs)) + in->connect(out_bitc->output()); + } +} diff --git a/src/transforms/neutral/transpose_motion.cpp b/src/transforms/neutral/transpose_motion.cpp index 06d3f3118d..b92da4e461 100644 --- a/src/transforms/neutral/transpose_motion.cpp +++ b/src/transforms/neutral/transpose_motion.cpp @@ -66,6 +66,7 @@ void transpose_binary_motion_transform::process(transform_context &context) auto &old_bin = static_cast(*context.matched_nodes[2]); auto bin = context.graph.emplace(old_bin.binary_op(), output_a.type(), output_a.shape(), output_b.shape(), old_bin.fused_activation()); + bin->attributes(old_bin.attributes()); bin->name(old_bin.name()); auto tp = context.graph.emplace(bin->output().type(), bin->output().shape(), old_tp.perm()); tp->name(old_tp.name()); @@ -138,6 +139,7 @@ void transpose_constant_binary_motion_transform::process(transform_context &cont if (old_bin.input_a().connection()->owner().runtime_opcode() == op_constant) { bin = 
context.graph.emplace(old_bin.binary_op(), output.type(), con->output().shape(), output.shape(), old_bin.fused_activation()); + bin->attributes(old_bin.attributes()); bin->name(old_bin.name()); bin->input_a().connect(con->output()); bin->input_b().connect(output); @@ -145,6 +147,7 @@ void transpose_constant_binary_motion_transform::process(transform_context &cont else { bin = context.graph.emplace(old_bin.binary_op(), output.type(), output.shape(), con->output().shape(), old_bin.fused_activation()); + bin->attributes(old_bin.attributes()); bin->name(old_bin.name()); bin->input_a().connect(output); bin->input_b().connect(con->output()); @@ -317,6 +320,7 @@ void transpose_reduce_motion_transform::process(transform_context &context) } auto r = context.graph.emplace(old_r.reduce_op(), output.type(), output.shape(), axes, old_r.init_value(), old_r.keep_dims()); + r->attributes(old_r.attributes()); r->name(old_r.name()); auto tp = context.graph.emplace(r->output().type(), r->output().shape(), perm); tp->name(old_tp.name()); @@ -355,6 +359,7 @@ void transpose_unary_motion_transform::process(transform_context &context) auto &old_u = static_cast(*context.matched_nodes[1]); auto u = context.graph.emplace(old_u.unary_op(), output.shape()); + u->attributes(old_u.attributes()); u->name(old_u.name()); auto tp = context.graph.emplace(u->output().type(), u->output().shape(), old_tp.perm()); tp->name(old_tp.name()); @@ -501,6 +506,7 @@ void transpose_sigmoid_motion_transform::process(transform_context &context) auto new_sigmd = context.graph.emplace(old_tp.input().type(), old_tp.input().shape()); auto new_b = context.graph.emplace(old_b.binary_op(), old_tp.input().type(), old_tp.input().shape(), new_sigmd->output().shape(), old_b.fused_activation()); + new_b->attributes(old_b.attributes()); auto new_tp = context.graph.emplace(new_b->output().type(), new_b->output().shape(), old_tp.perm()); new_sigmd->name(old_sigmd.name()); new_b->name(old_b.name()); diff --git 
a/targets/cpu/cpu_target.cpp b/targets/cpu/cpu_target.cpp index 19a8d18afe..e01288fbea 100644 --- a/targets/cpu/cpu_target.cpp +++ b/targets/cpu/cpu_target.cpp @@ -14,7 +14,10 @@ */ #include "cpu_target.h" #include +#include #include +#include +#include #include #include @@ -46,4 +49,23 @@ void cpu_target::register_target_dependent_passes([[maybe_unused]] const module_ p.emplace(); pass_mgr.add_pass(std::move(p)); } +} + +void cpu_target::register_quantize_annotation_passes([[maybe_unused]] const module_type_t &type, ir::transforms::pass_manager &pass_mgr) +{ + { + transform_pass p("fuse_unary"); + p.emplace(); + p.emplace(); + p.emplace(); + p.emplace(); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } + + { + transform_pass p("annotate_neutral_quantize"); + p.emplace(std::in_place, ir::op_fused_unary, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_output_node); + pass_mgr.add_pass(std::move(p)); + } } \ No newline at end of file diff --git a/targets/cpu/cpu_target.h b/targets/cpu/cpu_target.h index 5e68a0976a..14fdc1ef79 100644 --- a/targets/cpu/cpu_target.h +++ b/targets/cpu/cpu_target.h @@ -23,5 +23,6 @@ class cpu_target : public neutral_target using neutral_target::neutral_target; void register_target_dependent_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr, bool use_ptq, bool split_w_to_act) override; + void register_quantize_annotation_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr) override; }; } diff --git a/targets/k210/k210_target.cpp b/targets/k210/k210_target.cpp index 660a64a950..16270efb31 100644 --- a/targets/k210/k210_target.cpp +++ b/targets/k210/k210_target.cpp @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include #include @@ -138,7 +140,7 @@ void k210_target::register_target_dependent_passes([[maybe_unused]] const module } } -void k210_target::register_quantize_annotation_passes(const module_type_t &type, ir::transforms::pass_manager &pass_mgr) +void 
k210_target::register_quantize_annotation_passes(NNCASE_UNUSED const module_type_t &type, ir::transforms::pass_manager &pass_mgr) { { transform_pass p("annotate_kpu1"); @@ -150,7 +152,15 @@ void k210_target::register_quantize_annotation_passes(const module_type_t &type, pass_mgr.add_pass(std::move(p)); } - neutral_target::register_quantize_annotation_passes(type, pass_mgr); + { + transform_pass p("fuse_unary"); + p.emplace(); + p.emplace(); + p.emplace(); + p.emplace(); + p.emplace(); + pass_mgr.add_pass(std::move(p)); + } { transform_pass p("annotate_kpu2"); @@ -172,7 +182,7 @@ void k210_target::register_quantize_annotation_passes(const module_type_t &type, { transform_pass p("annotate_kpu_quantize"); - p.emplace(std::in_place, ir::op_fused_unary, ir::k210::op_k210_fake_kpu_conv2d, ir::op_bitcast, ir::op_dequantize, ir::op_binary); + p.emplace(std::in_place, ir::op_fused_unary, ir::k210::op_k210_fake_kpu_conv2d, ir::op_bitcast, ir::op_dequantize, ir::op_binary, ir::op_slice); pass_mgr.add_pass(std::move(p)); } } diff --git a/tests/ci_proxy.py b/tests/ci_proxy.py new file mode 100644 index 0000000000..ab876cddf1 --- /dev/null +++ b/tests/ci_proxy.py @@ -0,0 +1,178 @@ +import os +import argparse +import stat +import socket +import json +import threading +import queue +import logging +import logging.handlers +import telnetlib +import time + +class TelnetClient(): + def __init__(self, mylogger): + self.tn = telnetlib.Telnet() + self.logger = mylogger + self.ip = '10.99.105.216' + self.timeout = 60 + + def login(self, ip, username, password): + try: + self.tn.open(ip, port=23) + except: + self.logger.error('telnet {0} failed'.format(ip)) + return False + + self.ip = ip + self.tn.read_until(b'login: ', timeout=self.timeout) + self.tn.write(username.encode() + b'\r\n') + + cmd_result = self.tn.read_very_eager().decode() + if 'Login incorrect' not in cmd_result: + self.logger.info('{0} login succeed'.format(ip)) + return True + else: + self.logger.error('{0} login 
failed'.format(ip)) + return False + + def logout(self): + self.tn.close() + self.logger.info('{0} logout succeed'.format(self.ip)) + + def execute(self, cmd, flag): + self.logger.debug('execute: cmd = {0}, flag = {1}'.format(cmd, flag)) + self.tn.write(cmd.encode() + b'\r\n') + cmd_result = self.tn.read_until(flag.encode(), timeout=self.timeout).decode() + if flag not in cmd_result: + # time out + self.tn.write(telnetlib.IP) + cmd_result = f'timeout for {self.timeout} seconds' + self.logger.error('execute {0} failed: {1}'.format(cmd, cmd_result)) + return cmd_result, False + else: + self.tn.write('echo $?'.encode() + b'\r\n') + cmd_status = self.tn.read_until(flag.encode(), self.timeout).decode() + if cmd_status.find('\r\n0\r\n') == -1: + self.logger.error('execute {0} failed: {1}'.format(cmd, cmd_result)) + return cmd_result, False + else: + return cmd_result, True + +def recv_file(conn, target_root, mylogger): + header = conn.recv(1024) + file_dict = json.loads(header.decode()) + file_name = file_dict['file_name'] + file_size = file_dict['file_size'] + mylogger.debug('recv: file = {0}, size = {1}'.format(file_name, file_size)) + conn.sendall(f"pls send {file_name}".encode()) + + full_file = os.path.join(target_root, file_name) + with open(full_file, 'wb') as f: + recv_size = 0 + while recv_size < file_size: + slice = conn.recv(4096) + f.write(slice) + recv_size += len(slice) + + conn.sendall(f"recv {file_name} succeed".encode()) + os.chmod(full_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) + return file_name + +def Consumer(kpu_target, kpu_ip, kpu_username, kpu_password, nfsroot, q, mylogger): + # create target root + target_root = os.path.join(nfsroot, kpu_target) + if not os.path.exists(target_root): + os.makedirs(target_root) + + telnet_client = TelnetClient(mylogger) + while True: + cmd = './' + conn = q.get() + + # recv header + header = conn.recv(1024) + header_dict = json.loads(header.decode()) + mylogger.info("test case = 
{0}".format(header_dict['case'])) + file_num = header_dict['app'] + header_dict['kmodel'] + header_dict['inputs'] + conn.sendall(f"pls send {file_num} files".encode()) + + # recv all kinds of files(app + kmodel + inputs) + for i in range(file_num): + file = recv_file(conn, target_root, mylogger) + if i == 0: + cmd = cmd + file + else: + cmd = cmd + ' ' + file + + # telnet to the target device to run inference + telnet_client.login(kpu_ip, kpu_username, kpu_password) + flag = f'/mnt/{kpu_target} ]$' + cmd_result, cmd_status = telnet_client.execute(f'cd /mnt/{kpu_target} && {cmd}', flag) + if cmd_status: + conn.sendall(f'infer succeed'.encode()) + dummy = conn.recv(1024) + + # send outputs + for i in range(header_dict['outputs']): + file = os.path.join(target_root, f'nncase_result_{i}.bin') + file_size = os.path.getsize(file) + conn.sendall(str(file_size).encode()) + dummy = conn.recv(1024) + + with open(file, 'rb') as f: + conn.sendall(f.read()) + dummy = conn.recv(1024) + mylogger.debug('send: file = {0}, size = {1}'.format(file, file_size)) + else: + conn.sendall(f'infer failed on {kpu_target} board: {cmd_result}'.encode()) + conn.close() + + if 'timeout' not in cmd_result: + telnet_client.logout() + else: + # reboot kpu_target after a timeout + telnet_client.logout() + mylogger.error('reboot {0}({1}) for timeout'.format(kpu_target, kpu_ip)) + telnet_client.login(kpu_ip, kpu_username, kpu_password) + flag = f'[{kpu_username}@canaan ~ ]$' + telnet_client.execute('reboot', flag) + telnet_client.logout() + time.sleep(60) + +def main(): + # args + parser = argparse.ArgumentParser(prog="ci_proxy") + parser.add_argument("--kpu_target", help='kpu device target', type=str, default='k510') + parser.add_argument("--kpu_ip", help='kpu deivce ip address', type=str, default='10.99.105.216') + parser.add_argument("--kpu_username", help='kpu device usernmae', type=str, default='root') + parser.add_argument("--kpu_password", help='kpu device password', type=str, default='') + 
parser.add_argument("--nfsroot", help='nfsroot on pc', type=str, default='/data/nfs') + parser.add_argument("--port", help='listenning port of ci_proxy', type=int, default=51000) + args = parser.parse_args() + + # logging + mylogger = logging.getLogger() + mylogger.setLevel(logging.DEBUG) + rf_handler = logging.handlers.RotatingFileHandler(f'ci_proxy_{args.kpu_target}.log', mode='a', maxBytes=32 * 1024 * 1024, backupCount=10) + rf_handler.setLevel(logging.INFO) + rf_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')) + mylogger.addHandler(rf_handler) + + # producer + size = 256 + q = queue.Queue(maxsize=size) + + # consumer + t_consumer = threading.Thread(target=Consumer, args=(args.kpu_target, args.kpu_ip, args.kpu_username, args.kpu_password, args.nfsroot, q, mylogger)) + t_consumer.start() + + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.bind(('localhost', args.port)) + server_socket.listen(size) + while True: + conn, addr = server_socket.accept() + q.put(conn) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/tests/config.yml b/tests/config.yml index ec895d53b6..75eba9c5d0 100644 --- a/tests/config.yml +++ b/tests/config.yml @@ -108,9 +108,13 @@ judge: specifics: - matchs: #target: [cpu, vulkan, k210, k510] - target: [cpu, k210, k510] + target: [cpu, k510] ptq: true threshold: 0.98 + - matchs: + target: [k210] + ptq: true + threshold: 0.97 - matchs: target: [k510] ptq: false diff --git a/tests/dataset_utils.py b/tests/dataset_utils.py index ac36956487..385992607c 100644 --- a/tests/dataset_utils.py +++ b/tests/dataset_utils.py @@ -2,7 +2,8 @@ import numpy as np from compare_util import * import copy - +import socket +import json def get_topK(info, k, result): tmp = copy.deepcopy(result) @@ -27,3 +28,85 @@ def sim_run(kmodel, data, paths, target, model_type, model_shape): for i in range(len(tmp)): f.write(tmp[i][0].split("/")[-1] + " " + str(tmp[i][1][0]) + '\n') 
return tmp + +def on_board_run(kmodel, data, paths, target, port, case, nncase_test_ci, input_num, output_num, model_type, model_shape): + # connect server + client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + client_socket.connect(('localhost', int(port))) + + # send header + header_dict = {} + header_dict['case'] = case + header_dict['app'] = 1 + header_dict['kmodel']= 1 + header_dict['inputs'] = 1 + header_dict['outputs'] = 1 + client_socket.sendall(json.dumps(header_dict).encode()) + dummy = client_socket.recv(1024) + + # send app + file_dict = {} + file_dict['file_name'] = os.path.basename(nncase_test_ci) + file_dict['file_size'] = os.path.getsize(nncase_test_ci) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + with open(nncase_test_ci, 'rb') as f: + client_socket.sendall(f.read()) + dummy = client_socket.recv(1024) + + # send kmodel + file_dict['file_name'] = 'test.kmodel' + file_dict['file_size'] = len(kmodel) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + client_socket.sendall(kmodel) + dummy = client_socket.recv(1024) + + # send inputs + for i in range(input_num): + if(model_type != "tflite" and model_shape[-1] != 3): + new_data = np.transpose(data[0], [0, 3, 1, 2]).astype(np.float32) + else: + new_data = data[0].astype(np.float32) + + data_in_bytes = new_data.tobytes() + file_dict['file_name'] = f'input_0_{i}.bin' + file_dict['file_size'] = len(data_in_bytes) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + client_socket.sendall(data_in_bytes) + dummy = client_socket.recv(1024) + + # infer result + cmd_result = client_socket.recv(1024).decode() + if cmd_result.find('succeed') != -1: + client_socket.sendall(f"pls send outputs".encode()) + + # recv outputs + for i in range(output_num): + header = client_socket.recv(1024) + file_size = int(header.decode()) + client_socket.sendall(f"pls send 
nncase_result_{i}.bin".encode()) + + recv_size = 0 + buffer = bytearray(file_size) + while recv_size < file_size: + slice = client_socket.recv(4096) + buffer[recv_size:] = slice + recv_size += len(slice) + + # result + result = np.frombuffer(buffer, dtype=np.float32) + tmp = [] + tmp.append((data[1], get_topK(target, 1, result))) + with open(paths[-1][1], 'a') as f: + for i in range(len(tmp)): + f.write(tmp[i][0].split("/")[-1] + " " + str(tmp[i][1][0]) + '\n') + + client_socket.sendall(f"recv nncase_result_{i}.bin succeed".encode()) + + client_socket.close() + return tmp + else: + client_socket.close() + raise Exception(f'{cmd_result}') \ No newline at end of file diff --git a/tests/importer/onnx_/basic/test_compress.py b/tests/importer/onnx_/basic/test_compress.py new file mode 100644 index 0000000000..0b4f21f630 --- /dev/null +++ b/tests/importer/onnx_/basic/test_compress.py @@ -0,0 +1,124 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np +import random + + +def _make_module(in_shape_0, condition_shape, axis=None): + inputs = [] + outputs = [] + attributes_dict = {} + nodes = [] + + # input + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape_0) + inputs.append('input') + + # output + x = np.random.rand(*in_shape_0).astype(np.float32) + condition = np.array(np.random.rand(*condition_shape) > .5).astype(np.bool_) + if(condition.sum() == 0): + print(condition.sum()) + condition[-1] = True + + output_shape = np.compress(condition, x, axis=axis).shape + output = helper.make_tensor_value_info('output', TensorProto.FLOAT, output_shape) + outputs.append('output') + + condi_data = helper.make_tensor( + 'condi_Constant', + TensorProto.BOOL, + dims=condition_shape, + vals=condition.astype(np.bool_).flatten() + ) + weights_constant = helper.make_node( + "Constant", + inputs=[], + outputs=["condi"], + value=condi_data, + name="condition") + + nodes.append(weights_constant) + if axis is not None: + attributes_dict['axis'] = axis + node = helper.make_node( + 'Compress', + inputs=['input', 'condi'], + outputs=outputs, + **attributes_dict + ) + nodes.append(node) + + graph_def = helper.make_graph( + nodes, + 'test-model', + [input], + [output], + ) + model_def = helper.make_model(graph_def, producer_name='kendryte') + + return model_def + + +in_shapes_0 = [ + [1], + [16], + [1, 16], + [16, 16], + [1, 15, 16], + [1, 3, 3, 3] +] + +condition = [ + [1], + [3], + [6], + +] + +axes = [ + None, + -1, + 0, + 1, + 2, + 3 +] + + +@pytest.mark.parametrize('in_shape_0', in_shapes_0) +@pytest.mark.parametrize('condition', condition) +@pytest.mark.parametrize('axes', axes) +def test_compress(in_shape_0, condition, axes, request): + size = 1 + for x in 
in_shape_0: + size *= x + if((axes != None and axes < len(in_shape_0) and condition[0] <= in_shape_0[axes]) + or (axes == None and condition[0] <= size)): + model_def = _make_module(in_shape_0, condition, axes) + + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_compress.py']) diff --git a/tests/importer/onnx_/basic/test_conv.py b/tests/importer/onnx_/basic/test_conv.py index b58ccc34e1..2cb9717ccf 100644 --- a/tests/importer/onnx_/basic/test_conv.py +++ b/tests/importer/onnx_/basic/test_conv.py @@ -195,7 +195,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil @pytest.mark.parametrize('pad', pads) @pytest.mark.parametrize('stride', strides) def test_conv(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, pad, stride, request): - if (bias_shape is None or (bias_shape is not None and bias_shape[0] == kernel_output_channel)) and ((auto_pad_mode is not None and pad is None) or (auto_pad_mode is None and pad is not None)) and (dilation is None or (auto_pad_modes is None or auto_pad_modes == 'NOTSET')): + if (bias_shape is None or (bias_shape is not None and bias_shape[0] == kernel_output_channel)) and ((auto_pad_mode is not None and pad is None) or (auto_pad_mode is None and pad is not None)) and (dilation is None or auto_pad_mode is None or auto_pad_mode == 'NOTSET'): model_def = _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, pad, stride) diff --git a/tests/importer/onnx_/basic/test_conv_transpose.py b/tests/importer/onnx_/basic/test_conv_transpose.py index ec780ddee3..632a67aa53 100644 --- a/tests/importer/onnx_/basic/test_conv_transpose.py +++ b/tests/importer/onnx_/basic/test_conv_transpose.py @@ -30,10 +30,12 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil input = 
helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) inputs.append('input') + group = 1 if group is None else group + # weight w_shape = [] w_shape.append(in_shape[1]) - w_shape.append(kernel_output_channel) + w_shape.append(kernel_output_channel // group) w_shape.extend(kernel_shape) weight = helper.make_tensor( 'weight', @@ -67,7 +69,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil # output out_shape = [] out_shape.append(in_shape[0]) - out_shape.append(w_shape[1]) + out_shape.append(w_shape[1] * group) # pad padding = [0, 0, 0, 0] @@ -136,7 +138,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil ] kernel_output_channels = [ - 2 + 3 ] bias_shapes = [ @@ -158,6 +160,7 @@ def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dil groups = [ None, + 3 ] kernel_shapes = [ diff --git a/tests/importer/onnx_/basic/test_conv_transpose1d.py b/tests/importer/onnx_/basic/test_conv_transpose1d.py new file mode 100644 index 0000000000..ced69786a1 --- /dev/null +++ b/tests/importer/onnx_/basic/test_conv_transpose1d.py @@ -0,0 +1,203 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import math +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np + + +def _make_module(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, output_padding, pad, stride): + inputs = [] + initializers = [] + + # input + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) + inputs.append('input') + + group = 1 if group is None else group + + # weight + w_shape = [] + w_shape.append(in_shape[1]) + w_shape.append(kernel_output_channel // group) + w_shape.extend(kernel_shape) + weight = helper.make_tensor( + 'weight', + TensorProto.FLOAT, + dims=w_shape, + vals=np.random.rand(*w_shape).astype(np.float32).flatten().tolist() + ) + inputs.append('weight') + initializers.append(weight) + + # bias + if bias_shape is not None: + bias = helper.make_tensor( + 'bias', + TensorProto.FLOAT, + dims=bias_shape, + vals=np.random.rand(*bias_shape).astype(np.float32).flatten().tolist() + ) + inputs.append('bias') + initializers.append(bias) + + # dilation + d = [1] if dilation is None else dilation + + # output_padding + out_padding = [0] if output_padding is None else output_padding + + # stride + s = [1] if stride is None else stride + + # output + out_shape = [] + out_shape.append(in_shape[0]) + out_shape.append(w_shape[1] * group) + + # pad + padding = [0, 0] + if auto_pad_mode in [None, 'NOTSET'] and pad is not None: + padding = pad + out_shape.append(s[0] * (in_shape[2] - 1) + out_padding[0] + + (w_shape[2] - 1) * d[0] + 1 - padding[0] - padding[1]) + elif auto_pad_mode in ['SAME_UPPER', 'SAME_LOWER']: + out_shape.append(in_shape[2] * s[0]) + else: + out_shape.append(in_shape[2] + (in_shape[2] - 1) * (s[0] - 1) - w_shape[2] + 1) + + output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape) + + 
attributes_dict = {} + + if auto_pad_mode is not None: + attributes_dict['auto_pad'] = auto_pad_mode + + if dilation is not None: + attributes_dict['dilations'] = dilation + + if group is not None: + attributes_dict['group'] = group + + if kernel_shape is not None: + attributes_dict['kernel_shape'] = kernel_shape + + if output_padding is not None: + attributes_dict['output_padding'] = output_padding + + if pad is not None: + attributes_dict['pads'] = padding + + if stride is not None: + attributes_dict['strides'] = stride + + node = onnx.helper.make_node( + 'ConvTranspose', + inputs=inputs, + outputs=['output'], + **attributes_dict + ) + + nodes = [] + nodes.append(node) + + graph_def = helper.make_graph( + nodes, + 'test-model', + [input], + [output], + initializer=initializers) + + model_def = helper.make_model(graph_def, producer_name='kendryte') + + return model_def + + +in_shapes = [ + [1, 3, 16] +] + +kernel_output_channels = [ + 3 +] + +bias_shapes = [ + None, +] +bias_shapes.extend(list([[x] for x in kernel_output_channels])) + +auto_pad_modes = [ + None, + 'NOTSET', + 'SAME_UPPER', + 'SAME_LOWER', + 'VALID' +] + +dilations = [ + None, +] + +groups = [ + None, + 3 +] + +kernel_shapes = [ + [3], +] + +output_paddings = [ + None, +] + +pads = [ + # None, + [1, 1], +] + +strides = [ + None, + [2], + [3], +] + + +@pytest.mark.parametrize('in_shape', in_shapes) +@pytest.mark.parametrize('kernel_output_channel', kernel_output_channels) +@pytest.mark.parametrize('bias_shape', bias_shapes) +@pytest.mark.parametrize('auto_pad_mode', auto_pad_modes) +@pytest.mark.parametrize('dilation', dilations) +@pytest.mark.parametrize('group', groups) +@pytest.mark.parametrize('kernel_shape', kernel_shapes) +@pytest.mark.parametrize('output_padding', output_paddings) +@pytest.mark.parametrize('pad', pads) +@pytest.mark.parametrize('stride', strides) +def test_conv_transpose1d(in_shape, kernel_output_channel, bias_shape, auto_pad_mode, dilation, group, kernel_shape, 
output_padding, pad, stride, request): + if (bias_shape is None or (bias_shape is not None and bias_shape[0] == kernel_output_channel)) and ((auto_pad_mode in [None, 'NOTSET'] and pad is not None) or (auto_pad_mode in ['SAME_UPPER', 'SAME_LOWER', 'VALID'] and pad is None)) and (dilation is None or (auto_pad_mode in [None, 'NOTSET'])) and ((output_padding is None) or (output_padding is not None and stride is not None)): + model_def = _make_module(in_shape, kernel_output_channel, bias_shape, + auto_pad_mode, dilation, group, kernel_shape, output_padding, pad, stride) + + runner = OnnxTestRunner(request.node.name, ['k510']) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_conv_transpose1d.py']) diff --git a/tests/importer/onnx_/basic/test_gather_elements.py b/tests/importer/onnx_/basic/test_gather_elements.py new file mode 100644 index 0000000000..e4887b6b8d --- /dev/null +++ b/tests/importer/onnx_/basic/test_gather_elements.py @@ -0,0 +1,91 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""System test: test gather""" +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np + + +def _make_module(in_shape, index, axis): + initializers = [] + attributes_dict = {} + + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) + + # i_shape = list(np.array(indices).shape) + indices = helper.make_tensor( + 'indices', + TensorProto.INT32, + np.array(index).shape, + np.array(index).astype(np.int32).flatten()) + initializers.append(indices) + + # axis + if axis is not None: + default_axis = axis + else: + default_axis = 0 + attributes_dict['axis'] = default_axis + + output = helper.make_tensor_value_info( + 'output', TensorProto.FLOAT, np.array(index).shape) + + node = onnx.helper.make_node( + 'GatherElements', + inputs=['input', 'indices'], + outputs=['output'], + **attributes_dict + ) + + graph_def = helper.make_graph( + [node], + 'test-model', + [input], + [output], + initializer=initializers + ) + + return helper.make_model(graph_def, producer_name='onnx') + + +# input_shape, indices_data, axis +# input_shape[i] >= indices_data.shape[i] +in_shapes_indices_dim = [ + ([2, 2], [[0, 0], [1, 0]], 0), + ([2, 3], [[0, 2], [1, 0]], 1), + ([2, 3], [[0, -2], [-1, 0]], 1), + ([1, 3, 3], [[[1, 2, 0], [2, 0, 0]]], 2), + ([1, 3, 3], [[[1, 2, 0], [2, 0, 0]]], 1), + ([4, 2, 3], [[[1, 2, 0], [2, 0, 0]], [[3, 2, 1], [2, 3, 1]]], 0), + ([1, 5, 3], [[[1, 2, 0], [2, 0, 0]]], 1), + ([2, 5, 6], [[[1, 2, 0], [2, 0, 0]], [[3, 2, 1], [2, 3, 1]]], 2), + +] + + +@pytest.mark.parametrize('in_shape, indices, axis', in_shapes_indices_dim) +def test_gather_elements(in_shape, indices, axis, request): + model_def = _make_module(in_shape, indices, axis) + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + 
+if __name__ == "__main__": + pytest.main(['-vv', 'test_gather_elements.py']) diff --git a/tests/importer/onnx_/basic/test_gather_nd.py b/tests/importer/onnx_/basic/test_gather_nd.py index 26981f1929..c1f9f9d478 100644 --- a/tests/importer/onnx_/basic/test_gather_nd.py +++ b/tests/importer/onnx_/basic/test_gather_nd.py @@ -53,7 +53,10 @@ def _make_module(in_shape, indices, batch_dims): initializer=initializers ) - return helper.make_model(graph_def, producer_name='kendryte') + # todo: support other opset + op = onnx.OperatorSetIdProto() + op.version = 12 + return helper.make_model(graph_def, producer_name='kendryte', opset_imports=[op]) in_shapes_indices_dim = [ diff --git a/tests/importer/onnx_/basic/test_gru.py b/tests/importer/onnx_/basic/test_gru.py new file mode 100644 index 0000000000..0c04c0a658 --- /dev/null +++ b/tests/importer/onnx_/basic/test_gru.py @@ -0,0 +1,222 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np + + + +def _make_module(direction, hidden_size, seq_length, batch_size, input_size, bias, sequence_lens, initial_h, Y, Y_h, + LBR): + nodes_inputs = [] + nodes_outputs = [] + initializers = [] + attributes_dict = {} + nodes = [] + graph_inputs = [] + graph_outputs = [] + + num_directions = 2 if direction == 'bidirectional' else 1 + if direction is not None: + attributes_dict['direction'] = direction + attributes_dict['hidden_size'] = hidden_size + attributes_dict['linear_before_reset'] = LBR + + # input + input_shape = [seq_length, batch_size, input_size] + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, input_shape) + nodes_inputs.append('input') + graph_inputs.append(input) + + w_shape = [num_directions, 3 * hidden_size, input_size] + w_tensor = helper.make_tensor( + 'W', + TensorProto.FLOAT, + dims=w_shape, + vals=(np.random.rand(*w_shape) * 2 - 1).astype(np.float32).flatten().tolist() + ) + nodes_inputs.append('W') + initializers.append(w_tensor) + + r_shape = [num_directions, 3 * hidden_size, hidden_size] + r_tensor = helper.make_tensor( + 'R', + TensorProto.FLOAT, + dims=r_shape, + vals=(np.random.rand(*r_shape) * 2 - 1).astype(np.float32).flatten().tolist() + ) + nodes_inputs.append('R') + initializers.append(r_tensor) + + # bias + if bias is None: + nodes_inputs.append('') + else: + bias_shape = [num_directions, 6 * hidden_size] + bias_tensor = helper.make_tensor( + 'B', + TensorProto.FLOAT, + dims=bias_shape, + vals=(np.random.rand(*bias_shape) * 2 - 1).astype(np.float32).flatten().tolist() + ) + nodes_inputs.append('B') + initializers.append(bias_tensor) + + if sequence_lens is None: + nodes_inputs.append('') + else: + sequence_lens_shape = [batch_size] + sequence_lens_tensor = 
helper.make_tensor( + 'sequence_lens', + TensorProto.INT32, + dims=sequence_lens_shape, + vals=np.full(sequence_lens_shape, seq_length).flatten().tolist() + ) + nodes_inputs.append('sequence_lens') + initializers.append(sequence_lens_tensor) + + if initial_h is None: + nodes_inputs.append('') + else: + initial_h_shape = [num_directions, batch_size, hidden_size] + initial_h_tensor = helper.make_tensor( + 'initial_h', + TensorProto.FLOAT, + dims=initial_h_shape, + vals=np.random.rand(*initial_h_shape).astype(np.float32).flatten().tolist() + ) + nodes_inputs.append('initial_h') + initializers.append(initial_h_tensor) + + # output + if Y is None: + nodes_outputs.append('') + else: + output_shape = [seq_length, num_directions, batch_size, hidden_size] + output = helper.make_tensor_value_info('Y', TensorProto.FLOAT, output_shape) + nodes_outputs.append('Y') + graph_outputs.append(output) + + if Y_h is None: + nodes_outputs.append('') + else: + h_shape = [num_directions, batch_size, hidden_size] + y_h = helper.make_tensor_value_info('Y_h', TensorProto.FLOAT, h_shape) + nodes_outputs.append('Y_h') + graph_outputs.append(y_h) + + # GRU node + node = onnx.helper.make_node( + 'GRU', + inputs=nodes_inputs, + outputs=nodes_outputs, + **attributes_dict + ) + nodes.append(node) + + # graph + graph_def = helper.make_graph( + nodes, + 'test-model', + graph_inputs, + graph_outputs, + initializer=initializers + ) + + model_def = helper.make_model(graph_def, producer_name='onnx') + + return model_def + + +directions = [ + None, + 'forward', + 'reverse', + 'bidirectional' +] + +hidden_sizes = [ + 32, +] + +seq_lengths = [ + 4, +] + +batch_sizes = [ + 16, +] + +input_sizes = [ + 64, +] + +biases = [ + None, + 1 +] + +sequence_lenses = [ + None, +] + +initial_hs = [ + None, + 1 +] + +Ys = [ + # None, // at least one output must be requested + 1 +] + +Y_hs = [ + None, + 1 +] + +LBRs = [ + 0, + 1 +] + + +@pytest.mark.parametrize('direction', directions) +@pytest.mark.parametrize('hidden_size', 
hidden_sizes) +@pytest.mark.parametrize('seq_length', seq_lengths) +@pytest.mark.parametrize('batch_size', batch_sizes) +@pytest.mark.parametrize('input_size', input_sizes) +@pytest.mark.parametrize('bias', biases) +@pytest.mark.parametrize('sequence_lens', sequence_lenses) +@pytest.mark.parametrize('initial_h', initial_hs) +@pytest.mark.parametrize('Y', Ys) +@pytest.mark.parametrize('Y_h', Y_hs) +@pytest.mark.parametrize('LBR', LBRs) +def test_gru(direction, hidden_size, seq_length, batch_size, input_size, bias, sequence_lens, initial_h, Y, Y_h, LBR, + request): + model_def = _make_module(direction, hidden_size, seq_length, batch_size, + input_size, bias, sequence_lens, initial_h, Y, Y_h, LBR) + + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_gru.py']) diff --git a/tests/importer/onnx_/basic/test_layer_norm.py b/tests/importer/onnx_/basic/test_layer_norm.py new file mode 100644 index 0000000000..63f611f920 --- /dev/null +++ b/tests/importer/onnx_/basic/test_layer_norm.py @@ -0,0 +1,103 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +from importlib import import_module +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np + + +def _make_module(in_shape, axis, epsilon): + + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) + output = helper.make_tensor_value_info('output', TensorProto.FLOAT, in_shape) + + initializers = [] + actual_axis = -1 if axis is None else axis + scale = helper.make_tensor("scale", + TensorProto.FLOAT, + dims=in_shape[actual_axis:], + vals=np.random.randn(*in_shape[actual_axis:]).astype(np.float32).flatten().tolist()) + initializers.append(scale) + + bias = helper.make_tensor("bias", + TensorProto.FLOAT, + dims=in_shape[actual_axis:], + vals=np.random.randn(*in_shape[actual_axis:],).astype(np.float32).flatten().tolist()) + initializers.append(bias) + + if axis is None and epsilon is None: + node = onnx.helper.make_node('LayerNormalization', + inputs=['input', 'scale', 'bias'], + outputs=['output']) + elif axis is None: + node = onnx.helper.make_node('LayerNormalization', + inputs=['input', 'scale', 'bias'], + outputs=['output'], + epsilon=epsilon) + elif epsilon is None: + node = onnx.helper.make_node('LayerNormalization', + inputs=['input', 'scale', 'bias'], + outputs=['output'], + axis=axis) + else: + node = onnx.helper.make_node('LayerNormalization', + inputs=['input', 'scale', 'bias'], + outputs=['output'], + axis=axis, + epsilon=epsilon) + + graph_def = helper.make_graph([node], 'test-model', [input], [output], initializer=initializers) + op = onnx.OperatorSetIdProto() + op.version = 17 + model_def = helper.make_model(graph_def, producer_name='onnx', opset_imports=[op]) + + return model_def + + +in_shapes = [ + [1, 24, 256] +] + +axes = [ + None, + -1, + 2, + 1, + 0 +] + +epsilons = [ + None, + 1e-2 +] + + +@pytest.mark.parametrize('in_shape', 
in_shapes) +@pytest.mark.parametrize('axis', axes) +@pytest.mark.parametrize('epsilon', epsilons) +def test_layer_norm(in_shape, axis, epsilon, request): + model_def = _make_module(in_shape, axis, epsilon) + + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_layer_norm.py']) diff --git a/tests/importer/onnx_/basic/test_pool.py b/tests/importer/onnx_/basic/test_pool.py index 7336279a1e..30ac7ae95c 100644 --- a/tests/importer/onnx_/basic/test_pool.py +++ b/tests/importer/onnx_/basic/test_pool.py @@ -70,7 +70,6 @@ def forward(self, x): True ] - @pytest.mark.parametrize('in_shape', in_shapes) @pytest.mark.parametrize('kernel_size', kernel_sizes) @pytest.mark.parametrize('stride', strides) diff --git a/tests/importer/onnx_/basic/test_pool2.py b/tests/importer/onnx_/basic/test_pool2.py new file mode 100644 index 0000000000..6e57c2c93b --- /dev/null +++ b/tests/importer/onnx_/basic/test_pool2.py @@ -0,0 +1,111 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np +import math + + +def _make_module(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode): + nodes = [] + initializers = [] + inputs = [] + outputs = [] + + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) + inputs.append('input') + + out_shape = in_shape.copy() + out_shape[2] = (in_shape[2] + padding[0] + padding[2] - kernel_size[0]) // stride[0] + 1 if ceil_mode == 0 else math.ceil((in_shape[2] + padding[0] + padding[2] - kernel_size[0]) / stride[0]) + 1 + out_shape[3] = (in_shape[3] + padding[1] + padding[3] - kernel_size[1]) // stride[1] + 1 if ceil_mode == 0 else math.ceil((in_shape[3] + padding[1] + padding[3] - kernel_size[1]) / stride[1]) + 1 + output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape) + outputs.append('output') + + node = onnx.helper.make_node( + 'MaxPool', + inputs=inputs, + outputs=outputs, + kernel_shape=kernel_size, + strides=stride, + ceil_mode=ceil_mode, + pads=padding) + + nodes.append(node) + + graph_def = helper.make_graph( + nodes, + 'test-model', + [input], + [output], + initializer=initializers) + + op = onnx.OperatorSetIdProto() + op.version = 11 + model_def = helper.make_model(graph_def, producer_name='kendryte', opset_imports=[op]) + + return model_def + + +in_shapes = [ + [1, 3, 60, 72], +] + +kernel_sizes = [ + (3, 3), +] + +strides = [ + (1, 1), + (2, 2), + [2, 1] +] + +paddings = [ + (0, 0, 0, 0), + (1, 1, 1, 1), + (1, 1, 1, 2) +] + +count_include_pads = [ + False, + True +] + +ceil_modes = [ + False, + True +] + +@pytest.mark.parametrize('in_shape', in_shapes) +@pytest.mark.parametrize('kernel_size', kernel_sizes) +@pytest.mark.parametrize('stride', strides) +@pytest.mark.parametrize('padding', paddings) 
+@pytest.mark.parametrize('count_include_pad', count_include_pads) +@pytest.mark.parametrize('ceil_mode', ceil_modes) +def test_pool2(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode, request): + if kernel_size[0] / 2 > padding[0] and kernel_size[1] / 2 > padding[1]: + module = _make_module(in_shape, kernel_size, stride, padding, count_include_pad, ceil_mode) + + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(module) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_pool2.py']) diff --git a/tests/importer/onnx_/basic/test_reduce.py b/tests/importer/onnx_/basic/test_reduce.py index de48be47c2..3ef1c0906f 100644 --- a/tests/importer/onnx_/basic/test_reduce.py +++ b/tests/importer/onnx_/basic/test_reduce.py @@ -63,7 +63,12 @@ def _make_module(in_type, in_shape, reduce_op, axes, keepdims): [output], initializer=initializers) - model_def = helper.make_model(graph_def, producer_name='onnx') + if reduce_op=='ReduceSum': + op = onnx.OperatorSetIdProto() + op.version = 11 + model_def = helper.make_model(graph_def, producer_name='onnx', opset_imports=[op]) + else: + model_def = helper.make_model(graph_def, producer_name='onnx') return model_def @@ -80,6 +85,7 @@ def _make_module(in_type, in_shape, reduce_op, axes, keepdims): 'ReduceMax', 'ReduceMean', 'ReduceMin', + 'ReduceSum' ] axes_list = [ diff --git a/tests/importer/onnx_/basic/test_roi_align.py b/tests/importer/onnx_/basic/test_roi_align.py index c32c2e8597..61088cc074 100644 --- a/tests/importer/onnx_/basic/test_roi_align.py +++ b/tests/importer/onnx_/basic/test_roi_align.py @@ -21,7 +21,7 @@ import numpy as np import copy -def _make_module(in_shape, rois, batch_indices, mode, output_height, output_width, sampling_ratio, spatial_scale): +def _make_module(in_shape, rois, batch_indices, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version): inputs = [] outputs = [] initializers = [] @@ -54,7 +54,7 @@ def 
_make_module(in_shape, rois, batch_indices, mode, output_height, output_widt inputs.append('batch_indices') # output - out_shape = [rois_array.shape[0], in_shape[1], output_height, output_width] + out_shape = [rois_array.shape[0], in_shape[1], output_height if output_height is not None else 1, output_width if output_width is not None else 1] output = helper.make_tensor_value_info('output', TensorProto.FLOAT, out_shape) outputs.append('output') @@ -95,7 +95,9 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt initializer=initializers ) - model_def = helper.make_model(graph_def, producer_name='onnx') + op = onnx.OperatorSetIdProto() + op.version = op_version + model_def = helper.make_model(graph_def, producer_name='onnx helper', opset_imports=[op]) return model_def @@ -137,6 +139,10 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt 1.0 ] +op_versions = [ + 10 +] + @pytest.mark.parametrize('in_shape', in_shapes) @pytest.mark.parametrize('roi', rois) @pytest.mark.parametrize('batch_index', batch_indices) @@ -145,8 +151,9 @@ def _make_module(in_shape, rois, batch_indices, mode, output_height, output_widt @pytest.mark.parametrize('output_width', output_widths) @pytest.mark.parametrize('sampling_ratio', sampling_ratios) @pytest.mark.parametrize('spatial_scale', spatial_scales) -def test_roi_align(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, request): - model_def = _make_module(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale) +@pytest.mark.parametrize('op_version', op_versions) +def test_roi_align(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version, request): + model_def = _make_module(in_shape, roi, batch_index, mode, output_height, output_width, sampling_ratio, spatial_scale, op_version) runner = OnnxTestRunner(request.node.name) model_file = 
runner.from_onnx_helper(model_def) diff --git a/tests/importer/onnx_/basic/test_slice.py b/tests/importer/onnx_/basic/test_slice.py index 3dc04e46fe..45c86684c3 100644 --- a/tests/importer/onnx_/basic/test_slice.py +++ b/tests/importer/onnx_/basic/test_slice.py @@ -21,7 +21,7 @@ import numpy as np -def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_format): +def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_format, attribute_dtype): input_names = [] output_names = [] inputs = [] @@ -52,7 +52,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f # starts start_tensor = helper.make_tensor( 'starts', - TensorProto.INT64, + attribute_dtype, dims=[len(start)], vals=start ) @@ -73,7 +73,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f # ends end_tensor = helper.make_tensor( 'ends', - TensorProto.INT64, + attribute_dtype, dims=[len(end)], vals=end ) @@ -95,7 +95,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f if axes is not None: axes_tensor = helper.make_tensor( 'axes', - TensorProto.INT64, + attribute_dtype, dims=[len(end)], vals=axes ) @@ -117,7 +117,7 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f if step is not None: step_tensor = helper.make_tensor( 'steps', - TensorProto.INT64, + attribute_dtype, dims=[len(step)], vals=step ) @@ -183,16 +183,22 @@ def _make_module(in_shape, start, end, axes, step, outshape, op_version, value_f [13, 'constant'] ] +attribute_dtypes = [ + TensorProto.INT64, + TensorProto.INT32 +] + @pytest.mark.parametrize('in_shape', in_shapes) @pytest.mark.parametrize('start_end_axes_step_outshape', starts_ends_axes_steps_outshapes) @pytest.mark.parametrize('op_versions_and_value_format', op_versions_and_value_formats) -def test_slice(in_shape, start_end_axes_step_outshape, op_versions_and_value_format, request): 
+@pytest.mark.parametrize('attribute_dtype', attribute_dtypes) +def test_slice(in_shape, start_end_axes_step_outshape, op_versions_and_value_format, attribute_dtype, request): start, end, axes, step, outshape = start_end_axes_step_outshape op_version, value_format = op_versions_and_value_format - if op_version != 1 or (op_version == 1 and step is not None and all([x == 1 for x in step])): + if op_version != 1 or (op_version == 1 and step is not None and all([x == 1 for x in step]) and attribute_dtype == TensorProto.INT64): model_def = _make_module(in_shape, start, end, axes, step, - outshape, op_version, value_format) + outshape, op_version, value_format, attribute_dtype) runner = OnnxTestRunner(request.node.name) model_file = runner.from_onnx_helper(model_def) diff --git a/tests/importer/onnx_/basic/test_threadholdrelu.py b/tests/importer/onnx_/basic/test_threadholdrelu.py new file mode 100644 index 0000000000..16a7e2d8c9 --- /dev/null +++ b/tests/importer/onnx_/basic/test_threadholdrelu.py @@ -0,0 +1,101 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +import numpy as np +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner + + +def _make_module(in_shape, alpha): + inputs = [] + outputs = [] + initializers = [] + attributes_dict = {} + nodes = [] + + # input + input = helper.make_tensor_value_info('input', TensorProto.FLOAT, in_shape) + inputs.append('input') + + # output + output = helper.make_tensor_value_info('output', TensorProto.FLOAT, in_shape) + outputs.append('output') + + # alpha + if alpha is not None: + attributes_dict['alpha'] = alpha + + tensor = helper.make_tensor( + 'input2', + TensorProto.FLOAT, + dims=in_shape, + vals=(np.random.rand(*in_shape) + 2).astype(onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[TensorProto.FLOAT]).flatten().tolist() + ) + # inputs.append('input2') + initializers.append(tensor) + + # enable default alphas: None -> 1 + node = onnx.helper.make_node( + 'Mul', + inputs=[inputs[0], 'input2'], + outputs=['0'], + ) + nodes.append(node) + + # ThresholdedRelu node + node = onnx.helper.make_node( + 'ThresholdedRelu', + inputs=['0'], + outputs=outputs, + **attributes_dict + ) + nodes.append(node) + + graph_def = helper.make_graph( + nodes, + 'test-model', + [input], + [output], + initializer=initializers) + + model_def = helper.make_model(graph_def, producer_name='onnx') + + return model_def + + +in_shapes = [ + [1, 3, 16, 16] +] + +alphas = [ + None, + 0.5, + 1.5 +] + +@pytest.mark.parametrize('in_shape', in_shapes) +@pytest.mark.parametrize('alpha', alphas) +def test_threadholdrelu(in_shape, alpha, request): + model_def = _make_module(in_shape, alpha) + + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + +if __name__ == "__main__": + pytest.main(['-vv', 'test_threadholdrelu.py']) diff --git a/tests/importer/onnx_/basic/test_unary.py
b/tests/importer/onnx_/basic/test_unary.py index 1137dd0a06..bf470172b4 100644 --- a/tests/importer/onnx_/basic/test_unary.py +++ b/tests/importer/onnx_/basic/test_unary.py @@ -13,12 +13,137 @@ # limitations under the License. # pylint: disable=invalid-name, unused-argument, import-outside-toplevel +import sys import pytest -import torch -# import test_util +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto from onnx_test_runner import OnnxTestRunner +import numpy as np + +def _make_module(op, in_type, in_shape): + inputs = [] + outputs = [] + # initializers = [] + attributes_dict = {} + nodes = [] + + # input1 + input1 = helper.make_tensor_value_info('input1', in_type, in_shape) + inputs.append('input1') + + output_shape = in_shape + output = helper.make_tensor_value_info('output', in_type, output_shape) + outputs.append('output') + + node = onnx.helper.make_node( + op, + inputs=inputs, + outputs=outputs, + **attributes_dict + ) + + nodes.append(node) + graph_def = helper.make_graph( + nodes, + 'test-model', + [input1], + [output], + initializer=None) + + model_def = helper.make_model(graph_def, producer_name='onnx') + return model_def +in_shapes = [ + [16], + [1, 3, 16, 16] +] + +# calc operators +ops = [ + # 'Rsqrt', 'Square' # these 2 operators are not supported yet + 'Ceil', + 'Floor', + 'Round', + 'Sqrt', + 'Tanh', + 'Erf', + 'Abs', + 'Acos', + 'Asin', + 'Exp', + 'Log', + 'Neg', + 'Sign', + 'Sin', + 'Cos', +] + +# calc operators data type +in_types = [ + TensorProto.FLOAT, + # TensorProto.INT32, // Not supported at present + # TensorProto.INT8, // Not supported at present + # TensorProto.INT64, // Not supported at present +] + +# logical operators +logical_ops = [ + 'Not' +] + +# logical operators data type +logical_types = [ + TensorProto.BOOL +] + +# operators and types group +op_type_pairs = [ + [logical_ops, logical_types], + [ops, in_types] +] + +def get_case_data(in_datas): + case_data = [] + for op_types in in_datas: + _ops =
op_types[0] + _types = op_types[1] + for _op in _ops: + for _type in _types: + tmp_pair = [] + tmp_pair.append(_op) + tmp_pair.append(_type) + case_data.append(tmp_pair) + return case_data + pass + +class TestUnaryModule(object): + + def setup_class(self): + pass + + def teardown_class(self): + pass + + # get the test case + case_data=get_case_data(op_type_pairs) + print(case_data) + + @pytest.mark.parametrize('in_shape', in_shapes) + @pytest.mark.parametrize('op, in_type', case_data) + def test_unary(self, op, in_type, in_shape, request): + model_def = _make_module(op, in_type, in_shape) + runner = OnnxTestRunner(request.node.name) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + pass + +''' +import pytest +import torch +import test_util +from onnx_test_runner import OnnxTestRunner def _make_module(): class UnaryModule(torch.nn.Module): def __init__(self): @@ -39,17 +164,10 @@ def forward(self, x): outs.append(torch.sin(x)) outs.append(torch.sqrt(x + 2)) outs.append(torch.tanh(x)) + outs.append(torch.rsqrt(x + 2)) return outs return UnaryModule() - - -in_shapes = [ - [16], - [1, 3, 16, 16] -] - - @pytest.mark.parametrize('in_shape', in_shapes) def test_unary(in_shape, request): module = _make_module() @@ -57,7 +175,7 @@ def test_unary(in_shape, request): runner = OnnxTestRunner(request.node.name) model_file = runner.from_torch(module, in_shape) runner.run(model_file) - +''' if __name__ == "__main__": - pytest.main(['-vv', 'test_unary.py']) + pytest.main(['-v', 'test_unary.py']) diff --git a/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py new file mode 100644 index 0000000000..1d8d0e5d66 --- /dev/null +++ b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_binary.py @@ -0,0 +1,100 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import onnx +from onnx import helper +from onnx import AttributeProto, TensorProto, GraphProto +from onnx_test_runner import OnnxTestRunner +import numpy as np + +def _make_module(op, in_type, in_shape_0, in_shape_1): + inputs = [] + outputs = [] + initializers = [] + attributes_dict = {} + nodes = [] + + # input1 + input1 = helper.make_tensor_value_info('input1', in_type, in_shape_0) + inputs.append('input1') + + # set input2 to avoid SIGFPE for div op. + tensor = helper.make_tensor( + 'input2', + in_type, + dims=in_shape_1, + vals=(np.random.rand(*in_shape_1) + 2).astype(onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[in_type]).flatten().tolist() + ) + inputs.append('input2') + initializers.append(tensor) + + # output + x = np.random.randn(*in_shape_0) + y = np.random.randn(*in_shape_1) + output_shape = np.add(x, y).shape + output = helper.make_tensor_value_info('output', in_type, output_shape) + outputs.append('output') + + node = onnx.helper.make_node( + op, + inputs=inputs, + outputs=outputs, + **attributes_dict + ) + nodes.append(node) + + graph_def = helper.make_graph( + nodes, + 'test-model', + [input1], + [output], + initializer=initializers) + + model_def = helper.make_model(graph_def, producer_name='onnx') + return model_def + +ops = [ + 'Add', +] + +in_types = [ + TensorProto.FLOAT, +] + +in_shapes = [ + [[1, 3, 4, 5, 2], [1]], + [[4, 3, 4, 5, 2], [2]], + [[1, 3, 4, 5, 2], [1, 3, 4, 1, 1]], + [[1, 3, 16, 16, 2], [1, 1, 1, 16, 1]], + [[1, 3, 16, 16, 2], [1, 3, 1, 16, 
1]], + [[2, 3, 16, 16, 2], [2, 1, 16, 1, 2]], + [[1, 3, 16, 16, 2, 3], [1, 3, 1, 16, 1, 1]], + [[1, 3, 16, 16, 2, 3], [1, 3, 1, 16, 2, 1]], +] + +@pytest.mark.parametrize('op', ops) +@pytest.mark.parametrize('in_type', in_types) +@pytest.mark.parametrize('in_shape', in_shapes) +def test_squeeze_binary(op, in_type, in_shape, request): + model_def = _make_module(op, in_type, in_shape[0], in_shape[1]) + + runner = OnnxTestRunner(request.node.name, ['cpu', 'k210', 'k510']) + model_file = runner.from_onnx_helper(model_def) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_squeeze_binary.py']) diff --git a/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py new file mode 100644 index 0000000000..4057c6b8ce --- /dev/null +++ b/tests/importer/onnx_/combine/squeeze_dim/test_squeeze_transpose.py @@ -0,0 +1,57 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""System test: test transpose""" +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import tensorflow as tf +import numpy as np +from tflite_test_runner import TfliteTestRunner + + +def _make_module(in_shape, perm): + class TransposeModule(tf.Module): + def __init__(self): + super(TransposeModule).__init__() + + @tf.function(input_signature=[tf.TensorSpec(in_shape, tf.float32)]) + def __call__(self, x): + return tf.transpose(x, perm=perm) + return TransposeModule() + + +in_shapes = [ + [8, 3, 64, 3, 4], + [1, 3, 8, 8, 4] +] + +perms = [ + [2, 1, 0, 4, 3], #CPU + [0, 1, 3, 4, 2] #target +] + + +@pytest.mark.parametrize('in_shape', in_shapes) +@pytest.mark.parametrize('perm', perms) +def test_squeeze_transpose(in_shape, perm, request): + if len(perm) == len(in_shape): + module = _make_module(in_shape, perm) + + runner = TfliteTestRunner(request.node.name, ['cpu', 'k210', 'k510']) + model_file = runner.from_tensorflow(module) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_squeeze_transpose.py']) diff --git a/tests/importer/tflite_/basic/test_compare.py b/tests/importer/tflite_/basic/test_compare.py index 321a8cc1bd..7c4b8caea0 100644 --- a/tests/importer/tflite_/basic/test_compare.py +++ b/tests/importer/tflite_/basic/test_compare.py @@ -23,7 +23,7 @@ def _make_module(compare_op, in_type_0, in_shape_0, in_type_1, in_shape_1): class CompareModule(tf.Module): def __init__(self): super(CompareModule).__init__() - self.v = tf.constant(np.random.rand(*in_shape_1).astype(in_type_1)) + self.v = tf.constant((np.ones(in_shape_1)/2.0).astype(in_type_1)) @tf.function(input_signature=[tf.TensorSpec(in_shape_0, in_type_0)]) def __call__(self, x): diff --git a/tests/importer/tflite_/basic/test_conv2d.py b/tests/importer/tflite_/basic/test_conv2d.py index 8e31c43ed5..f1a6dba7d0 100644 --- a/tests/importer/tflite_/basic/test_conv2d.py +++ b/tests/importer/tflite_/basic/test_conv2d.py @@ -89,7 +89,7 @@ 
def __call__(self, x): @pytest.mark.parametrize('padding', paddings) @pytest.mark.parametrize('dilations', dilations) def test_conv2d(n, i_channels, i_size, k_size, o_channels, strides, padding, dilations, request): - if padding != 'VALID' or (k_size[0] <= i_size[0] and k_size[1] <= i_size[1]): + if k_size[0] <= i_size[0] and k_size[1] <= i_size[1] and strides[0] <= k_size[0] and strides[1] <= k_size[1]: module = _make_module(n, i_channels, i_size, k_size, o_channels, strides, padding, dilations) diff --git a/tests/importer/tflite_/basic/test_space_to_batch.py b/tests/importer/tflite_/basic/test_space_to_batch.py new file mode 100644 index 0000000000..7e6559481b --- /dev/null +++ b/tests/importer/tflite_/basic/test_space_to_batch.py @@ -0,0 +1,58 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +import pytest +import os +import tensorflow as tf +import numpy as np +import sys +from tflite_test_runner import TfliteTestRunner + + +def _make_module(in_shape, block_shape, paddings): + class SpaceToBatchModule(tf.Module): + def __init__(self): + super(SpaceToBatchModule).__init__() + + @tf.function(input_signature=[tf.TensorSpec(in_shape, tf.float32)]) + def __call__(self, x): + return tf.space_to_batch(x, block_shape, paddings) + return SpaceToBatchModule() + +in_shapes = [ + [1, 16, 16, 3] +] + +block_shapes = [ + [2, 2], +] + +paddings = [ + [[0, 0], [0, 0]], + [[0, 2], [0, 2]], + [[2, 0], [2, 0]], + [[2, 2], [2, 2]] +] + +@pytest.mark.parametrize('in_shape', in_shapes) +@pytest.mark.parametrize('block_shape', block_shapes) +@pytest.mark.parametrize('padding', paddings) +def test_space_to_batch(in_shape, block_shape,padding, request): + module = _make_module(in_shape, block_shape, padding) + runner = TfliteTestRunner(request.node.name) + model_file = runner.from_tensorflow(module) + runner.run(model_file) + +if __name__ == "__main__": + pytest.main(['-vv', 'test_space_to_batch.py']) diff --git a/tests/importer/tflite_/model/test_mobilenetv1.py b/tests/importer/tflite_/model/test_mobilenetv1.py index e3f1ce3179..987a910e53 100644 --- a/tests/importer/tflite_/model/test_mobilenetv1.py +++ b/tests/importer/tflite_/model/test_mobilenetv1.py @@ -40,20 +40,55 @@ def _make_module(in_shape, alpha): def test_mobilenetv1(in_shape, alpha, request): module = _make_module(in_shape, alpha) overwrite_cfg = """ - judge: - specifics: - - matchs: - target: [cpu, k510] - ptq: true - threshold: 0.98 - - matchs: - target: [k210] - ptq: true - threshold: 0.94 - - matchs: - target: [k510] - ptq: false - threshold: 0.99 + case: + preprocess_opt: + - name: preprocess + values: + - true + - name: swapRB + values: + - false + - name: input_shape + values: + - [1,224,224,3] + - name: mean + values: + - 
[0.5,0.5,0.5] + - name: std + values: + - [0.5,0.5,0.5] + - name: input_range + values: + - [0,1] + - name: input_type + values: + - float32 + - name: model_layout + values: + - NHWC + - name: input_layout + values: + - NHWC + - name: output_layout + values: + - NHWC + - name: letterbox_value + values: + - 0. + judge: + specifics: + - matchs: + target: [cpu, k510] + ptq: true + threshold: 0.97 + - matchs: + target: [k210] + ptq: true + threshold: 0.94 + - matchs: + target: [k510] + ptq: false + threshold: 0.99 """ runner = TfliteTestRunner(request.node.name, overwrite_configs=overwrite_cfg) model_file = runner.from_tensorflow(module) diff --git a/tests/onnx_test_runner.py b/tests/onnx_test_runner.py index 3f7c9d28c6..559c6169ae 100644 --- a/tests/onnx_test_runner.py +++ b/tests/onnx_test_runner.py @@ -63,6 +63,7 @@ def run(self, model_file): model_file = os.path.join( os.path.dirname(model_file), 'simplified.onnx') + onnx_model = onnx.shape_inference.infer_shapes(onnx_model) onnx.save_model(onnx_model, model_file) super().run(model_file) @@ -115,10 +116,10 @@ def parse_model_input_output(self, model_file: str): input_dict = {} input_dict['name'] = e.name input_dict['dtype'] = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[onnx_type.elem_type] - input_dict['shape'] = [(i.dim_value if i.dim_value != 0 else d) for i, d in zip( - onnx_type.shape.dim, [1, 3, 224, 224])] - input_dict['model_shape'] = [(i.dim_value if i.dim_value != 0 else d) for i, d in zip( - onnx_type.shape.dim, [1, 3, 224, 224])] + input_dict['shape'] = [(i.dim_value if i.dim_value != 0 else 10) for i in + onnx_type.shape.dim] + input_dict['model_shape'] = [(i.dim_value if i.dim_value != 0 else 10) for i in + onnx_type.shape.dim] self.inputs.append(input_dict) self.calibs.append(copy.deepcopy(input_dict)) self.dump_range_data.append(copy.deepcopy(input_dict)) @@ -150,6 +151,7 @@ def cpu_infer(self, case_dir: str, model_file: bytes, type: str, mode: str): onnx_model = onnx.load(model_file) onnx_model = 
version_converter.convert_version(onnx_model, 8) model_file = os.path.join(case_dir, 'converted.onnx') + onnx_model = onnx.shape_inference.infer_shapes(onnx_model) onnx.save_model(onnx_model, model_file) sess = ort.InferenceSession(model_file) diff --git a/tests/schedule/buffer_fusion/test_bitcast.py b/tests/schedule/buffer_fusion/test_bitcast.py new file mode 100644 index 0000000000..5797e0c29e --- /dev/null +++ b/tests/schedule/buffer_fusion/test_bitcast.py @@ -0,0 +1,42 @@ +# Copyright 2019-2021 Canaan Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel + +import pytest +import tensorflow as tf +import numpy as np +from tflite_test_runner import TfliteTestRunner + + +def _make_module(): + class Module(tf.Module): + def __init__(self): + super(Module).__init__() + + @tf.function(input_signature=[tf.TensorSpec([1, 4, 4, 3], tf.float32)]) + def __call__(self, x): + return tf.reshape(x, [1, -1, 3]) + return Module() + + +def test_bitcast(request): + module = _make_module() + + runner = TfliteTestRunner(request.node.name) + model_file = runner.from_tensorflow(module) + runner.run(model_file) + + +if __name__ == "__main__": + pytest.main(['-vv', 'test_bitcast.py']) diff --git a/tests/test_runner.py b/tests/test_runner.py index 220eeb89bf..f2605597b5 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -20,7 +20,8 @@ from compare_util import compare from dataset_utils import * from models.preprocess.preprocess import preprocess - +import socket +import json class Edict: def __init__(self, d: Dict[str, int]) -> None: @@ -108,7 +109,7 @@ def generate_random(shape: List[int], dtype: np.dtype, elif dtype == np.bool: data = np.random.rand(*shape) > 0.5 else: - data = np.random.rand(*shape) + data = np.random.uniform(0.01, 1, shape) data = data.astype(dtype=dtype) if abs: return np.abs(data) @@ -231,7 +232,16 @@ def __init__(self, case_name, targets=None, overwrite_configs: Union[Dict, str] self.case_dir = os.path.join(self.cfg.setup.root, case_name) self.clear(self.case_dir) - self.validate_targets(targets) + self.kpu_target = os.getenv('KPU_TARGET') + self.port = os.getenv('PORT') + self.nncase_test_ci = os.getenv('NNCASE_TEST_CI') + + if self.in_ci and self.cfg.case.generate_inputs.name == 'generate_random' and self.kpu_target is not None and self.port is not None and self.nncase_test_ci is not None and (targets is None or self.kpu_target in targets): + new_targets = [] + new_targets.append(self.kpu_target) + else: + new_targets = targets + 
self.validate_targets(new_targets) self.inputs: List[Dict] = [] self.calibs: List[Dict] = [] @@ -316,7 +326,7 @@ def get_process_config(self, config): def data_pre_process(self, data): data = copy.deepcopy(data) - if self.pre_process[3]['input_type'] == "float32": + if self.pre_process[0]['preprocess'] and self.pre_process[3]['input_type'] == "float32": data = np.asarray(data, dtype=np.float32) if self.pre_process[0]['preprocess'] and len(data.shape) == 4: if self.pre_process[-1]['input_layout'] == 'NCHW': @@ -462,6 +472,14 @@ def import_model(self, compiler, model_content, import_options): def run_single(self, cfg, case_dir: str, model_file: Union[List[str], str]): if not self.inputs: self.parse_model_input_output(model_file) + + on_board = self.in_ci and self.kpu_target is not None and self.port is not None and self.nncase_test_ci is not None and len(self.inputs) > 0 and len(self.outputs) > 0 + if on_board and cfg.generate_inputs.name == 'generate_imagenet_dataset': + cfg.generate_inputs.batch_size = 1 + + if on_board and cfg.generate_calibs.name == 'generate_imagenet_dataset': + cfg.generate_calibs.batch_size = 1 + names, args = TestRunner.split_value(cfg.preprocess_opt) for combine_args in product(*args): dict_args = dict(zip(names, combine_args)) @@ -734,8 +752,13 @@ def nncase_infer(self, cfg, case_dir: str, if kwargs['ptq']: ptq_options = nncase.PTQTensorOptions() if cfg.generate_calibs.name == "generate_imagenet_dataset": - ptq_options.set_tensor_data(np.asarray( - [sample['data'] for sample in self.calibs]).tobytes()) + # ptq_options.set_tensor_data(np.asarray( + # [sample['data'] for sample in self.calibs]).tobytes()) + calib_len = len(self.calibs[0]['data']) + byte_inputs = np.asarray(self.calibs[0]['data'][0][0]).tobytes() + for i in range(1, len(self.calibs[0]['data'])): + byte_inputs += np.asarray(self.calibs[0]['data'][i][0]).tobytes() + ptq_options.set_tensor_data(byte_inputs) ptq_options.calibrate_method = self.cfg.case.compile_opt.quant_method 
else: raw_inputs = [self.transform_input(sample['data'], preprocess['input_type'], "infer") for sample in @@ -754,6 +777,10 @@ def nncase_infer(self, cfg, case_dir: str, f.write(kmodel) infer_output_paths: List[np.ndarray] = [] + + on_board = self.in_ci and kwargs['target'] == self.kpu_target and self.port is not None and self.nncase_test_ci is not None and len(self.inputs) > 0 and len(self.outputs) > 0 + case_name = f'{os.path.basename(case_dir)}_{os.path.basename(infer_dir)}' + if cfg.generate_inputs.name == "generate_imagenet_dataset": gnne_txt = "gnne_no_ptq" if kwargs['ptq'] is False else "gnne_ptq" infer_output_paths.append(( @@ -767,45 +794,157 @@ def nncase_infer(self, cfg, case_dir: str, result = [] for in_data in self.inputs[0]['data']: input_data = copy.deepcopy(in_data) - p.apply_async(sim_run, args=( - kmodel, input_data, infer_output_paths, kwargs['target'], self.model_type, - self.inputs[0]['model_shape'])) + if on_board: + on_board_run(kmodel, input_data, infer_output_paths, kwargs['target'], self.port, case_name, self.nncase_test_ci, len(self.inputs), len(self.outputs), self.model_type, + self.inputs[0]['model_shape']) + else: + p.apply_async(sim_run, args=( + kmodel, input_data, infer_output_paths, kwargs['target'], self.model_type, + self.inputs[0]['model_shape'])) p.close() p.join() else: - sim = nncase.Simulator() - sim.load_model(kmodel) - for i in range(len(self.inputs)): - data = self.transform_input( - self.inputs[i]['data'], preprocess['input_type'], "infer") - dtype = preprocess['input_type'] - if preprocess['preprocess']: - data.tofile(os.path.join(case_dir, f'input_{i}_{dtype}.bin')) - self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data) - - sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(data)) - sim.run() - - for i in range(sim.outputs_size): - result = sim.get_output_tensor(i).to_numpy() - if preprocess['preprocess'] and len(result.shape) == 4: - if (preprocess['output_layout'] == 'NHWC' and self.model_type 
in ['caffe', 'onnx']): - result = np.transpose(result, [0, 3, 1, 2]) - elif (preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']): - result = np.transpose(result, [0, 2, 3, 1]) - infer_output_paths.append(( - os.path.join(infer_dir, f'nncase_result_{i}.bin'), - os.path.join(infer_dir, f'nncase_result_{i}.txt'))) - if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq": - result.tofile(os.path.join( - infer_dir, f'nncase_result_{cfg.compile_opt.output_type}_{i}.bin')) - self.totxtfile(os.path.join( - infer_dir, f'nncase_result_{cfg.compile_opt.output_type}_{i}.txt'), result) - result = deq_output(os.path.join( - infer_dir, f'kmodel_info.txt'), result) - result.tofile(infer_output_paths[-1][0]) - self.totxtfile(infer_output_paths[-1][1], result) + if on_board: + # connect server + client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + client_socket.connect(('localhost', int(self.port))) + + # send header + header_dict = {} + header_dict['case'] = case_name + header_dict['app'] = 1 + header_dict['kmodel']= 1 + header_dict['inputs'] = len(self.inputs) + header_dict['outputs'] = len(self.outputs) + client_socket.sendall(json.dumps(header_dict).encode()) + dummy = client_socket.recv(1024) + + # send app + file_dict = {} + file_dict['file_name'] = os.path.basename(self.nncase_test_ci) + file_dict['file_size'] = os.path.getsize(self.nncase_test_ci) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + with open(self.nncase_test_ci, 'rb') as f: + client_socket.sendall(f.read()) + dummy = client_socket.recv(1024) + + # send kmodel + file_dict['file_name'] = 'test.kmodel' + file_dict['file_size'] = len(kmodel) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + client_socket.sendall(kmodel) + dummy = client_socket.recv(1024) + + # send inputs + for i in range(len(self.inputs)): + input_bin = os.path.join(case_dir, f'input_0_{i}.bin') + 
data = self.transform_input( + self.inputs[i]['data'], preprocess['input_type'], "infer") + dtype = preprocess['input_type'] + if preprocess['preprocess']: + input_bin = os.path.join(case_dir, f'input_{i}_{dtype}.bin') + data.tofile(input_bin) + self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data) + + file_dict['file_name'] = f'input_0_{i}.bin' + file_dict['file_size'] = os.path.getsize(input_bin) + client_socket.sendall(json.dumps(file_dict).encode()) + dummy = client_socket.recv(1024) + client_socket.sendall(data.tobytes()) + dummy = client_socket.recv(1024) + + # infer result + cmd_result = client_socket.recv(1024).decode() + if cmd_result.find('succeed') != -1: + client_socket.sendall(f"pls send outputs".encode()) + + # recv outputs + for i in range(len(self.outputs)): + header = client_socket.recv(1024) + file_size = int(header.decode()) + client_socket.sendall(f"pls send nncase_result_{i}.bin".encode()) + + recv_size = 0 + buffer = bytearray(file_size) + while recv_size < file_size: + slice = client_socket.recv(4096) + buffer[recv_size:] = slice + recv_size += len(slice) + + # save nncase_result + nncase_result = np.frombuffer(buffer, dtype=self.outputs[i]['dtype']) + nncase_result.tofile(os.path.join(infer_dir, f'nncase_result_{i}.bin')) + self.totxtfile(os.path.join(infer_dir, f'nncase_result_{i}.txt'), nncase_result) + + # save nncase_vs_cpu_result + model_shape = self.outputs[i]['model_shape'] + nncase_vs_cpu_result = nncase_result.reshape(model_shape) + if preprocess['preprocess'] and len(model_shape) == 4: + if (preprocess['output_layout'] == 'NHWC' and self.model_type in ['caffe', 'onnx']): + nncase_vs_cpu_result = nncase_result.reshape(model_shape[0], model_shape[2], model_shape[3], model_shape[1]) + nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 3, 1, 2]) + elif (preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']): + nncase_vs_cpu_result = nncase_result.reshape(model_shape[0], model_shape[3], 
model_shape[1], model_shape[2]) + nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 2, 3, 1]) + infer_output_paths.append(( + os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.bin'), + os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.txt'))) + if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq": + nncase_vs_cpu_result.tofile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.bin')) + self.totxtfile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.txt'), nncase_vs_cpu_result) + nncase_vs_cpu_result = deq_output(os.path.join(infer_dir, f'kmodel_info.txt'), nncase_vs_cpu_result) + nncase_vs_cpu_result.tofile(infer_output_paths[-1][0]) + self.totxtfile(infer_output_paths[-1][1], nncase_vs_cpu_result) + + client_socket.sendall(f"recv nncase_result_{i}.bin succeed".encode()) + + client_socket.close() + else: + client_socket.close() + raise Exception(f'{cmd_result}') + else: + # run in simulator + sim = nncase.Simulator() + sim.load_model(kmodel) + for i in range(len(self.inputs)): + data = self.transform_input( + self.inputs[i]['data'], preprocess['input_type'], "infer") + dtype = preprocess['input_type'] + if preprocess['preprocess']: + data.tofile(os.path.join(case_dir, f'input_{i}_{dtype}.bin')) + self.totxtfile(os.path.join(case_dir, f'input_{i}_{dtype}.txt'), data) + + sim.set_input_tensor(i, nncase.RuntimeTensor.from_numpy(data)) + sim.run() + + for i in range(sim.outputs_size): + nncase_result = sim.get_output_tensor(i).to_numpy() + + # save nncase_result + nncase_result.tofile(os.path.join(infer_dir, f'nncase_result_{i}.bin')) + self.totxtfile(os.path.join(infer_dir, f'nncase_result_{i}.txt'), nncase_result) + + # save nncase_vs_cpu_result + model_shape = self.outputs[i]['model_shape'] + nncase_vs_cpu_result = nncase_result + if preprocess['preprocess'] and len(model_shape) == 4: + if (preprocess['output_layout'] == 'NHWC' and self.model_type in ['caffe', 
'onnx']): + nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 3, 1, 2]) + elif (preprocess['output_layout'] == 'NCHW' and self.model_type in ['tflite']): + nncase_vs_cpu_result = np.transpose(nncase_vs_cpu_result, [0, 2, 3, 1]) + infer_output_paths.append(( + os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.bin'), + os.path.join(infer_dir, f'nncase_vs_cpu_result_{i}.txt'))) + if cfg.compile_opt.output_type != "float32" and infer_dir.split('/')[-1] == "ptq": + nncase_vs_cpu_result.tofile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.bin')) + self.totxtfile(os.path.join(infer_dir, f'nncase_vs_cpu_result_{cfg.compile_opt.output_type}_{i}.txt'), nncase_vs_cpu_result) + nncase_vs_cpu_result = deq_output(os.path.join(infer_dir, f'kmodel_info.txt'), nncase_vs_cpu_result) + nncase_vs_cpu_result.tofile(infer_output_paths[-1][0]) + self.totxtfile(infer_output_paths[-1][1], nncase_vs_cpu_result) + return infer_output_paths def on_test_start(self) -> None: diff --git a/toolchains/k230.baremetal.toolchain.cmake b/toolchains/k230.baremetal.toolchain.cmake index 5e8cde22c1..49163af48b 100644 --- a/toolchains/k230.baremetal.toolchain.cmake +++ b/toolchains/k230.baremetal.toolchain.cmake @@ -16,6 +16,10 @@ set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-elf-g++") set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-elf") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany -static") #-march=rv64imafdc_v0p7_zfh_zvamo0p7_zvlsseg0p7_xtheadc +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany -static") + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) @@ -26,4 +30,7 @@ set(ENABLE_HALIDE OFF) set(DEFAULT_BUILTIN_RUNTIMES OFF) set(BUILD_PYTHON_BINDING OFF) set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF) -set(BUILD_BENCHMARK OFF) \ No newline at end of file 
+set(BUILD_BENCHMARK OFF) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d -mcmodel=medany -mtune=c908") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d -mcmodel=medany -mtune=c908") \ No newline at end of file diff --git a/toolchains/k230.linux.toolchain.cmake b/toolchains/k230.linux.toolchain.cmake index 730072c6e1..92ccf2aece 100644 --- a/toolchains/k230.linux.toolchain.cmake +++ b/toolchains/k230.linux.toolchain.cmake @@ -10,20 +10,18 @@ if(NOT RISCV_ROOT_PATH) endif() set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") - -set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") - -set(CMAKE_C_FLAGS "-march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906") -set(CMAKE_CXX_FLAGS "-march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906") - -set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") +set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-musl-gcc") +set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-musl-g++") +set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-musl") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(ENABLE_VULKAN_RUNTIME OFF) set(ENABLE_HALIDE OFF) -# set(DEFAULT_BUILTIN_RUNTIMES OFF) -# set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF) +set(DEFAULT_BUILTIN_RUNTIMES OFF) +set(DEFAULT_SHARED_RUNTIME_TENSOR_PLATFORM_IMPL OFF) set(BUILD_BENCHMARK OFF) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv -mabi=lp64d -mcmodel=medany") diff --git a/toolchains/x86_64.toolchain.cmake b/toolchains/x86_64.toolchain.cmake new file mode 100644 
index 0000000000..6808c70c84 --- /dev/null +++ b/toolchains/x86_64.toolchain.cmake @@ -0,0 +1,11 @@ + +if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Windows") + add_definitions(/DX86_64_SIMD_ON) + add_compile_options(/arch:AVX) + add_compile_options(/arch:AVX2) +elseif (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") + add_definitions(-DX86_64_SIMD_ON) + add_compile_options( -mfma -msse -msse2 -msse3 -mssse3 -msse4 -msse4a -msse4.1 -msse4.2 -mavx -mavx2) +else() + message("current platform: other ... ") +endif() diff --git a/tools/stackvm_gen/IsaGen/Instructions.cs b/tools/stackvm_gen/IsaGen/Instructions.cs index ee4c76bc12..1ce60f6596 100644 --- a/tools/stackvm_gen/IsaGen/Instructions.cs +++ b/tools/stackvm_gen/IsaGen/Instructions.cs @@ -8,2248 +8,2456 @@ namespace IsaGen { - [System.AttributeUsage(AttributeTargets.Enum, Inherited = false, AllowMultiple = false)] - public sealed class BitLengthAttribute : Attribute - { - public uint BitLength { get; } - - public BitLengthAttribute(uint bitLength) - { - BitLength = bitLength; - } - } - - [System.AttributeUsage(AttributeTargets.All, Inherited = false, AllowMultiple = false)] - public sealed class EnumNameAttribute : Attribute - { - public string Name { get; } - - public EnumNameAttribute(string name) - { - Name = name; - } - } - - [BitLength(8)] - [EnumName("opcode_t")] - public enum OpCode - { - NOP, - LDNULL, - LDC_I4, - LDC_I4_0, - LDC_I4_1, - LDC_R4, - LDIND_I1, - LDIND_I2, - LDIND_I4, - LDIND_I, - LDIND_U1, - LDIND_U2, - LDIND_U4, - LDIND_U, - LDIND_BR2, - LDIND_R4, - STIND_I1, - STIND_I2, - STIND_I4, - STIND_I, - STIND_BR2, - STIND_R4, - LEA_GP, - LEA_BUFFER, - - LDELEM_I1, - LDELEM_I2, - LDELEM_I4, - LDELEM_I, - LDELEM_U1, - LDELEM_U2, - LDELEM_U4, - LDELEM_U, - LDELEM_BR2, - LDELEM_R4, - STELEM_I1, - STELEM_I2, - STELEM_I4, - STELEM_I, - STELEM_BR2, - STELEM_R4, - - LDARG, - LDARG_0, - LDARG_1, - LDARG_2, - LDARG_3, - LDARG_4, - LDARG_5, - - DUP, - POP, - - STSHAPE, - STPADDINGS, - - NEG, - ADD, - SUB, - MUL, - DIV, - 
DIV_U, - REM, - REM_U, - AND, - OR, - XOR, - NOT, - SHL, - SHR, - SHR_U, - - CLT, - CLT_U, - CLE, - CLE_U, - CEQ, - CGE, - CGE_U, - CGT, - CGT_U, - CNE, - - CONV_I1, - CONV_I2, - CONV_I4, - CONV_I, - CONV_U1, - CONV_U2, - CONV_U4, - CONV_U, - CONV_BR2, - CONV_R4, - - BR, - BR_TRUE, - BR_FALSE, - RET, - CALL, - ECALL, - THROW, - BREAK, - - TENSOR, - } - - [BitLength(16)] - [EnumName("tensor_function_t")] - public enum TensorFunction - { - BATCH_TO_SPACE, - BINARY, - BROADCAST, - CALL, - COMPARE, - CLAMP, - CONV2D, - CONV2D_TRANSPOSE, - CONVERT, - COPY, - CUMSUM, - DEQUANTIZE, - GATHER, - GATHER_ND, - HARDMAX, - LOGISTIC, - LUT1D, - MATMUL, - ONEHOT, - PAD, - QUANTIZE, - RANDOM_NORMAL, - RANDOM_UNIFORM, - REDUCE, - REDUCE_ARG, - REDUCE_PROD, - REDUCE_WINDOW2D, - RESIZE_IMAGE, - ROI_ALIGN, - SIGMOID, - SLICE, - SOFTMAX, - SPACE_TO_BATCH, - TAKE, - TERNARY, - TOPK, - TRANSPOSE, - TRILU, - UNARY, - } - - [BitLength(8)] - [EnumName("datatype_t")] - [Browsable(false)] - public enum DataType - { - } - - [BitLength(8)] - [EnumName("onehot_mode_t")] - [Browsable(false)] - public enum OneHotMode - { - } - - [BitLength(8)] - [EnumName("pad_mode_t")] - [Browsable(false)] - public enum PadMode - { - } - - [BitLength(8)] - [EnumName("memory_location_t")] - [Browsable(false)] - public enum MemoryLocation - { - } - - [BitLength(8)] - [EnumName("reduce_op_t")] - [Browsable(false)] - public enum ReduceOp - { - } - - [BitLength(8)] - [EnumName("reduce_arg_op_t")] - [Browsable(false)] - public enum ReduceArgOp - { - } - - [BitLength(8)] - [EnumName("image_resize_mode_t")] - [Browsable(false)] - public enum ImageResizeMode - { - } - - [BitLength(8)] - [EnumName("binary_op_t")] - [Browsable(false)] - public enum BinaryOp - { - } - - [BitLength(8)] - [EnumName("unary_op_t")] - [Browsable(false)] - public enum UnaryOp - { - } - - [BitLength(8)] - [EnumName("compare_op_t")] - [Browsable(false)] - public enum CompareOp - { - } - - [BitLength(8)] - [EnumName("roi_align_mode_t")] - 
[Browsable(false)] - public enum RoiAlignMode - { - } - - public abstract class Instruction - { - [DisplayName("opcode")] - [Description("OpCode")] - public abstract OpCode OpCode { get; } - } - - [DisplayName("NOP")] - [Category("Control and Status Instructions")] - [Description("No operation")] - public class NopInstruction : Instruction - { - public override OpCode OpCode => OpCode.NOP; - } - - [DisplayName("LDC_I4")] - [Category("Immediate Instructions")] - [Description("Load immedidate I4 to stack")] - public class LdcI4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDC_I4; - - [DisplayName("imm")] - [Description("Immedidate I4")] - public int Imm { get; set; } - } - - [DisplayName("LDNULL")] - [Category("Immediate Instructions")] - [Description("Load immedidate nullptr as I to stack")] - public class LdNullInstruction : Instruction - { - public override OpCode OpCode => OpCode.LDNULL; - } - - [DisplayName("LDC_I4_0")] - [Category("Immediate Instructions")] - [Description("Load immedidate 0 as I4 to stack")] - public class LdcI4_0Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDC_I4_0; - } - - [DisplayName("LDC_I4_1")] - [Category("Immediate Instructions")] - [Description("Load immedidate 1 as I4 to stack")] - public class LdcI4_1Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDC_I4_1; - } - - [DisplayName("LDC_R4")] - [Category("Immediate Instructions")] - [Description("Load immedidate R4 to stack")] - public class LdcR4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDC_R4; - - [DisplayName("imm")] - [Description("Immedidate R4")] - public float Imm { get; set; } - } - - [Category("Load Store Instructions")] - public abstract class LdStindInstruction : Instruction - { - } - - [DisplayName("LDIND_I1")] - [Description("Load indirect I1 to stack")] - public class LdindI1Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_I1; - } - 
- [DisplayName("LDIND_I2")] - [Description("Load indirect I2 to stack")] - public class LdindI2Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_I2; - } - - [DisplayName("LDIND_I4")] - [Description("Load indirect I4 to stack")] - public class LdindI4Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_I4; - } - - [DisplayName("LDIND_I")] - [Description("Load indirect I to stack")] - public class LdindIInstruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_I; - } - - [DisplayName("LDIND_U1")] - [Description("Load indirect U1 to stack")] - public class LdindU1Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_U1; - } - - [DisplayName("LDIND_U2")] - [Description("Load indirect U2 to stack")] - public class LdindU2Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_U2; - } - - [DisplayName("LDIND_U4")] - [Description("Load indirect U4 to stack")] - public class LdindU4Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_U4; - } - - [DisplayName("LDIND_U")] - [Description("Load indirect U to stack")] - public class LdindUInstruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_U; - } - - [DisplayName("LDIND_BR2")] - [Description("Load indirect BR2 to stack")] - public class LdindBR2Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_BR2; - } - - [DisplayName("LDIND_R4")] - [Description("Load indirect R4 to stack")] - public class LdindR4Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.LDIND_R4; - } - - [DisplayName("STIND_I1")] - [Description("Store indirect I1 from stack")] - public class StindI1Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_I1; - } - - [DisplayName("STIND_I2")] - [Description("Store indirect I2 from stack")] - public class 
StindI2Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_I2; - } - - [DisplayName("STIND_I4")] - [Description("Store indirect I4 from stack")] - public class StindI4Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_I4; - } - - [DisplayName("STIND_I")] - [Description("Store indirect I from stack")] - public class StindIInstruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_I; - } - - [DisplayName("STIND_BR2")] - [Description("Store indirect BR2 from stack")] - public class StindBR2Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_BR2; - } - - [DisplayName("STIND_R4")] - [Description("Store indirect R4 from stack")] - public class StindR4Instruction : LdStindInstruction - { - public override OpCode OpCode => OpCode.STIND_R4; - } - - [DisplayName("LEA_GP")] - [Category("Load Store Instructions")] - [Description("Load a global pointer with offset to stack")] - public class LeaGPInstruction : Instruction - { - public override OpCode OpCode => OpCode.LEA_GP; - - [DisplayName("gpid")] - [Description("Global pointer id")] - public byte GpId { get; set; } - - [DisplayName("offset")] - [Description("Signed immediate offset")] - public int Offset { get; set; } - } - - [DisplayName("LEA_BUFFER")] - [Category("Load Store Instructions")] - [Description("Load a buffer pointer with offset to stack")] - public class LeaBufferInstruction : Instruction - { - public override OpCode OpCode => OpCode.LEA_BUFFER; - - [DisplayName("location")] - [Description("Location")] - public MemoryLocation Location { get; set; } - - [DisplayName("subres_id")] - [Description("SubresourceId")] - public byte SubresourceId { get; set; } - - [DisplayName("offset")] - [Description("Unsigned immediate offset")] - public uint Offset { get; set; } - } - - [DisplayName("LDELEM_I1")] - [Category("Load Store Instructions")] - [Description("Load an array element of I1 to stack")] 
- public class LdelemI1Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_I1; - } - - [DisplayName("LDELEM_I2")] - [Category("Load Store Instructions")] - [Description("Load an array element of I2 to stack")] - public class LdelemI2Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_I2; - } - - [DisplayName("LDELEM_I4")] - [Category("Load Store Instructions")] - [Description("Load an array element of I4 to stack")] - public class LdelemI4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_I4; - } - - [DisplayName("LDELEM_I")] - [Category("Load Store Instructions")] - [Description("Load an array element of I to stack")] - public class LdelemIInstruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_I; - } - - [DisplayName("LDELEM_U1")] - [Category("Load Store Instructions")] - [Description("Load an array element of U1 to stack")] - public class LdelemU1Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_U1; - } - - [DisplayName("LDELEM_U2")] - [Category("Load Store Instructions")] - [Description("Load an array element of U2 to stack")] - public class LdelemU2Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_U2; - } - - [DisplayName("LDELEM_U4")] - [Category("Load Store Instructions")] - [Description("Load an array element of U4 to stack")] - public class LdelemU4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_U4; - } - - [DisplayName("LDELEM_U")] - [Category("Load Store Instructions")] - [Description("Load an array element of U to stack")] - public class LdelemUInstruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_U; - } - - [DisplayName("LDELEM_BR2")] - [Category("Load Store Instructions")] - [Description("Load an array element of BR2 to stack")] - public class LdelemBR2Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_BR2; - 
} - - [DisplayName("LDELEM_R4")] - [Category("Load Store Instructions")] - [Description("Load an array element of R4 to stack")] - public class LdelemR4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDELEM_R4; - } - - [DisplayName("STELEM_I1")] - [Category("Load Store Instructions")] - [Description("Store an array element of I1 from stack")] - public class StelemI1Instruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_I1; - } - - [DisplayName("STELEM_I2")] - [Category("Load Store Instructions")] - [Description("Store an array element of I2 from stack")] - public class StelemI2Instruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_I2; - } - - [DisplayName("STELEM_I4")] - [Category("Load Store Instructions")] - [Description("Store an array element of I4 from stack")] - public class StelemI4Instruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_I4; - } - - [DisplayName("STELEM_I")] - [Category("Load Store Instructions")] - [Description("Store an array element of I from stack")] - public class StelemIInstruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_I; - } - - [DisplayName("STELEM_BR2")] - [Category("Load Store Instructions")] - [Description("Store an array element of BR2 from stack")] - public class StelemBR2Instruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_BR2; - } - - [DisplayName("STELEM_R4")] - [Category("Load Store Instructions")] - [Description("Store an array element of R4 from stack")] - public class StelemR4Instruction : Instruction - { - public override OpCode OpCode => OpCode.STELEM_R4; - } - - [DisplayName("LDARG")] - [Category("Load Store Instructions")] - [Description("Load an argument to stack")] - public class LdargInstruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG; - - [DisplayName("index")] - [Description("Argument index")] - public uint Index { get; set; } - } - - 
[DisplayName("LDARG_0")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 0 to stack")] - public class Ldarg0Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_0; - } - - [DisplayName("LDARG_1")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 1 to stack")] - public class Ldarg1Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_1; - } - - [DisplayName("LDARG_2")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 2 to stack")] - public class Ldarg2Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_2; - } - - [DisplayName("LDARG_3")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 1 to stack")] - public class Ldarg3Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_3; - } - - [DisplayName("LDARG_4")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 4 to stack")] - public class Ldarg4Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_4; - } - - [DisplayName("LDARG_5")] - [Category("Load Store Instructions")] - [Description("Load an argument with index of 5 to stack")] - public class Ldarg5Instruction : Instruction - { - public override OpCode OpCode => OpCode.LDARG_5; - } - - [DisplayName("STSHAPE")] - [Category("Load Store Instructions")] - [Description("Store a shape from stack")] - public class StShapeInstruction : Instruction - { - public override OpCode OpCode => OpCode.STSHAPE; - - [DisplayName("rshape")] - [Description("Shape register index")] - public byte Rshape { get; set; } - - [DisplayName("rank")] - [Description("Shape's rank")] - public byte Rank { get; set; } - } - - [DisplayName("STPADDINGS")] - [Category("Load Store Instructions")] - [Description("Store paddings from stack")] - public class StPaddingsInstruction : Instruction - { - 
public override OpCode OpCode => OpCode.STPADDINGS; - - [DisplayName("rpaddings")] - [Description("Paddings register index")] - public byte Rpaddings { get; set; } - - [DisplayName("rank")] - [Description("Paddings' rank")] - public byte Rank { get; set; } - } - - [DisplayName("DUP")] - [Category("Stack Instructions")] - [Description("Duplicate the top item of stack")] - public class DupInstruction : Instruction - { - public override OpCode OpCode => OpCode.DUP; - } - - [DisplayName("POP")] - [Category("Stack Instructions")] - [Description("Pop the top item of stack")] - public class PopInstruction : Instruction - { - public override OpCode OpCode => OpCode.POP; - } - - [DisplayName("NEG")] - [Category("Computational Instructions")] - [Description("Negates a value and pushes the result onto the evaluation stack")] - public class NegInstruction : Instruction - { - public override OpCode OpCode => OpCode.NEG; - } - - [DisplayName("ADD")] - [Category("Computational Instructions")] - [Description("Adds two values and pushes the result onto the evaluation stack")] - public class AddInstruction : Instruction - { - public override OpCode OpCode => OpCode.ADD; - } - - [DisplayName("SUB")] - [Category("Computational Instructions")] - [Description("Subtracts one value from another and pushes the result onto the evaluation stack")] - public class SubInstruction : Instruction - { - public override OpCode OpCode => OpCode.SUB; - } - - [DisplayName("MUL")] - [Category("Computational Instructions")] - [Description("Multiplies two values and pushes the result on the evaluation stack")] - public class MulInstruction : Instruction - { - public override OpCode OpCode => OpCode.MUL; - } - - [DisplayName("DIV")] - [Category("Computational Instructions")] - [Description("Divides two values and pushes the result as a floating-point (type F) or quotient (type int32) onto the evaluation stack")] - public class DivInstruction : Instruction - { - public override OpCode OpCode => OpCode.DIV; 
- } - - [DisplayName("DIV_U")] - [Category("Computational Instructions")] - [Description("Divides two unsigned integer values and pushes the result (int32) onto the evaluation stack")] - public class DivUInstruction : Instruction - { - public override OpCode OpCode => OpCode.DIV_U; - } - - [DisplayName("REM")] - [Category("Computational Instructions")] - [Description("Divides two values and pushes the remainder onto the evaluation stack")] - public class RemInstruction : Instruction - { - public override OpCode OpCode => OpCode.REM; - } - - [DisplayName("REM_U")] - [Category("Computational Instructions")] - [Description("Divides two unsigned values and pushes the remainder onto the evaluation stack")] - public class RemUInstruction : Instruction - { - public override OpCode OpCode => OpCode.REM_U; - } - - [DisplayName("AND")] - [Category("Computational Instructions")] - [Description("Computes the bitwise AND of two values and pushes the result onto the evaluation stack")] - public class AndInstruction : Instruction - { - public override OpCode OpCode => OpCode.AND; - } - - [DisplayName("OR")] - [Category("Computational Instructions")] - [Description("Compute the bitwise complement of the two integer values on top of the stack and pushes the result onto the evaluation stack")] - public class OrInstruction : Instruction - { - public override OpCode OpCode => OpCode.OR; - } - - [DisplayName("XOR")] - [Category("Computational Instructions")] - [Description("Computes the bitwise XOR of the top two values on the evaluation stack, pushing the result onto the evaluation stack")] - public class XorInstruction : Instruction - { - public override OpCode OpCode => OpCode.XOR; - } - - [DisplayName("NOT")] - [Category("Computational Instructions")] - [Description("Computes the bitwise complement of the integer value on top of the stack and pushes the result onto the evaluation stack as the same type")] - public class NotInstruction : Instruction - { - public override OpCode 
OpCode => OpCode.NOT; - } - - [DisplayName("SHL")] - [Category("Computational Instructions")] - [Description("Shifts an integer value to the left (in zeroes) by a specified number of bits, pushing the result onto the evaluation stack")] - public class ShlInstruction : Instruction - { - public override OpCode OpCode => OpCode.SHL; - } - - [DisplayName("SHR")] - [Category("Computational Instructions")] - [Description("Shifts an integer value (in sign) to the right by a specified number of bits, pushing the result onto the evaluation stack")] - public class ShrInstruction : Instruction - { - public override OpCode OpCode => OpCode.SHR; - } - - [DisplayName("SHR_U")] - [Category("Computational Instructions")] - [Description("Shifts an unsigned integer value (in zeroes) to the right by a specified number of bits, pushing the result onto the evaluation stack")] - public class ShrUInstruction : Instruction - { - public override OpCode OpCode => OpCode.SHR_U; - } - - [DisplayName("CLT")] - [Category("Computational Instructions")] - [Description("Compares two values. If the first value is less than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CltInstruction : Instruction - { - public override OpCode OpCode => OpCode.CLT; - } - - [DisplayName("CLT_U")] - [Category("Computational Instructions")] - [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CltUInstruction : Instruction - { - public override OpCode OpCode => OpCode.CLT_U; - } - - [DisplayName("CLE")] - [Category("Computational Instructions")] - [Description("Compares two values. 
If the first value is less than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CleInstruction : Instruction - { - public override OpCode OpCode => OpCode.CLE; - } - - [DisplayName("CLE_U")] - [Category("Computational Instructions")] - [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CleUInstruction : Instruction - { - public override OpCode OpCode => OpCode.CLE_U; - } - - [DisplayName("CEQ")] - [Category("Computational Instructions")] - [Description("Compares two values. If they are equal, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CeqInstruction : Instruction - { - public override OpCode OpCode => OpCode.CEQ; - } - - [DisplayName("CGE")] - [Category("Computational Instructions")] - [Description("Compares two values. If the first value is greater than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CgeInstruction : Instruction - { - public override OpCode OpCode => OpCode.CGE; - } - - [DisplayName("CGE_U")] - [Category("Computational Instructions")] - [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CgeUInstruction : Instruction - { - public override OpCode OpCode => OpCode.CGE_U; - } - - [DisplayName("CGT")] - [Category("Computational Instructions")] - [Description("Compares two values. 
If the first value is greater than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CgtInstruction : Instruction - { - public override OpCode OpCode => OpCode.CGT; - } - - [DisplayName("CGT_U")] - [Category("Computational Instructions")] - [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CgtUInstruction : Instruction - { - public override OpCode OpCode => OpCode.CGT_U; - } - - [DisplayName("CNE")] - [Category("Computational Instructions")] - [Description("Compares two values. If the first value is not equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] - public class CneInstruction : Instruction - { - public override OpCode OpCode => OpCode.CNE; - } - - [DisplayName("CONV_I1")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to int8, and extends it to int32")] - public class ConvI1Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_I1; - } - - [DisplayName("CONV_I2")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to int16, and extends it to int32")] - public class ConvI2Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_I2; - } - - [DisplayName("CONV_I4")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to int32, and extends it to int32")] - public class ConvI4Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_I4; - } - - [DisplayName("CONV_I")] - [Category("Conversion Instructions")] - [Description("Converts the value on top 
of the evaluation stack to native int, and extends it to int32")] - public class ConvIInstruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_I; - } - - [DisplayName("CONV_U1")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to unsigned int8, and extends it to int32")] - public class ConvU1Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_U1; - } - - [DisplayName("CONV_U2")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to unsigned int16, and extends it to int32")] - public class ConvU2Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_U2; - } - - [DisplayName("CONV_U4")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to unsigned int32, and extends it to int32")] - public class ConvU4Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_U4; - } - - [DisplayName("CONV_U")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to unsigned native int, and extends it to int32")] - public class ConvUInstruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_U; - } - - [DisplayName("CONV_BR2")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to bfloat16")] - public class ConvBR2Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_BR2; - } - - [DisplayName("CONV_R4")] - [Category("Conversion Instructions")] - [Description("Converts the value on top of the evaluation stack to float32")] - public class ConvR4Instruction : Instruction - { - public override OpCode OpCode => OpCode.CONV_R4; - } - - [DisplayName("BR")] - [Category("Control and Status Instructions")] - [Description("Unconditionally transfers control to a target instruction")] - public class 
BrInstruction : Instruction - { - public override OpCode OpCode => OpCode.BR; - - [DisplayName("target")] - [Description("Branches to a target instruction at the specified offset")] - public int Target { get; set; } - } - - [DisplayName("BR_TRUE")] - [Category("Control and Status Instructions")] - [Description("Transfers control to a target instruction if value is true, not null, or non-zero")] - public class BrTrueInstruction : Instruction - { - public override OpCode OpCode => OpCode.BR_TRUE; - - [DisplayName("target")] - [Description("Branches to a target instruction at the specified offset")] - public int Target { get; set; } - } - - [DisplayName("BR_FALSE")] - [Category("Control and Status Instructions")] - [Description("Transfers control to a target instruction if value is false, null, or zero")] - public class BrFalseInstruction : Instruction - { - public override OpCode OpCode => OpCode.BR_FALSE; - - [DisplayName("target")] - [Description("Branches to a target instruction at the specified offset")] - public int Target { get; set; } - } - - [DisplayName("RET")] - [Category("Control and Status Instructions")] - [Description("Return")] - public class RetInstruction : Instruction - { - public override OpCode OpCode => OpCode.RET; - } - - [DisplayName("CALL")] - [Category("Control and Status Instructions")] - [Description("Call a target method")] - public class CallInstruction : Instruction - { - public override OpCode OpCode => OpCode.CALL; - - [DisplayName("args")] - [Description("Arguments count")] - public byte ArgsCount { get; set; } - - [DisplayName("target")] - [Description("Call a target method at the specified offset")] - public int Target { get; set; } - } - - [DisplayName("ECALL")] - [Category("Control and Status Instructions")] - [Description("Call a environment method")] - public class ECallInstruction : Instruction - { - public override OpCode OpCode => OpCode.ECALL; - - [DisplayName("args")] - [Description("Arguments count")] - public byte 
ArgsCount { get; set; } - } - - [DisplayName("THROW")] - [Category("Control and Status Instructions")] - [Description("Throw a error code currently on the evaluation stack")] - public class ThrowInstruction : Instruction - { - public override OpCode OpCode => OpCode.THROW; - } - - [DisplayName("BREAK")] - [Category("Control and Status Instructions")] - [Description("Inform the debugger that a break point has been tripped")] - public class BreakInstruction : Instruction - { - public override OpCode OpCode => OpCode.BREAK; - } - - public static class TensorCalls - { - public abstract class TensorInstruction : Instruction - { - public sealed override OpCode OpCode => OpCode.TENSOR; - - [DisplayName("funct")] - [Description("Tensor call function")] - public abstract TensorFunction Function { get; } - } - - [DisplayName("TENSOR.BATCH_TO_SPACE")] - [Category("Tensor Instructions")] - [Description("BatchToSpace")] - public class BatchToSpaceInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.BATCH_TO_SPACE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rshape_block")] - [Description("Block shape register")] - public byte RshapeBlock { get; set; } - - [DisplayName("rpad_crops")] - [Description("Crops paddings register")] - public byte RpadCrops { get; set; } - } - - [DisplayName("TENSOR.BROADCAST")] - [Category("Tensor Instructions")] - [Description("Broadcast")] - public class BroadcastInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.BROADCAST; - - [DisplayName("datatype")] - 
[Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.BINARY")] - [Category("Tensor Instructions")] - [Description("Binary")] - public class BinaryInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.BINARY; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src1")] - [Description("Source1 shape register")] - public byte RshapeSrc1 { get; set; } - - [DisplayName("rstride_src1")] - [Description("Source1 stride register")] - public byte RstrideSrc1 { get; set; } - - [DisplayName("rshape_src2")] - [Description("Source2 shape register")] - public byte RshapeSrc2 { get; set; } - - [DisplayName("rstride_src2")] - [Description("Source2 stride register")] - public byte RstrideSrc2 { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("binary_op")] - [Description("Binary operator")] - public BinaryOp BinaryOp { get; set; } - - [DisplayName("fused_clamp_low")] - [Description("FusedClampLow")] - public float FusedClampLow { get; set; } - - [DisplayName("fused_clamp_high")] - [Description("FusedClampHigh")] - public float FusedClampHigh { get; set; } - } - - [DisplayName("TENSOR.CALL")] - [Category("Tensor Instructions")] - [Description("Call")] - public class CallInstruction : 
TensorInstruction - { - public override TensorFunction Function => TensorFunction.CALL; - - [DisplayName("function_id")] - [Description("Function Id")] - public uint FunctionId { get; set; } - - [DisplayName("module_id")] - [Description("Module Id")] - public ushort ModuleId { get; set; } - - [DisplayName("num_src")] - [Description("Source count")] - public byte SrcCount { get; set; } - - [DisplayName("num_dst")] - [Description("Dest count")] - public byte DstCount { get; set; } - } - - [DisplayName("TENSOR.COMPARE")] - [Category("Tensor Instructions")] - [Description("Compare")] - public class CompareInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.COMPARE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src1")] - [Description("Source1 shape register")] - public byte RshapeSrc1 { get; set; } - - [DisplayName("rstride_src1")] - [Description("Source1 stride register")] - public byte RstrideSrc1 { get; set; } - - [DisplayName("rshape_src2")] - [Description("Source2 shape register")] - public byte RshapeSrc2 { get; set; } - - [DisplayName("rstride_src2")] - [Description("Source2 stride register")] - public byte RstrideSrc2 { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("compare_op")] - [Description("Compare operator")] - public CompareOp CompareOp { get; set; } - } - [DisplayName("TENSOR.CONV2D")] - [Category("Tensor Instructions")] - [Description("Conv2D")] - public class Conv2DInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.CONV2D; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape 
register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rshape_kernel")] - [Description("Kernel shape register")] - public byte RshapeKernel { get; set; } - - [DisplayName("rstride_kernel")] - [Description("Kernel stride register")] - public byte RstrideKernel { get; set; } - - [DisplayName("rstride_bias")] - [Description("Bias stride register")] - public byte RstrideBias { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("groups")] - [Description("Groups")] - public ushort Groups { get; set; } - - [DisplayName("stride_h")] - [Description("StrideH")] - public ushort StrideH { get; set; } - - [DisplayName("stride_w")] - [Description("StrideW")] - public ushort StrideW { get; set; } - - [DisplayName("dilation_h")] - [Description("DilationH")] - public ushort DilationH { get; set; } - - [DisplayName("dilation_w")] - [Description("DilationW")] - public ushort DilationW { get; set; } - - [DisplayName("fused_clamp_low")] - [Description("FusedClampLow")] - public float FusedClampLow { get; set; } - - [DisplayName("fused_clamp_high")] - [Description("FusedClampHigh")] - public float FusedClampHigh { get; set; } - } - - [DisplayName("TENSOR.COPY")] - [Category("Tensor Instructions")] - [Description("Copy")] - public class CopyInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.COPY; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape")] - [Description("Shape register")] - public byte Rshape { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - 
[DisplayName("TENSOR.CONVERT")] - [Category("Tensor Instructions")] - [Description("Convert")] - public class ConvertInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.CONVERT; - - [DisplayName("in_datatype")] - [Description("Source Datatype")] - public DataType SrcDataType { get; set; } - - [DisplayName("dst_datatype")] - [Description("Dest Datatype")] - public DataType DestDataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source1 shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.CUMSUM")] - [Category("Tensor Instructions")] - [Description("CumSum")] - public class CumSumInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.CUMSUM; - - [DisplayName("datatype")] - [Description("Input/Output datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("axis")] - [Description("Axis")] - public int Axis { get; set; } - - [DisplayName("exclusive")] - [Description("Exclusive")] - public bool Exclusive { get; set; } - - [DisplayName("reverse")] - [Description("Reverse")] - public bool Reverse { get; set; } - } - - [DisplayName("TENSOR.DEQUANTIZE")] - [Category("Tensor Instructions")] - [Description("Dequantize")] - public class DequantizeInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.DEQUANTIZE; - - [DisplayName("in_datatype")] - [Description("Source Datatype")] - public DataType SrcDataType { get; set; } - - [DisplayName("dst_datatype")] - [Description("Dest Datatype")] - public DataType DestDataType { get; set; } - - 
[DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.GATHER")] - [Category("Tensor Instructions")] - [Description("Gather")] - public class GatherInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.GATHER; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rshape_indices")] - [Description("Indices shape register")] - public byte RshapeIndices { get; set; } - - [DisplayName("axis")] - [Description("Axis")] - public byte Axis { get; set; } - } - - [DisplayName("TENSOR.GATHER_ND")] - [Category("Tensor Instructions")] - [Description("GatherND")] - public class GatherNDInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.GATHER_ND; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - 
[DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rshape_indices")] - [Description("Indices shape register")] - public byte RshapeIndices { get; set; } - - [DisplayName("batch_dims")] - [Description("Batch Dims")] - public byte Batchdims { get; set; } - } - - [DisplayName("TENSOR.HARDMAX")] - [Category("Tensor Instructions")] - [Description("Hardmax")] - public class HardmaxInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.HARDMAX; - - [DisplayName("datatype")] - [Description("Input/Output datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("axis")] - [Description("Axis")] - public int Axis { get; set; } - } - - [DisplayName("TENSOR.LUT1D")] - [Category("Tensor Instructions")] - [Description("Lut1D")] - public class LUT1DInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.LUT1D; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("table_len")] - [Description("Table length")] - public ushort TableLength { get; set; } - } - - [DisplayName("TENSOR.MATMUL")] + [System.AttributeUsage(AttributeTargets.Enum, Inherited = false, AllowMultiple = false)] + public sealed class BitLengthAttribute : Attribute + { + public uint BitLength { get; } + + public BitLengthAttribute(uint 
bitLength) + { + BitLength = bitLength; + } + } + + [System.AttributeUsage(AttributeTargets.All, Inherited = false, AllowMultiple = false)] + public sealed class EnumNameAttribute : Attribute + { + public string Name { get; } + + public EnumNameAttribute(string name) + { + Name = name; + } + } + + [BitLength(8)] + [EnumName("opcode_t")] + public enum OpCode + { + NOP, + LDNULL, + LDC_I4, + LDC_I4_0, + LDC_I4_1, + LDC_R4, + LDIND_I1, + LDIND_I2, + LDIND_I4, + LDIND_I, + LDIND_U1, + LDIND_U2, + LDIND_U4, + LDIND_U, + LDIND_BR2, + LDIND_R4, + STIND_I1, + STIND_I2, + STIND_I4, + STIND_I, + STIND_BR2, + STIND_R4, + LEA_GP, + LEA_BUFFER, + + LDELEM_I1, + LDELEM_I2, + LDELEM_I4, + LDELEM_I, + LDELEM_U1, + LDELEM_U2, + LDELEM_U4, + LDELEM_U, + LDELEM_BR2, + LDELEM_R4, + STELEM_I1, + STELEM_I2, + STELEM_I4, + STELEM_I, + STELEM_BR2, + STELEM_R4, + + LDARG, + LDARG_0, + LDARG_1, + LDARG_2, + LDARG_3, + LDARG_4, + LDARG_5, + + DUP, + POP, + + STSHAPE, + STPADDINGS, + + NEG, + ADD, + SUB, + MUL, + DIV, + DIV_U, + REM, + REM_U, + AND, + OR, + XOR, + NOT, + SHL, + SHR, + SHR_U, + + CLT, + CLT_U, + CLE, + CLE_U, + CEQ, + CGE, + CGE_U, + CGT, + CGT_U, + CNE, + + CONV_I1, + CONV_I2, + CONV_I4, + CONV_I, + CONV_U1, + CONV_U2, + CONV_U4, + CONV_U, + CONV_BR2, + CONV_R4, + + BR, + BR_TRUE, + BR_FALSE, + RET, + CALL, + ECALL, + THROW, + BREAK, + + TENSOR, + } + + [BitLength(16)] + [EnumName("tensor_function_t")] + public enum TensorFunction + { + BATCH_TO_SPACE, + BINARY, + BROADCAST, + CALL, + COMPARE, + CLAMP, + CONV2D, + CONV2D_TRANSPOSE, + CONVERT, + COPY, + CUMSUM, + DEQUANTIZE, + GATHER, + GATHER_ND, + HARDMAX, + LOGISTIC, + LUT1D, + MATMUL, + ONEHOT, + PAD, + QUANTIZE, + RANDOM_NORMAL, + RANDOM_UNIFORM, + REDUCE, + REDUCE_ARG, + REDUCE_PROD, + REDUCE_WINDOW2D, + RESIZE_IMAGE, + ROI_ALIGN, + SIGMOID, + SLICE, + SOFTMAX, + SPACE_TO_BATCH, + TAKE, + TERNARY, + TOPK, + TRANSPOSE, + TRILU, + UNARY, + GRU, + TFLITE_DETECTION_POSTPROCESS, + LAYER_NORMALIZATION, + COMPRESS, + 
GATHER_ELEMENTS, + INSTANCE_NORMALIZATION + } + + [BitLength(8)] + [EnumName("datatype_t")] + [Browsable(false)] + public enum DataType + { + } + + [BitLength(8)] + [EnumName("onehot_mode_t")] + [Browsable(false)] + public enum OneHotMode + { + } + + [BitLength(8)] + [EnumName("pad_mode_t")] + [Browsable(false)] + public enum PadMode + { + } + + [BitLength(8)] + [EnumName("memory_location_t")] + [Browsable(false)] + public enum MemoryLocation + { + } + + [BitLength(8)] + [EnumName("reduce_op_t")] + [Browsable(false)] + public enum ReduceOp + { + } + + [BitLength(8)] + [EnumName("reduce_arg_op_t")] + [Browsable(false)] + public enum ReduceArgOp + { + } + + [BitLength(8)] + [EnumName("image_resize_mode_t")] + [Browsable(false)] + public enum ImageResizeMode + { + } + + [BitLength(8)] + [EnumName("binary_op_t")] + [Browsable(false)] + public enum BinaryOp + { + } + + [BitLength(8)] + [EnumName("unary_op_t")] + [Browsable(false)] + public enum UnaryOp + { + } + + [BitLength(8)] + [EnumName("compare_op_t")] + [Browsable(false)] + public enum CompareOp + { + } + + [BitLength(8)] + [EnumName("roi_align_mode_t")] + [Browsable(false)] + public enum RoiAlignMode + { + } + + public abstract class Instruction + { + [DisplayName("opcode")] + [Description("OpCode")] + public abstract OpCode OpCode { get; } + } + + [DisplayName("NOP")] + [Category("Control and Status Instructions")] + [Description("No operation")] + public class NopInstruction : Instruction + { + public override OpCode OpCode => OpCode.NOP; + } + + [DisplayName("LDC_I4")] + [Category("Immediate Instructions")] + [Description("Load immediate I4 to stack")] + public class LdcI4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDC_I4; + + [DisplayName("imm")] + [Description("Immediate I4")] + public int Imm { get; set; } + } + + [DisplayName("LDNULL")] + [Category("Immediate Instructions")] + [Description("Load immediate nullptr as I to stack")] + public class LdNullInstruction : Instruction 
+ { + public override OpCode OpCode => OpCode.LDNULL; + } + + [DisplayName("LDC_I4_0")] + [Category("Immediate Instructions")] + [Description("Load immediate 0 as I4 to stack")] + public class LdcI4_0Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDC_I4_0; + } + + [DisplayName("LDC_I4_1")] + [Category("Immediate Instructions")] + [Description("Load immediate 1 as I4 to stack")] + public class LdcI4_1Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDC_I4_1; + } + + [DisplayName("LDC_R4")] + [Category("Immediate Instructions")] + [Description("Load immediate R4 to stack")] + public class LdcR4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDC_R4; + + [DisplayName("imm")] + [Description("Immediate R4")] + public float Imm { get; set; } + } + + [Category("Load Store Instructions")] + public abstract class LdStindInstruction : Instruction + { + } + + [DisplayName("LDIND_I1")] + [Description("Load indirect I1 to stack")] + public class LdindI1Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_I1; + } + + [DisplayName("LDIND_I2")] + [Description("Load indirect I2 to stack")] + public class LdindI2Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_I2; + } + + [DisplayName("LDIND_I4")] + [Description("Load indirect I4 to stack")] + public class LdindI4Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_I4; + } + + [DisplayName("LDIND_I")] + [Description("Load indirect I to stack")] + public class LdindIInstruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_I; + } + + [DisplayName("LDIND_U1")] + [Description("Load indirect U1 to stack")] + public class LdindU1Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_U1; + } + + [DisplayName("LDIND_U2")] + [Description("Load indirect U2 to stack")] + public class LdindU2Instruction : 
LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_U2; + } + + [DisplayName("LDIND_U4")] + [Description("Load indirect U4 to stack")] + public class LdindU4Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_U4; + } + + [DisplayName("LDIND_U")] + [Description("Load indirect U to stack")] + public class LdindUInstruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_U; + } + + [DisplayName("LDIND_BR2")] + [Description("Load indirect BR2 to stack")] + public class LdindBR2Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_BR2; + } + + [DisplayName("LDIND_R4")] + [Description("Load indirect R4 to stack")] + public class LdindR4Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.LDIND_R4; + } + + [DisplayName("STIND_I1")] + [Description("Store indirect I1 from stack")] + public class StindI1Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_I1; + } + + [DisplayName("STIND_I2")] + [Description("Store indirect I2 from stack")] + public class StindI2Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_I2; + } + + [DisplayName("STIND_I4")] + [Description("Store indirect I4 from stack")] + public class StindI4Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_I4; + } + + [DisplayName("STIND_I")] + [Description("Store indirect I from stack")] + public class StindIInstruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_I; + } + + [DisplayName("STIND_BR2")] + [Description("Store indirect BR2 from stack")] + public class StindBR2Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_BR2; + } + + [DisplayName("STIND_R4")] + [Description("Store indirect R4 from stack")] + public class StindR4Instruction : LdStindInstruction + { + public override OpCode OpCode => OpCode.STIND_R4; + } + 
+ [DisplayName("LEA_GP")] + [Category("Load Store Instructions")] + [Description("Load a global pointer with offset to stack")] + public class LeaGPInstruction : Instruction + { + public override OpCode OpCode => OpCode.LEA_GP; + + [DisplayName("gpid")] + [Description("Global pointer id")] + public byte GpId { get; set; } + + [DisplayName("offset")] + [Description("Signed immediate offset")] + public int Offset { get; set; } + } + + [DisplayName("LEA_BUFFER")] + [Category("Load Store Instructions")] + [Description("Load a buffer pointer with offset to stack")] + public class LeaBufferInstruction : Instruction + { + public override OpCode OpCode => OpCode.LEA_BUFFER; + + [DisplayName("location")] + [Description("Location")] + public MemoryLocation Location { get; set; } + + [DisplayName("subres_id")] + [Description("SubresourceId")] + public byte SubresourceId { get; set; } + + [DisplayName("offset")] + [Description("Unsigned immediate offset")] + public uint Offset { get; set; } + } + + [DisplayName("LDELEM_I1")] + [Category("Load Store Instructions")] + [Description("Load an array element of I1 to stack")] + public class LdelemI1Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_I1; + } + + [DisplayName("LDELEM_I2")] + [Category("Load Store Instructions")] + [Description("Load an array element of I2 to stack")] + public class LdelemI2Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_I2; + } + + [DisplayName("LDELEM_I4")] + [Category("Load Store Instructions")] + [Description("Load an array element of I4 to stack")] + public class LdelemI4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_I4; + } + + [DisplayName("LDELEM_I")] + [Category("Load Store Instructions")] + [Description("Load an array element of I to stack")] + public class LdelemIInstruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_I; + } + + [DisplayName("LDELEM_U1")] + [Category("Load Store 
Instructions")] + [Description("Load an array element of U1 to stack")] + public class LdelemU1Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_U1; + } + + [DisplayName("LDELEM_U2")] + [Category("Load Store Instructions")] + [Description("Load an array element of U2 to stack")] + public class LdelemU2Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_U2; + } + + [DisplayName("LDELEM_U4")] + [Category("Load Store Instructions")] + [Description("Load an array element of U4 to stack")] + public class LdelemU4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_U4; + } + + [DisplayName("LDELEM_U")] + [Category("Load Store Instructions")] + [Description("Load an array element of U to stack")] + public class LdelemUInstruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_U; + } + + [DisplayName("LDELEM_BR2")] + [Category("Load Store Instructions")] + [Description("Load an array element of BR2 to stack")] + public class LdelemBR2Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_BR2; + } + + [DisplayName("LDELEM_R4")] + [Category("Load Store Instructions")] + [Description("Load an array element of R4 to stack")] + public class LdelemR4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDELEM_R4; + } + + [DisplayName("STELEM_I1")] + [Category("Load Store Instructions")] + [Description("Store an array element of I1 from stack")] + public class StelemI1Instruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_I1; + } + + [DisplayName("STELEM_I2")] + [Category("Load Store Instructions")] + [Description("Store an array element of I2 from stack")] + public class StelemI2Instruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_I2; + } + + [DisplayName("STELEM_I4")] + [Category("Load Store Instructions")] + [Description("Store an array element of I4 from stack")] + public class 
StelemI4Instruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_I4; + } + + [DisplayName("STELEM_I")] + [Category("Load Store Instructions")] + [Description("Store an array element of I from stack")] + public class StelemIInstruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_I; + } + + [DisplayName("STELEM_BR2")] + [Category("Load Store Instructions")] + [Description("Store an array element of BR2 from stack")] + public class StelemBR2Instruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_BR2; + } + + [DisplayName("STELEM_R4")] + [Category("Load Store Instructions")] + [Description("Store an array element of R4 from stack")] + public class StelemR4Instruction : Instruction + { + public override OpCode OpCode => OpCode.STELEM_R4; + } + + [DisplayName("LDARG")] + [Category("Load Store Instructions")] + [Description("Load an argument to stack")] + public class LdargInstruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG; + + [DisplayName("index")] + [Description("Argument index")] + public uint Index { get; set; } + } + + [DisplayName("LDARG_0")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 0 to stack")] + public class Ldarg0Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG_0; + } + + [DisplayName("LDARG_1")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 1 to stack")] + public class Ldarg1Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG_1; + } + + [DisplayName("LDARG_2")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 2 to stack")] + public class Ldarg2Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG_2; + } + + [DisplayName("LDARG_3")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 3 to stack")] + public class Ldarg3Instruction : 
Instruction + { + public override OpCode OpCode => OpCode.LDARG_3; + } + + [DisplayName("LDARG_4")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 4 to stack")] + public class Ldarg4Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG_4; + } + + [DisplayName("LDARG_5")] + [Category("Load Store Instructions")] + [Description("Load an argument with index of 5 to stack")] + public class Ldarg5Instruction : Instruction + { + public override OpCode OpCode => OpCode.LDARG_5; + } + + [DisplayName("STSHAPE")] + [Category("Load Store Instructions")] + [Description("Store a shape from stack")] + public class StShapeInstruction : Instruction + { + public override OpCode OpCode => OpCode.STSHAPE; + + [DisplayName("rshape")] + [Description("Shape register index")] + public byte Rshape { get; set; } + + [DisplayName("rank")] + [Description("Shape's rank")] + public byte Rank { get; set; } + } + + [DisplayName("STPADDINGS")] + [Category("Load Store Instructions")] + [Description("Store paddings from stack")] + public class StPaddingsInstruction : Instruction + { + public override OpCode OpCode => OpCode.STPADDINGS; + + [DisplayName("rpaddings")] + [Description("Paddings register index")] + public byte Rpaddings { get; set; } + + [DisplayName("rank")] + [Description("Paddings' rank")] + public byte Rank { get; set; } + } + + [DisplayName("DUP")] + [Category("Stack Instructions")] + [Description("Duplicate the top item of stack")] + public class DupInstruction : Instruction + { + public override OpCode OpCode => OpCode.DUP; + } + + [DisplayName("POP")] + [Category("Stack Instructions")] + [Description("Pop the top item of stack")] + public class PopInstruction : Instruction + { + public override OpCode OpCode => OpCode.POP; + } + + [DisplayName("NEG")] + [Category("Computational Instructions")] + [Description("Negates a value and pushes the result onto the evaluation stack")] + public class NegInstruction : 
Instruction + { + public override OpCode OpCode => OpCode.NEG; + } + + [DisplayName("ADD")] + [Category("Computational Instructions")] + [Description("Adds two values and pushes the result onto the evaluation stack")] + public class AddInstruction : Instruction + { + public override OpCode OpCode => OpCode.ADD; + } + + [DisplayName("SUB")] + [Category("Computational Instructions")] + [Description("Subtracts one value from another and pushes the result onto the evaluation stack")] + public class SubInstruction : Instruction + { + public override OpCode OpCode => OpCode.SUB; + } + + [DisplayName("MUL")] + [Category("Computational Instructions")] + [Description("Multiplies two values and pushes the result on the evaluation stack")] + public class MulInstruction : Instruction + { + public override OpCode OpCode => OpCode.MUL; + } + + [DisplayName("DIV")] + [Category("Computational Instructions")] + [Description("Divides two values and pushes the result as a floating-point (type F) or quotient (type int32) onto the evaluation stack")] + public class DivInstruction : Instruction + { + public override OpCode OpCode => OpCode.DIV; + } + + [DisplayName("DIV_U")] + [Category("Computational Instructions")] + [Description("Divides two unsigned integer values and pushes the result (int32) onto the evaluation stack")] + public class DivUInstruction : Instruction + { + public override OpCode OpCode => OpCode.DIV_U; + } + + [DisplayName("REM")] + [Category("Computational Instructions")] + [Description("Divides two values and pushes the remainder onto the evaluation stack")] + public class RemInstruction : Instruction + { + public override OpCode OpCode => OpCode.REM; + } + + [DisplayName("REM_U")] + [Category("Computational Instructions")] + [Description("Divides two unsigned values and pushes the remainder onto the evaluation stack")] + public class RemUInstruction : Instruction + { + public override OpCode OpCode => OpCode.REM_U; + } + + [DisplayName("AND")] + 
[Category("Computational Instructions")] + [Description("Computes the bitwise AND of two values and pushes the result onto the evaluation stack")] + public class AndInstruction : Instruction + { + public override OpCode OpCode => OpCode.AND; + } + + [DisplayName("OR")] + [Category("Computational Instructions")] + [Description("Computes the bitwise OR of the two integer values on top of the stack and pushes the result onto the evaluation stack")] + public class OrInstruction : Instruction + { + public override OpCode OpCode => OpCode.OR; + } + + [DisplayName("XOR")] + [Category("Computational Instructions")] + [Description("Computes the bitwise XOR of the top two values on the evaluation stack, pushing the result onto the evaluation stack")] + public class XorInstruction : Instruction + { + public override OpCode OpCode => OpCode.XOR; + } + + [DisplayName("NOT")] + [Category("Computational Instructions")] + [Description("Computes the bitwise complement of the integer value on top of the stack and pushes the result onto the evaluation stack as the same type")] + public class NotInstruction : Instruction + { + public override OpCode OpCode => OpCode.NOT; + } + + [DisplayName("SHL")] + [Category("Computational Instructions")] + [Description("Shifts an integer value to the left (in zeroes) by a specified number of bits, pushing the result onto the evaluation stack")] + public class ShlInstruction : Instruction + { + public override OpCode OpCode => OpCode.SHL; + } + + [DisplayName("SHR")] + [Category("Computational Instructions")] + [Description("Shifts an integer value (in sign) to the right by a specified number of bits, pushing the result onto the evaluation stack")] + public class ShrInstruction : Instruction + { + public override OpCode OpCode => OpCode.SHR; + } + + [DisplayName("SHR_U")] + [Category("Computational Instructions")] + [Description("Shifts an unsigned integer value (in zeroes) to the right by a specified number of bits, pushing the result onto
the evaluation stack")] + public class ShrUInstruction : Instruction + { + public override OpCode OpCode => OpCode.SHR_U; + } + + [DisplayName("CLT")] + [Category("Computational Instructions")] + [Description("Compares two values. If the first value is less than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CltInstruction : Instruction + { + public override OpCode OpCode => OpCode.CLT; + } + + [DisplayName("CLT_U")] + [Category("Computational Instructions")] + [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CltUInstruction : Instruction + { + public override OpCode OpCode => OpCode.CLT_U; + } + + [DisplayName("CLE")] + [Category("Computational Instructions")] + [Description("Compares two values. If the first value is less than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CleInstruction : Instruction + { + public override OpCode OpCode => OpCode.CLE; + } + + [DisplayName("CLE_U")] + [Category("Computational Instructions")] + [Description("Compares the unsigned or unordered values value1 and value2. If value1 is less than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CleUInstruction : Instruction + { + public override OpCode OpCode => OpCode.CLE_U; + } + + [DisplayName("CEQ")] + [Category("Computational Instructions")] + [Description("Compares two values. 
If they are equal, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CeqInstruction : Instruction + { + public override OpCode OpCode => OpCode.CEQ; + } + + [DisplayName("CGE")] + [Category("Computational Instructions")] + [Description("Compares two values. If the first value is greater than or equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CgeInstruction : Instruction + { + public override OpCode OpCode => OpCode.CGE; + } + + [DisplayName("CGE_U")] + [Category("Computational Instructions")] + [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than or equal to value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CgeUInstruction : Instruction + { + public override OpCode OpCode => OpCode.CGE_U; + } + + [DisplayName("CGT")] + [Category("Computational Instructions")] + [Description("Compares two values. If the first value is greater than the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CgtInstruction : Instruction + { + public override OpCode OpCode => OpCode.CGT; + } + + [DisplayName("CGT_U")] + [Category("Computational Instructions")] + [Description("Compares the unsigned or unordered values value1 and value2. If value1 is greater than value2, then the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CgtUInstruction : Instruction + { + public override OpCode OpCode => OpCode.CGT_U; + } + + [DisplayName("CNE")] + [Category("Computational Instructions")] + [Description("Compares two values. 
If the first value is not equal to the second, the integer value 1 (int32) is pushed onto the evaluation stack; otherwise 0 (int32) is pushed onto the evaluation stack")] + public class CneInstruction : Instruction + { + public override OpCode OpCode => OpCode.CNE; + } + + [DisplayName("CONV_I1")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to int8, and extends it to int32")] + public class ConvI1Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_I1; + } + + [DisplayName("CONV_I2")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to int16, and extends it to int32")] + public class ConvI2Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_I2; + } + + [DisplayName("CONV_I4")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to int32, and extends it to int32")] + public class ConvI4Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_I4; + } + + [DisplayName("CONV_I")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to native int, and extends it to int32")] + public class ConvIInstruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_I; + } + + [DisplayName("CONV_U1")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to unsigned int8, and extends it to int32")] + public class ConvU1Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_U1; + } + + [DisplayName("CONV_U2")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to unsigned int16, and extends it to int32")] + public class ConvU2Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_U2; + } + + [DisplayName("CONV_U4")] + 
[Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to unsigned int32, and extends it to int32")] + public class ConvU4Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_U4; + } + + [DisplayName("CONV_U")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to unsigned native int, and extends it to int32")] + public class ConvUInstruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_U; + } + + [DisplayName("CONV_BR2")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to bfloat16")] + public class ConvBR2Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_BR2; + } + + [DisplayName("CONV_R4")] + [Category("Conversion Instructions")] + [Description("Converts the value on top of the evaluation stack to float32")] + public class ConvR4Instruction : Instruction + { + public override OpCode OpCode => OpCode.CONV_R4; + } + + [DisplayName("BR")] + [Category("Control and Status Instructions")] + [Description("Unconditionally transfers control to a target instruction")] + public class BrInstruction : Instruction + { + public override OpCode OpCode => OpCode.BR; + + [DisplayName("target")] + [Description("Branches to a target instruction at the specified offset")] + public int Target { get; set; } + } + + [DisplayName("BR_TRUE")] + [Category("Control and Status Instructions")] + [Description("Transfers control to a target instruction if value is true, not null, or non-zero")] + public class BrTrueInstruction : Instruction + { + public override OpCode OpCode => OpCode.BR_TRUE; + + [DisplayName("target")] + [Description("Branches to a target instruction at the specified offset")] + public int Target { get; set; } + } + + [DisplayName("BR_FALSE")] + [Category("Control and Status Instructions")] + [Description("Transfers control to a target 
instruction if value is false, null, or zero")] + public class BrFalseInstruction : Instruction + { + public override OpCode OpCode => OpCode.BR_FALSE; + + [DisplayName("target")] + [Description("Branches to a target instruction at the specified offset")] + public int Target { get; set; } + } + + [DisplayName("RET")] + [Category("Control and Status Instructions")] + [Description("Return")] + public class RetInstruction : Instruction + { + public override OpCode OpCode => OpCode.RET; + } + + [DisplayName("CALL")] + [Category("Control and Status Instructions")] + [Description("Call a target method")] + public class CallInstruction : Instruction + { + public override OpCode OpCode => OpCode.CALL; + + [DisplayName("args")] + [Description("Arguments count")] + public byte ArgsCount { get; set; } + + [DisplayName("target")] + [Description("Call a target method at the specified offset")] + public int Target { get; set; } + } + + [DisplayName("ECALL")] + [Category("Control and Status Instructions")] + [Description("Call an environment method")] + public class ECallInstruction : Instruction + { + public override OpCode OpCode => OpCode.ECALL; + + [DisplayName("args")] + [Description("Arguments count")] + public byte ArgsCount { get; set; } + } + + [DisplayName("THROW")] + [Category("Control and Status Instructions")] + [Description("Throw an error code currently on the evaluation stack")] + public class ThrowInstruction : Instruction + { + public override OpCode OpCode => OpCode.THROW; + } + + [DisplayName("BREAK")] + [Category("Control and Status Instructions")] + [Description("Inform the debugger that a break point has been tripped")] + public class BreakInstruction : Instruction + { + public override OpCode OpCode => OpCode.BREAK; + } + + public static class TensorCalls + { + public abstract class TensorInstruction : Instruction + { + public sealed override OpCode OpCode => OpCode.TENSOR; + + [DisplayName("funct")] + [Description("Tensor call function")] + public abstract
TensorFunction Function { get; } + } + + [DisplayName("TENSOR.BATCH_TO_SPACE")] + [Category("Tensor Instructions")] + [Description("BatchToSpace")] + public class BatchToSpaceInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.BATCH_TO_SPACE; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_block")] + [Description("Block shape register")] + public byte RshapeBlock { get; set; } + + [DisplayName("rpad_crops")] + [Description("Crops paddings register")] + public byte RpadCrops { get; set; } + } + + [DisplayName("TENSOR.BROADCAST")] + [Category("Tensor Instructions")] + [Description("Broadcast")] + public class BroadcastInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.BROADCAST; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.BINARY")] + [Category("Tensor Instructions")] + [Description("Binary")] + public class BinaryInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.BINARY; + + [DisplayName("datatype")] + 
[Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src1")] + [Description("Source1 shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("rstride_src1")] + [Description("Source1 stride register")] + public byte RstrideSrc1 { get; set; } + + [DisplayName("rshape_src2")] + [Description("Source2 shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("rstride_src2")] + [Description("Source2 stride register")] + public byte RstrideSrc2 { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("binary_op")] + [Description("Binary operator")] + public BinaryOp BinaryOp { get; set; } + + [DisplayName("fused_clamp_low")] + [Description("FusedClampLow")] + public float FusedClampLow { get; set; } + + [DisplayName("fused_clamp_high")] + [Description("FusedClampHigh")] + public float FusedClampHigh { get; set; } + } + + [DisplayName("TENSOR.CALL")] + [Category("Tensor Instructions")] + [Description("Call")] + public class CallInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.CALL; + + [DisplayName("function_id")] + [Description("Function Id")] + public uint FunctionId { get; set; } + + [DisplayName("module_id")] + [Description("Module Id")] + public ushort ModuleId { get; set; } + + [DisplayName("num_src")] + [Description("Source count")] + public byte SrcCount { get; set; } + + [DisplayName("num_dst")] + [Description("Dest count")] + public byte DstCount { get; set; } + } + + [DisplayName("TENSOR.COMPARE")] + [Category("Tensor Instructions")] + [Description("Compare")] + public class CompareInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.COMPARE; + + [DisplayName("datatype")] + [Description("Datatype")] + 
public DataType DataType { get; set; } + + [DisplayName("rshape_src1")] + [Description("Source1 shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("rstride_src1")] + [Description("Source1 stride register")] + public byte RstrideSrc1 { get; set; } + + [DisplayName("rshape_src2")] + [Description("Source2 shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("rstride_src2")] + [Description("Source2 stride register")] + public byte RstrideSrc2 { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("compare_op")] + [Description("Compare operator")] + public CompareOp CompareOp { get; set; } + } + [DisplayName("TENSOR.CONV2D")] + [Category("Tensor Instructions")] + [Description("Conv2D")] + public class Conv2DInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.CONV2D; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rshape_kernel")] + [Description("Kernel shape register")] + public byte RshapeKernel { get; set; } + + [DisplayName("rstride_kernel")] + [Description("Kernel stride register")] + public byte RstrideKernel { get; set; } + + [DisplayName("rstride_bias")] + [Description("Bias stride register")] + public byte RstrideBias { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("groups")] + [Description("Groups")] + public ushort Groups { get; set; } + + [DisplayName("stride_h")] + [Description("StrideH")] + 
public ushort StrideH { get; set; } + + [DisplayName("stride_w")] + [Description("StrideW")] + public ushort StrideW { get; set; } + + [DisplayName("dilation_h")] + [Description("DilationH")] + public ushort DilationH { get; set; } + + [DisplayName("dilation_w")] + [Description("DilationW")] + public ushort DilationW { get; set; } + + [DisplayName("fused_clamp_low")] + [Description("FusedClampLow")] + public float FusedClampLow { get; set; } + + [DisplayName("fused_clamp_high")] + [Description("FusedClampHigh")] + public float FusedClampHigh { get; set; } + } + + [DisplayName("TENSOR.COPY")] + [Category("Tensor Instructions")] + [Description("Copy")] + public class CopyInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.COPY; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape")] + [Description("Shape register")] + public byte Rshape { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.CONVERT")] + [Category("Tensor Instructions")] + [Description("Convert")] + public class ConvertInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.CONVERT; + + [DisplayName("in_datatype")] + [Description("Source Datatype")] + public DataType SrcDataType { get; set; } + + [DisplayName("dst_datatype")] + [Description("Dest Datatype")] + public DataType DestDataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } +
} + + [DisplayName("TENSOR.CUMSUM")] + [Category("Tensor Instructions")] + [Description("CumSum")] + public class CumSumInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.CUMSUM; + + [DisplayName("datatype")] + [Description("Input/Output datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public int Axis { get; set; } + + [DisplayName("exclusive")] + [Description("Exclusive")] + public bool Exclusive { get; set; } + + [DisplayName("reverse")] + [Description("Reverse")] + public bool Reverse { get; set; } + } + + [DisplayName("TENSOR.DEQUANTIZE")] + [Category("Tensor Instructions")] + [Description("Dequantize")] + public class DequantizeInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.DEQUANTIZE; + + [DisplayName("in_datatype")] + [Description("Source Datatype")] + public DataType SrcDataType { get; set; } + + [DisplayName("dst_datatype")] + [Description("Dest Datatype")] + public DataType DestDataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.GATHER")] + [Category("Tensor Instructions")] + [Description("Gather")] + public class GatherInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.GATHER; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rshape_dest")] + 
[Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_indices")] + [Description("Indices shape register")] + public byte RshapeIndices { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public byte Axis { get; set; } + } + + [DisplayName("TENSOR.GATHER_ND")] + [Category("Tensor Instructions")] + [Description("GatherND")] + public class GatherNDInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.GATHER_ND; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_indices")] + [Description("Indices shape register")] + public byte RshapeIndices { get; set; } + + [DisplayName("batch_dims")] + [Description("Batch Dims")] + public byte Batchdims { get; set; } + } + + [DisplayName("TENSOR.HARDMAX")] + [Category("Tensor Instructions")] + [Description("Hardmax")] + public class HardmaxInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.HARDMAX; + + [DisplayName("datatype")] + [Description("Input/Output datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + 
[Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public int Axis { get; set; } + } + + [DisplayName("TENSOR.LUT1D")] + [Category("Tensor Instructions")] + [Description("Lut1D")] + public class LUT1DInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.LUT1D; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("table_len")] + [Description("Table length")] + public ushort TableLength { get; set; } + } + + [DisplayName("TENSOR.MATMUL")] + [Category("Tensor Instructions")] + [Description("Matmul")] + public class MatmulInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.MATMUL; + + [DisplayName("rshape_src1")] + [Description("Source1 shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("rstride_src1")] + [Description("Source1 stride register")] + public byte RstrideSrc1 { get; set; } + + [DisplayName("rshape_src2")] + [Description("Source2 shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("rstride_src2")] + [Description("Source2 stride register")] + public byte RstrideSrc2 { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("fused_clamp_low")] + [Description("FusedClampLow")] + public float FusedClampLow { get; set; } + + [DisplayName("fused_clamp_high")] + 
[Description("FusedClampHigh")] + public float FusedClampHigh { get; set; } + } + + [DisplayName("TENSOR.ONEHOT")] + [Category("Tensor Instructions")] + [Description("OneHot")] + public class OneHotInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.ONEHOT; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_indices")] + [Description("Indices shape register")] + public byte RshapeIndices { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public byte Axis { get; set; } + + [DisplayName("onehot_mode")] + [Description("OneHot Mode")] + public OneHotMode OneHotMode { get; set; } + } + + [DisplayName("TENSOR.PAD")] + [Category("Tensor Instructions")] + [Description("Pad")] + public class PadInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.PAD; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rpaddings")] + [Description("Paddings register")] + public byte Rpaddings { get; set; } + + [DisplayName("pad_mode")] + [Description("Pad mode")] + public PadMode PadMode { get; set; } + } + + [DisplayName("TENSOR.QUANTIZE")] + [Category("Tensor Instructions")] + [Description("Quantize")] + public class QuantizeInstruction : TensorInstruction + { + public override TensorFunction Function 
=> TensorFunction.QUANTIZE; + + [DisplayName("in_datatype")] + [Description("Source Datatype")] + public DataType SrcDataType { get; set; } + + [DisplayName("dst_datatype")] + [Description("Dest Datatype")] + public DataType DestDataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.RANDOM_NORMAL")] + [Category("Tensor Instructions")] + [Description("RandomNormal")] + public class RandomNormalInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.RANDOM_NORMAL; + + [DisplayName("datatype_dest")] + [Description("Output datatype")] + public DataType DataTypeDest { get; set; } + + [DisplayName("rshape_dest")] + [Description("output shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("mean")] + [Description("Mean")] + public float Mean { get; set; } + + [DisplayName("std")] + [Description("Std")] + public float Std { get; set; } + + [DisplayName("seed")] + [Description("Seed")] + public float Seed { get; set; } + } + + [DisplayName("TENSOR.RANDOM_UNIFORM")] + [Category("Tensor Instructions")] + [Description("RandomUniform")] + public class RandomUniformInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.RANDOM_UNIFORM; + + [DisplayName("datatype_dest")] + [Description("Output datatype")] + public DataType DataTypeDest { get; set; } + + [DisplayName("rshape_dest")] + [Description("output shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("low")] + [Description("Low")] + public float Low { get; set; } + + [DisplayName("high")] + [Description("High")] + public float High { get; set; } + + [DisplayName("seed")] + 
[Description("Seed")] + public float Seed { get; set; } + } + + [DisplayName("TENSOR.REDUCE")] + [Category("Tensor Instructions")] + [Description("Reduce")] + public class ReduceInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.REDUCE; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("reduce_op")] + [Description("Reduce operator")] + public ReduceOp ReduceOp { get; set; } + + [DisplayName("rshape_axis")] + [Description("Axis shape register")] + public byte RshapeAxis { get; set; } + + [DisplayName("keep_dims")] + [Description("Keep dimensions")] + public bool KeepDims { get; set; } + } + + [DisplayName("TENSOR.REDUCE_ARG")] + [Category("Tensor Instructions")] + [Description("ReduceArg")] + public class ReduceArgInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.REDUCE_ARG; + + [DisplayName("datatype_src")] + [Description("Input datatype")] + public DataType DataTypeSrc { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("datatype_dest")] + [Description("Output datatype")] + public DataType DataTypeDest { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("reduce_arg_op")] + [Description("Reduce arg operator")] + public ReduceArgOp ReduceArgOp { get; set; } + + 
[DisplayName("rshape_axis")] + [Description("Axis shape register")] + public byte RshapeAxis { get; set; } + + [DisplayName("keep_dims")] + [Description("Keep dimensions")] + public bool KeepDims { get; set; } + + [DisplayName("select_last_idx")] + [Description("select last index")] + public bool SelectLastIdx { get; set; } + } + + [DisplayName("TENSOR.REDUCE_PROD")] + [Category("Tensor Instructions")] + [Description("ReduceProd")] + public class ReduceProdInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.REDUCE_PROD; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_axes")] + [Description("Axes shape register")] + public byte RshapeAxes { get; set; } + + [DisplayName("keep_dims")] + [Description("Keep dimensions")] + public bool KeepDims { get; set; } + } + + [DisplayName("TENSOR.REDUCE_WINDOW2D")] + [Category("Tensor Instructions")] + [Description("REDUCE_WINDOW2D")] + public class ReduceWindow2DInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.REDUCE_WINDOW2D; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("reduce_op")] + [Description("Reduce operator")] + public ReduceOp ReduceOp { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public 
byte RstrideDest { get; set; } + + [DisplayName("filter_h")] + [Description("FilterH")] + public ushort FilterH { get; set; } + + [DisplayName("filter_w")] + [Description("FilterW")] + public ushort FilterW { get; set; } + + [DisplayName("stride_h")] + [Description("StrideH")] + public ushort StrideH { get; set; } + + [DisplayName("stride_w")] + [Description("StrideW")] + public ushort StrideW { get; set; } + + [DisplayName("dilation_h")] + [Description("DilationH")] + public ushort DilationH { get; set; } + + [DisplayName("dilation_w")] + [Description("DilationW")] + public ushort DilationW { get; set; } + + [DisplayName("fused_clamp_low")] + [Description("FusedClampLow")] + public float FusedClampLow { get; set; } + + [DisplayName("fused_clamp_high")] + [Description("FusedClampHigh")] + public float FusedClampHigh { get; set; } + } + + [DisplayName("TENSOR.RESIZE_IMAGE")] + [Category("Tensor Instructions")] + [Description("RESIZE_IMAGE")] + public class ResizeImageInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.RESIZE_IMAGE; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("align_corners")] + [Description("Align Corners")] + public bool AlignCorners { get; set; } + + [DisplayName("half_pixel_centers")] + [Description("Half Pixel Centers")] + public bool HalfPixelCenters { get; set; } + + [DisplayName("image_resize_mode")] + [Description("Image Resize Mode")] + public ImageResizeMode ImageResizeMode { get; set; } + } + + [DisplayName("TENSOR.ROI_ALIGN")] + [Category("Tensor Instructions")] + [Description("RoiAlign")] + 
public class RoiAlignInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.ROI_ALIGN; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rshape_dest")] + [Description("Dest shape register")] + public byte RshapeDest { get; set; } + + [DisplayName("mode")] + [Description("Mode")] + public RoiAlignMode mode { get; set; } + + [DisplayName("spatial_scale")] + [Description("Spatial Scale")] + public float SpatialScale { get; set; } + + [DisplayName("sampling_ratio")] + [Description("Sampling Ratio")] + public long SamplingRatio { get; set; } + } + + [DisplayName("TENSOR.SIGMOID")] + [Category("Tensor Instructions")] + [Description("Sigmoid")] + public class SigmoidInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.SIGMOID; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.SLICE")] + [Category("Tensor Instructions")] + [Description("Slice")] + public class SliceInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.SLICE; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + 
[DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rbegins")] + [Description("Begins shape register")] + public byte Rbegins { get; set; } + + [DisplayName("rends")] + [Description("Ends shape register")] + public byte Rends { get; set; } + + [DisplayName("rstrides")] + [Description("Strides shape register")] + public byte Strides { get; set; } + } + + [DisplayName("TENSOR.SOFTMAX")] + [Category("Tensor Instructions")] + [Description("Softmax")] + public class SoftmaxInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.SOFTMAX; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public int Axis { get; set; } + + [DisplayName("beta")] + [Description("Beta")] + public float Beta { get; set; } + } + + [DisplayName("TENSOR.SPACE_TO_BATCH")] + [Category("Tensor Instructions")] + [Description("SpaceToBatch")] + public class SpaceToBatchInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.SPACE_TO_BATCH; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_block")] + 
[Description("Block shape register")] + public byte RshapeBlock { get; set; } + + [DisplayName("rpad_crops")] + [Description("Crops paddings register")] + public byte RpadCrops { get; set; } + } + + [DisplayName("TENSOR.TERNARY")] + [Category("Tensor Instructions")] + [Description("Ternary")] + public class TernaryInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.TERNARY; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src1")] + [Description("Source1 shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("rstride_src1")] + [Description("Source1 stride register")] + public byte RstrideSrc1 { get; set; } + + [DisplayName("rshape_src2")] + [Description("Source2 shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("rstride_src2")] + [Description("Source2 stride register")] + public byte RstrideSrc2 { get; set; } + + [DisplayName("rshape_src3")] + [Description("Source3 shape register")] + public byte RshapeSrc3 { get; set; } + + [DisplayName("rstride_src3")] + [Description("Source3 stride register")] + public byte RstrideSrc3 { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + } + + [DisplayName("TENSOR.TOPK")] + [Category("Tensor Instructions")] + [Description("Topk")] + public class TopKInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.TOPK; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rshape_dest1")] + [Description("Dest1 shape register")] + public byte RshapeDest1 { get; set; } + + 
[DisplayName("rstride_dest1")] + [Description("Dest1 stride register")] + public byte RstrideDest1 { get; set; } + + [DisplayName("rshape_dest2")] + [Description("Dest2 shape register")] + public byte RshapeDest2 { get; set; } + + [DisplayName("rstride_dest2")] + [Description("Dest2 stride register")] + public byte RstrideDest2 { get; set; } + + [DisplayName("k")] + [Description("K")] + public long K { get; set; } + + [DisplayName("axis")] + [Description("Axis")] + public int Axis { get; set; } + + [DisplayName("largest")] + [Description("Largest")] + public bool Largest { get; set; } + + [DisplayName("sorted")] + [Description("Sorted")] + public bool Sorted { get; set; } + } + + [DisplayName("TENSOR.TRILU")] + [Category("Tensor Instructions")] + [Description("Trilu")] + public class TriluInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.TRILU; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("upper")] + [Description("Upper")] + public bool Upper { get; set; } + + [DisplayName("k")] + [Description("K")] + public long K { get; set; } + } + + [DisplayName("TENSOR.UNARY")] + [Category("Tensor Instructions")] + [Description("Unary")] + public class UnaryInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.UNARY; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source1 shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("unary_op")] + [Description("Unary operator")] + 
public UnaryOp UnaryOp { get; set; } + } + + [DisplayName("TENSOR.TRANSPOSE")] + [Category("Tensor Instructions")] + [Description("Transpose")] + public class TransposeInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.TRANSPOSE; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("rshape_src")] + [Description("Source shape register")] + public byte RshapeSrc { get; set; } + + [DisplayName("rstride_src")] + [Description("Source stride register")] + public byte RstrideSrc { get; set; } + + [DisplayName("rstride_dest")] + [Description("Dest stride register")] + public byte RstrideDest { get; set; } + + [DisplayName("rshape_perm")] + [Description("Perm shape register")] + public byte RshapePerm { get; set; } + } + [DisplayName("TENSOR.GRU")] + [Category("Tensor Instructions")] + [Description("Gru")] + public class GruInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.GRU; + + [DisplayName("input_shape_src")] + [Description("Input shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("w_shape_src")] + [Description("W shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("direction")] + [Description("direction register")] + public byte Direction { get; set; } + + [DisplayName("linear_before_reset")] + [Description("LBR register")] + public bool LinearBeforeReset { get; set; } + + } + [DisplayName("TENSOR.TFLITE_DETECTION_POSTPROCESS")] + [Category("Tensor Instructions")] + [Description("Tflite_Detection_Postprocess")] + public class TfliteDetectionPostprocessInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.TFLITE_DETECTION_POSTPROCESS; + + [DisplayName("box_shape_src")] + [Description("Box shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("score_shape_src")] + [Description("Score shape register")] + public 
byte RshapeSrc2 { get; set; } + + [DisplayName("anchor_shape_src")] + [Description("Anchor shape register")] + public byte RshapeSrc3 { get; set; } + + [DisplayName("max_detections")] + [Description("max_detections register")] + public int MaxDetections { get; set; } + + [DisplayName("max_classes_per_detection")] + [Description("max_classes_per_detection register")] + public int MaxClassesPerDetection { get; set; } + + [DisplayName("detections_per_class")] + [Description("detections_per_class register")] + public int DetectionsPerClass { get; set; } + + [DisplayName("use_regular_non_max_suppression")] + [Description("use_regular_non_max_suppression register")] + public bool UseRegularNonMaxSuppression { get; set; } + + [DisplayName("nms_score_threshold")] + [Description("nms_score_threshold register")] + public float NmsScoreThreshold { get; set; } + + [DisplayName("nms_iou_threshold")] + [Description("nms_iou_threshold register")] + public float NmsIouThreshold { get; set; } + + [DisplayName("num_classes")] + [Description("num_classes register")] + public int NumClasses { get; set; } + + [DisplayName("y_scale")] + [Description("y_scale register")] + public float YScale { get; set; } + + [DisplayName("x_scale")] + [Description("x_scale register")] + public float XScale { get; set; } + + [DisplayName("h_scale")] + [Description("h_scale register")] + public float HScale { get; set; } + + [DisplayName("w_scale")] + [Description("w_scale register")] + public float WScale { get; set; } + } + + [DisplayName("TENSOR.LAYER_NORMALIZATION")] + [Category("Tensor Instructions")] + [Description("LAYER_NORMALIZATION")] + public class LayerNormInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.LAYER_NORMALIZATION; + + [DisplayName("datatype")] + [Description("Datatype")] + public DataType DataType { get; set; } + + [DisplayName("input_shape")] + [Description("input_shape")] + public byte input_shape { get; set; } + 
[DisplayName("axis")] + [Description("axis")] + public int axis { get; set; } + + [DisplayName("epsilon")] + [Description("epsilon")] + public float epsilon { get; set; } + } + + [DisplayName("TENSOR.COMPRESS")] + [Category("Tensor Instructions")] + [Description("Compress")] + public class CompressInstruction : TensorInstruction + { + public override TensorFunction Function => TensorFunction.COMPRESS; + + [DisplayName("input_shape_src")] + [Description("Input shape register")] + public byte RshapeSrc1 { get; set; } + + [DisplayName("condition_shape_src")] + [Description("Condition shape register")] + public byte RshapeSrc2 { get; set; } + + [DisplayName("axis")] + [Description("axis register")] + public float axis { get; set; } + } + + [DisplayName("TENSOR.GATHER_ELEMENTS")] [Category("Tensor Instructions")] - [Description("Matmul")] - public class MatmulInstruction : TensorInstruction + [Description("Gather_Elements")] + public class Gather_ElementsInstruction : TensorInstruction { - public override TensorFunction Function => TensorFunction.MATMUL; + public override TensorFunction Function => TensorFunction.GATHER_ELEMENTS; - [DisplayName("rshape_src1")] - [Description("Source1 shape register")] + [DisplayName("input_shape_src")] + [Description("Input shape register")] public byte RshapeSrc1 { get; set; } - [DisplayName("rstride_src1")] - [Description("Source1 stride register")] - public byte RstrideSrc1 { get; set; } - - [DisplayName("rshape_src2")] - [Description("Source2 shape register")] - public byte RshapeSrc2 { get; set; } - - [DisplayName("rstride_src2")] - [Description("Source2 stride register")] - public byte RstrideSrc2 { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("fused_clamp_low")] - [Description("FusedClampLow")] - public float FusedClampLow { 
get; set; } - - [DisplayName("fused_clamp_high")] - [Description("FusedClampHigh")] - public float FusedClampHigh { get; set; } - } - - [DisplayName("TENSOR.ONEHOT")] - [Category("Tensor Instructions")] - [Description("OneHot")] - public class OneHotInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.ONEHOT; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_indices")] + [DisplayName("indices_shape_src")] [Description("Indices shape register")] - public byte RshapeIndices { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("axis")] - [Description("Axis")] - public byte Axis { get; set; } - - [DisplayName("onehot_mode")] - [Description("OneHot Mode")] - public OneHotMode OneHotMode { get; set; } - } - - [DisplayName("TENSOR.PAD")] - [Category("Tensor Instructions")] - [Description("Pad")] - public class PadInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.PAD; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rpaddings")] - [Description("Paddings register")] - public byte Rpaddings { get; set; } - - [DisplayName("pad_mode")] - [Description("Pad mode")] - public PadMode PadMode { get; set; } - } - - [DisplayName("TENSOR.QUANTIZE")] - [Category("Tensor Instructions")] - [Description("Quantize")] - public class 
QuantizeInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.QUANTIZE; - - [DisplayName("in_datatype")] - [Description("Source Datatype")] - public DataType SrcDataType { get; set; } - - [DisplayName("dst_datatype")] - [Description("Dest Datatype")] - public DataType DestDataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.RANDOM_NORMAL")] - [Category("Tensor Instructions")] - [Description("RandomNormal")] - public class RandomNormalInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.RANDOM_NORMAL; - - [DisplayName("datatype_dest")] - [Description("Output datatype")] - public DataType DataTypeDest { get; set; } - - [DisplayName("rshape_dest")] - [Description("output shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("mean")] - [Description("Mean")] - public float Mean { get; set; } - - [DisplayName("std")] - [Description("Std")] - public float Std { get; set; } - - [DisplayName("seed")] - [Description("Seed")] - public float Seed { get; set; } - } - - [DisplayName("TENSOR.RANDOM_UNIFORM")] - [Category("Tensor Instructions")] - [Description("RandomUniform")] - public class RandomUniformInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.RANDOM_UNIFORM; - - [DisplayName("datatype_dest")] - [Description("Output datatype")] - public DataType DataTypeDest { get; set; } - - [DisplayName("rshape_dest")] - [Description("output shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("low")] - [Description("Low")] - public float Low { get; set; } - - [DisplayName("high")] - 
[Description("High")] - public float High { get; set; } - - [DisplayName("seed")] - [Description("Seed")] - public float Seed { get; set; } - } - - [DisplayName("TENSOR.REDUCE")] - [Category("Tensor Instructions")] - [Description("Reduce")] - public class ReduceInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.REDUCE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("reduce_op")] - [Description("Reduce operator")] - public ReduceOp ReduceOp { get; set; } - - [DisplayName("rshape_axis")] - [Description("Axis shape register")] - public byte RshapeAxis { get; set; } - - [DisplayName("keep_dims")] - [Description("Keep dimensions")] - public bool KeepDims { get; set; } - } - - [DisplayName("TENSOR.REDUCE_ARG")] - [Category("Tensor Instructions")] - [Description("ReduceArg")] - public class ReduceArgInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.REDUCE_ARG; - - [DisplayName("datatype_src")] - [Description("Input datatype")] - public DataType DataTypeSrc { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("datatype_dest")] - [Description("Output datatype")] - public DataType DataTypeDest { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("reduce_arg_op")] - [Description("Reduce arg 
operator")] - public ReduceArgOp ReduceArgOp { get; set; } - - [DisplayName("rshape_axis")] - [Description("Axis shape register")] - public byte RshapeAxis { get; set; } - - [DisplayName("keep_dims")] - [Description("Keep dimensions")] - public bool KeepDims { get; set; } - - [DisplayName("select_last_idx")] - [Description("select last index")] - public bool SelectLastIdx { get; set; } - } - - [DisplayName("TENSOR.REDUCE_PROD")] - [Category("Tensor Instructions")] - [Description("ReduceProd")] - public class ReduceProdInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.REDUCE_PROD; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rshape_axes")] - [Description("Axes shape register")] - public byte RshapeAxes { get; set; } - - [DisplayName("keep_dims")] - [Description("Keep dimensions")] - public bool KeepDims { get; set; } - } - - [DisplayName("TENSOR.REDUCE_WINDOW2D")] - [Category("Tensor Instructions")] - [Description("REDUCE_WINDOW2D")] - public class ReduceWindow2DInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.REDUCE_WINDOW2D; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("reduce_op")] - [Description("Reduce operator")] - public ReduceOp ReduceOp { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - 
[DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("filter_h")] - [Description("FilterH")] - public ushort FilterH { get; set; } - - [DisplayName("filter_w")] - [Description("FilterW")] - public ushort FilterW { get; set; } - - [DisplayName("stride_h")] - [Description("StrideH")] - public ushort StrideH { get; set; } - - [DisplayName("stride_w")] - [Description("StrideW")] - public ushort StrideW { get; set; } - - [DisplayName("dilation_h")] - [Description("DilationH")] - public ushort DilationH { get; set; } - - [DisplayName("dilation_w")] - [Description("DilationW")] - public ushort DilationW { get; set; } - - [DisplayName("fused_clamp_low")] - [Description("FusedClampLow")] - public float FusedClampLow { get; set; } - - [DisplayName("fused_clamp_high")] - [Description("FusedClampHigh")] - public float FusedClampHigh { get; set; } - } - - [DisplayName("TENSOR.RESIZE_IMAGE")] - [Category("Tensor Instructions")] - [Description("RESIZE_IMAGE")] - public class ResizeImageInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.RESIZE_IMAGE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("align_corners")] - [Description("Align Corners")] - public bool AlignCorners { get; set; } - - [DisplayName("half_pixel_centers")] - [Description("Half Pixel Centers")] - public bool HalfPixelCenters { get; set; } - - [DisplayName("image_resize_mode")] - [Description("Image Resize Mode")] - public ImageResizeMode ImageResizeMode { get; set; } - } - - 
[DisplayName("TENSOR.ROI_ALIGN")] - [Category("Tensor Instructions")] - [Description("RoiAlign")] - public class RoiAlignInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.ROI_ALIGN; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rshape_dest")] - [Description("Dest shape register")] - public byte RshapeDest { get; set; } - - [DisplayName("mode")] - [Description("Mode")] - public RoiAlignMode mode { get; set; } - - [DisplayName("spatial_scale")] - [Description("Spatial Scale")] - public float SpatialScale { get; set; } - - [DisplayName("sampling_ratio")] - [Description("Sampling Ratio")] - public long SamplingRatio { get; set; } - } - - [DisplayName("TENSOR.SIGMOID")] - [Category("Tensor Instructions")] - [Description("Sigmoid")] - public class SigmoidInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.SIGMOID; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.SLICE")] - [Category("Tensor Instructions")] - [Description("Slice")] - public class SliceInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.SLICE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - 
[Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("rbegins")] - [Description("Begins shape register")] - public byte Rbegins { get; set; } - - [DisplayName("rends")] - [Description("Ends shape register")] - public byte Rends { get; set; } - - [DisplayName("rstrides")] - [Description("Strides shape register")] - public byte Strides { get; set; } - } - - [DisplayName("TENSOR.SOFTMAX")] - [Category("Tensor Instructions")] - [Description("Softmax")] - public class SoftmaxInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.SOFTMAX; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("axis")] - [Description("Axis")] - public int Axis { get; set; } - - [DisplayName("beta")] - [Description("Beta")] - public float Beta { get; set; } - } - - [DisplayName("TENSOR.TERNARY")] - [Category("Tensor Instructions")] - [Description("Ternary")] - public class TernaryInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.TERNARY; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src1")] - [Description("Source1 shape register")] - public byte RshapeSrc1 { get; set; } - - [DisplayName("rstride_src1")] - [Description("Source1 stride register")] - public byte RstrideSrc1 { get; set; } - - [DisplayName("rshape_src2")] - [Description("Source2 shape register")] public byte RshapeSrc2 
{ get; set; } - [DisplayName("rstride_src2")] - [Description("Source2 stride register")] - public byte RstrideSrc2 { get; set; } - - [DisplayName("rshape_src3")] - [Description("Source3 shape register")] - public byte RshapeSrc3 { get; set; } - - [DisplayName("rstride_src3")] - [Description("Source3 stride register")] - public byte RstrideSrc3 { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - } - - [DisplayName("TENSOR.TOPK")] - [Category("Tensor Instructions")] - [Description("Topk")] - public class TopKInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.TOPK; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rshape_dest1")] - [Description("Dest1 shape register")] - public byte RshapeDest1 { get; set; } - - [DisplayName("rstride_dest1")] - [Description("Dest1 stride register")] - public byte RstrideDest1 { get; set; } - - [DisplayName("rshape_dest2")] - [Description("Dest2 shape register")] - public byte RshapeDest2 { get; set; } - - [DisplayName("rstride_dest2")] - [Description("Dest2 stride register")] - public byte RstrideDest2 { get; set; } - - [DisplayName("k")] - [Description("K")] - public long K { get; set; } - [DisplayName("axis")] [Description("Axis")] public int Axis { get; set; } - - [DisplayName("largest")] - [Description("Largest")] - public bool Largest { get; set; } - - [DisplayName("sorted")] - [Description("Sorted")] - public bool Sorted { get; set; } - } - - [DisplayName("TENSOR.TRILU")] - [Category("Tensor Instructions")] - [Description("Trilu")] - public class TriluInstruction : TensorInstruction - { - public override TensorFunction 
Function => TensorFunction.TRILU; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("upper")] - [Description("Upper")] - public bool Upper { get; set; } - - [DisplayName("k")] - [Description("K")] - public long K { get; set; } } - [DisplayName("TENSOR.UNARY")] + [DisplayName("TENSOR.INSTANCE_NORMALIZATION")] [Category("Tensor Instructions")] - [Description("Unary")] - public class UnaryInstruction : TensorInstruction + [Description("INSTANCE_NORMALIZATION")] + public class InstanceNormInstruction : TensorInstruction { - public override TensorFunction Function => TensorFunction.UNARY; + public override TensorFunction Function => TensorFunction.INSTANCE_NORMALIZATION; [DisplayName("datatype")] [Description("Datatype")] public DataType DataType { get; set; } - [DisplayName("rshape_src")] - [Description("Source1 shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - [Description("Dest stride register")] - public byte RstrideDest { get; set; } - - [DisplayName("unary_op")] - [Description("Unary operator")] - public UnaryOp UnaryOp { get; set; } - } - - [DisplayName("TENSOR.TRANSPOSE")] - [Category("Tensor Instructions")] - [Description("Transpose")] - public class TransposeInstruction : TensorInstruction - { - public override TensorFunction Function => TensorFunction.TRANSPOSE; - - [DisplayName("datatype")] - [Description("Datatype")] - public DataType DataType { get; set; } - - [DisplayName("rshape_src")] - [Description("Source shape register")] - public byte RshapeSrc { get; set; } - - [DisplayName("rstride_src")] - [Description("Source stride register")] - public byte RstrideSrc { get; set; } - - [DisplayName("rstride_dest")] - 
[Description("Dest stride register")] - public byte RstrideDest { get; set; } + [DisplayName("input_shape")] + [Description("input_shape")] + public byte input_shape { get; set; } - [DisplayName("rshape_perm")] - [Description("Perm shape register")] - public byte RshapePerm { get; set; } + [DisplayName("epsilon")] + [Description("epsilon")] + public float epsilon { get; set; } } - } + } } diff --git a/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor b/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor index 78a5277698..809cbf766a 100644 --- a/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor +++ b/tools/stackvm_gen/IsaGen/Templates/op_reader_cpp.razor @@ -78,10 +78,9 @@ result op_visitor::visit(gsl::span text) noexcept while (!interrupted_ && !reader_.empty()) try_(next()); - #ifdef ENABLE_OP_PROFILE - op_profile profile_time; - profile_time.print(); - #endif +#ifdef ENABLE_OP_PROFILE + op_profile::print(); +#endif return ok(); }