diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 0a40e104..97c017ba 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,9 +1,9 @@
name: release
-on: pull_request
-# on:
-# push:
-# tags:
-# - '*'
+# on: pull_request
+on:
+ push:
+ tags:
+ - '*'
env:
DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer
@@ -1247,21 +1247,21 @@ jobs:
name: ${{ env.PACKAGE_NAME }}
path: ${{ env.PACKAGE_NAME }}.zip
- # release:
- # permissions:
- # contents: write # for softprops/action-gh-release to create a release
- # needs: [setup, android, ios, ios-simulator, armlinux, macos, mac-catalyst, windows, ubuntu, webassembly, apple, devboard]
- # runs-on: ubuntu-latest
- # steps:
- # - name: download
- # uses: actions/download-artifact@v4
- # with:
- # path: artifacts
- #
- # - name: create-release
- # uses: softprops/action-gh-release@v1
- # with:
- # token: ${{ secrets.GITHUB_TOKEN }}
- # tag_name: ${{ needs.setup.outputs.VERSION }}
- # name: Release ${{ needs.setup.outputs.VERSION }}
- # files: artifacts/*/*.zip
+ release:
+ permissions:
+ contents: write # for softprops/action-gh-release to create a release
+ needs: [setup, android, ios, ios-simulator, armlinux, macos, mac-catalyst, windows, ubuntu, webassembly, apple, devboard]
+ runs-on: ubuntu-latest
+ steps:
+ - name: download
+ uses: actions/download-artifact@v4
+ with:
+ path: artifacts
+
+ - name: create-release
+ uses: softprops/action-gh-release@v1
+ with:
+ token: ${{ secrets.GITHUB_TOKEN }}
+ tag_name: ${{ needs.setup.outputs.VERSION }}
+ name: Release ${{ needs.setup.outputs.VERSION }}
+ files: artifacts/*/*.zip
diff --git a/README.md b/README.md
index 7e60a936..f024f384 100644
--- a/README.md
+++ b/README.md
@@ -331,8 +331,8 @@ https://github.com/nihui/opencv-mobile/releases/latest
tinyvision
arm-linux-uclibcgnueabihf
- ❌ HW JPG decoder (WIP)
- ❌ HW JPG encoder (WIP)
+ ✅ HW JPG decoder
+ ✅ HW JPG encoder
✅ MIPI CSI camera
diff --git a/highgui/CMakeLists.txt b/highgui/CMakeLists.txt
index 663e7749..694c3212 100644
--- a/highgui/CMakeLists.txt
+++ b/highgui/CMakeLists.txt
@@ -7,8 +7,11 @@ set(highgui_srcs
${CMAKE_CURRENT_LIST_DIR}/src/capture_v4l2_rk_aiq.cpp
${CMAKE_CURRENT_LIST_DIR}/src/exif.cpp
${CMAKE_CURRENT_LIST_DIR}/src/highgui.cpp
+ ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_aw.cpp
${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_cvi.cpp
+ ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_encoder_aw.cpp
${CMAKE_CURRENT_LIST_DIR}/src/jpeg_encoder_rk_mpp.cpp
+ ${CMAKE_CURRENT_LIST_DIR}/src/kanna_rotate.cpp
${CMAKE_CURRENT_LIST_DIR}/src/videocapture.cpp
)
diff --git a/highgui/src/highgui.cpp b/highgui/src/highgui.cpp
index 85884c5f..34b6478c 100644
--- a/highgui/src/highgui.cpp
+++ b/highgui/src/highgui.cpp
@@ -40,7 +40,9 @@
#include "stb_image_write.h"
#if defined __linux__
+#include "jpeg_decoder_aw.h"
#include "jpeg_decoder_cvi.h"
+#include "jpeg_encoder_aw.h"
#include "jpeg_encoder_rk_mpp.h"
#endif
@@ -154,6 +156,36 @@ Mat imread(const String& filename, int flags)
if (buf_size > 4 && buf_data[0] == 0xFF && buf_data[1] == 0xD8)
{
// jpg magic
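+ // try the Allwinner CedarC hardware decoder first; any failure falls through to stbi_load_from_memory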
+ if (jpeg_decoder_aw::supported(buf_data, buf_size))
+ {
+ int w = 0;
+ int h = 0;
+ int c = desired_channels;
+
+ jpeg_decoder_aw d;
+ int ret = d.init(buf_data, buf_size, &w, &h, &c);
+ if (ret == 0 && (c == 1 || c == 3))
+ {
+ Mat img;
+ if (c == 1)
+ {
+ img.create(h, w, CV_8UC1);
+ }
+ else // if (c == 3)
+ {
+ img.create(h, w, CV_8UC3);
+ }
+
+ ret = d.decode(buf_data, buf_size, img.data);
+ if (ret == 0)
+ {
+ d.deinit();
+ return img;
+ }
+ }
+
+ // fallback to stbi_load_from_memory
+ }
if (jpeg_decoder_cvi::supported(buf_data, buf_size))
{
int w = 0;
@@ -289,6 +321,38 @@ bool imwrite(const String& filename, InputArray _img, const std::vector<int>& pa
#if defined __linux__
if (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG")
{
+ if (jpeg_encoder_aw::supported(img.cols, img.rows, c))
+ {
+ // anything to bgr
+ if (!img.isContinuous())
+ {
+ img = img.clone();
+ }
+
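+ // pick up IMWRITE_JPEG_QUALITY from params, defaulting to 95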
+ int quality = 95;
+ for (size_t i = 0; i < params.size(); i += 2)
+ {
+ if (params[i] == IMWRITE_JPEG_QUALITY)
+ {
+ quality = params[i + 1];
+ break;
+ }
+ }
+
+ jpeg_encoder_aw e;
+ int ret = e.init(img.cols, img.rows, c, quality);
+ if (ret == 0)
+ {
+ ret = e.encode(img.data, filename.c_str());
+ if (ret == 0)
+ {
+ e.deinit();
+ return true;
+ }
+ }
+
+ // fallback to stb_image_write
+ }
if (jpeg_encoder_rk_mpp::supported(img.cols, img.rows, c))
{
// anything to bgr
@@ -410,6 +474,36 @@ Mat imdecode(InputArray _buf, int flags)
if (buf_size > 4 && buf_data[0] == 0xFF && buf_data[1] == 0xD8)
{
// jpg magic
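+ // try the Allwinner CedarC hardware decoder first; any failure falls through to stbi_load_from_memory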
+ if (jpeg_decoder_aw::supported(buf_data, buf_size))
+ {
+ int w = 0;
+ int h = 0;
+ int c = desired_channels;
+
+ jpeg_decoder_aw d;
+ int ret = d.init(buf_data, buf_size, &w, &h, &c);
+ if (ret == 0 && (c == 1 || c == 3))
+ {
+ Mat img;
+ if (c == 1)
+ {
+ img.create(h, w, CV_8UC1);
+ }
+ else // if (c == 3)
+ {
+ img.create(h, w, CV_8UC3);
+ }
+
+ ret = d.decode(buf_data, buf_size, img.data);
+ if (ret == 0)
+ {
+ d.deinit();
+ return img;
+ }
+ }
+
+ // fallback to stbi_load_from_memory
+ }
if (jpeg_decoder_cvi::supported(buf_data, buf_size))
{
int w = 0;
@@ -543,6 +637,38 @@ bool imencode(const String& ext, InputArray _img, std::vector<uchar>& buf, const
#if defined __linux__
if (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG")
{
+ if (jpeg_encoder_aw::supported(img.cols, img.rows, c))
+ {
+ // anything to bgr
+ if (!img.isContinuous())
+ {
+ img = img.clone();
+ }
+
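+ // pick up IMWRITE_JPEG_QUALITY from params, defaulting to 95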
+ int quality = 95;
+ for (size_t i = 0; i < params.size(); i += 2)
+ {
+ if (params[i] == IMWRITE_JPEG_QUALITY)
+ {
+ quality = params[i + 1];
+ break;
+ }
+ }
+
+ jpeg_encoder_aw e;
+ int ret = e.init(img.cols, img.rows, c, quality);
+ if (ret == 0)
+ {
+ ret = e.encode(img.data, buf);
+ if (ret == 0)
+ {
+ e.deinit();
+ return true;
+ }
+ }
+
+ // fallback to stb_image_write
+ }
if (jpeg_encoder_rk_mpp::supported(img.cols, img.rows, c))
{
// anything to bgr
diff --git a/highgui/src/jpeg_decoder_aw.cpp b/highgui/src/jpeg_decoder_aw.cpp
new file mode 100644
index 00000000..fda0f1d0
--- /dev/null
+++ b/highgui/src/jpeg_decoder_aw.cpp
@@ -0,0 +1,1341 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "jpeg_decoder_aw.h"
+
+#if defined __linux__
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#include "exif.hpp"
+#include "kanna_rotate.h"
+
+// 0 = unknown
+// 1 = t113-i
+// 2 = tinyvision
+static int get_device_model()
+{
+ static int device_model = -1;
+
+ if (device_model >= 0)
+ return device_model;
+
+ device_model = 0;
+
+ FILE* fp = fopen("/proc/device-tree/model", "rb");
+ if (fp)
+ {
char buf[1024] = {0};
fgets(buf, 1024, fp);
+ fclose(fp);
+
+ if (strncmp(buf, "sun8iw20", 8) == 0)
+ {
+ // t113-i
+ device_model = 1;
+ }
+ if (strncmp(buf, "sun8iw21", 8) == 0)
+ {
+ // tinyvision
+ device_model = 2;
+ }
+ }
+
+ return device_model;
+}
+
+static bool is_device_whitelisted()
+{
+ const int device_model = get_device_model();
+
+ if (device_model == 1)
+ {
+ // t113-i
+ return true;
+ }
+ if (device_model == 2)
+ {
+ // tinyvision
+ return true;
+ }
+
+ return false;
+}
+
+extern "C" {
+
+typedef void (*PFN_AddVDPlugin)();
+typedef void (*PFN_AddVDPluginSingle)(const char* lib);
+
+}
+
+static void* libcdc_base = 0;
+static void* libvideoengine = 0;
+
+static PFN_AddVDPlugin AddVDPlugin = 0;
+static PFN_AddVDPluginSingle AddVDPluginSingle = 0;
+
+static int unload_videoengine_library()
+{
+ if (libcdc_base)
+ {
+ dlclose(libcdc_base);
+ libcdc_base = 0;
+ }
+
+ if (libvideoengine)
+ {
+ dlclose(libvideoengine);
+ libvideoengine = 0;
+ }
+
+ AddVDPlugin = 0;
+ AddVDPluginSingle = 0;
+
+ return 0;
+}
+
+static int load_videoengine_library()
+{
+ if (libvideoengine)
+ return 0;
+
+ // check device whitelist
+ bool whitelisted = is_device_whitelisted();
+ if (!whitelisted)
+ {
+ fprintf(stderr, "this device is not whitelisted for jpeg decoder aw cedarc\n");
+ return -1;
+ }
+
+ libcdc_base = dlopen("libcdc_base.so", RTLD_GLOBAL | RTLD_LAZY);
+ if (!libcdc_base)
+ {
+ libcdc_base = dlopen("/usr/lib/libcdc_base.so", RTLD_GLOBAL | RTLD_LAZY);
+ }
+ if (!libcdc_base)
+ {
+ goto OUT;
+ }
+
+ libvideoengine = dlopen("libvideoengine.so", RTLD_LOCAL | RTLD_NOW);
+ if (!libvideoengine)
+ {
+ libvideoengine = dlopen("/usr/lib/libvideoengine.so", RTLD_LOCAL | RTLD_NOW);
+ }
+ if (!libvideoengine)
+ {
+ goto OUT;
+ }
+
+ AddVDPlugin = (PFN_AddVDPlugin)dlsym(libvideoengine, "AddVDPlugin");
+ AddVDPluginSingle = (PFN_AddVDPluginSingle)dlsym(libvideoengine, "AddVDPluginSingle");
+
+ return 0;
+
+OUT:
+ unload_videoengine_library();
+
+ return -1;
+}
+
+class videoengine_library_loader
+{
+public:
+ bool ready;
+
+ videoengine_library_loader()
+ {
+ ready = (load_videoengine_library() == 0);
+
+ if (libvideoengine)
+ {
+ // AddVDPlugin();
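+ // register only the hardware MJPEG plugin rather than every decoder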
+ AddVDPluginSingle("/usr/lib/libawmjpeg.so");
+ }
+ }
+
+ ~videoengine_library_loader()
+ {
+ unload_videoengine_library();
+ }
+};
+
+static videoengine_library_loader videoengine;
+
+
+extern "C" {
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+
+#if (INTPTR_MAX == INT64_MAX)
+ typedef unsigned long u64;
+#else
+ typedef unsigned long long u64;
+#endif
+
+typedef signed char s8;
+typedef signed short s16;
+typedef signed int s32;
+
+#if (INTPTR_MAX == INT64_MAX)
+ typedef signed long s64;
+#else
+ typedef signed long long s64;
+#endif
+
+typedef uintptr_t size_addr;
+
+struct VeOpsS;
+struct ScMemOpsS;
+
+enum EVIDEOCODECFORMAT
+{
+ VIDEO_CODEC_FORMAT_UNKNOWN = 0,
+ VIDEO_CODEC_FORMAT_MJPEG = 0x101,
+ VIDEO_CODEC_FORMAT_MPEG1 = 0x102,
+ VIDEO_CODEC_FORMAT_MPEG2 = 0x103,
+ VIDEO_CODEC_FORMAT_MPEG4 = 0x104,
+ VIDEO_CODEC_FORMAT_MSMPEG4V1 = 0x105,
+ VIDEO_CODEC_FORMAT_MSMPEG4V2 = 0x106,
+ VIDEO_CODEC_FORMAT_DIVX3 = 0x107, //* not support
+ VIDEO_CODEC_FORMAT_DIVX4 = 0x108, //* not support
+ VIDEO_CODEC_FORMAT_DIVX5 = 0x109, //* not support
+ VIDEO_CODEC_FORMAT_XVID = 0x10a,
+ VIDEO_CODEC_FORMAT_H263 = 0x10b,
+ VIDEO_CODEC_FORMAT_SORENSSON_H263 = 0x10c,
+ VIDEO_CODEC_FORMAT_RXG2 = 0x10d,
+ VIDEO_CODEC_FORMAT_WMV1 = 0x10e,
+ VIDEO_CODEC_FORMAT_WMV2 = 0x10f,
+ VIDEO_CODEC_FORMAT_WMV3 = 0x110,
+ VIDEO_CODEC_FORMAT_VP6 = 0x111,
+ VIDEO_CODEC_FORMAT_VP8 = 0x112,
+ VIDEO_CODEC_FORMAT_VP9 = 0x113,
+ VIDEO_CODEC_FORMAT_RX = 0x114,
+ VIDEO_CODEC_FORMAT_H264 = 0x115,
+ VIDEO_CODEC_FORMAT_H265 = 0x116,
+ VIDEO_CODEC_FORMAT_AVS = 0x117,
+ VIDEO_CODEC_FORMAT_AVS2 = 0x118,
+
+ VIDEO_CODEC_FORMAT_MAX = VIDEO_CODEC_FORMAT_AVS2,
+ VIDEO_CODEC_FORMAT_MIN = VIDEO_CODEC_FORMAT_MJPEG,
+};
+
+enum EPIXELFORMAT
+{
+ PIXEL_FORMAT_DEFAULT = 0,
+
+ PIXEL_FORMAT_YUV_PLANER_420 = 1,
+ PIXEL_FORMAT_YUV_PLANER_422 = 2,
+ PIXEL_FORMAT_YUV_PLANER_444 = 3,
+
+ PIXEL_FORMAT_YV12 = 4,
+ PIXEL_FORMAT_NV21 = 5,
+ PIXEL_FORMAT_NV12 = 6,
+ PIXEL_FORMAT_YUV_MB32_420 = 7,
+ PIXEL_FORMAT_YUV_MB32_422 = 8,
+ PIXEL_FORMAT_YUV_MB32_444 = 9,
+
+ PIXEL_FORMAT_RGBA = 10,
+ PIXEL_FORMAT_ARGB = 11,
+ PIXEL_FORMAT_ABGR = 12,
+ PIXEL_FORMAT_BGRA = 13,
+
+ PIXEL_FORMAT_YUYV = 14,
+ PIXEL_FORMAT_YVYU = 15,
+ PIXEL_FORMAT_UYVY = 16,
+ PIXEL_FORMAT_VYUY = 17,
+
+ PIXEL_FORMAT_PLANARUV_422 = 18,
+ PIXEL_FORMAT_PLANARVU_422 = 19,
+ PIXEL_FORMAT_PLANARUV_444 = 20,
+ PIXEL_FORMAT_PLANARVU_444 = 21,
+ PIXEL_FORMAT_P010_UV = 22,
+ PIXEL_FORMAT_P010_VU = 23,
+
+ PIXEL_FORMAT_MIN = PIXEL_FORMAT_DEFAULT,
+ PIXEL_FORMAT_MAX = PIXEL_FORMAT_PLANARVU_444,
+};
+
+typedef enum CONTROL_AFBC_MODE {
+ DISABLE_AFBC_ALL_SIZE = 0,
+ ENABLE_AFBC_JUST_BIG_SIZE = 1, //* >= 4k
+ ENABLE_AFBC_ALL_SIZE = 2,
+}eControlAfbcMode;
+
+typedef enum CONTROL_IPTV_MODE {
+ DISABLE_IPTV_ALL_SIZE = 0,
+ ENABLE_IPTV_JUST_SMALL_SIZE = 1, //* < 4k
+ ENABLE_IPTV_ALL_SIZE = 2,
+}eControlIptvMode;
+
+typedef enum COMMON_CONFIG_FLAG
+{
+ IS_MIRACAST_STREAM = 1,
+
+}eCommonConfigFlag;
+
+enum EVDECODERESULT
+{
+ VDECODE_RESULT_UNSUPPORTED = -1,
+ VDECODE_RESULT_OK = 0,
+ VDECODE_RESULT_FRAME_DECODED = 1,
+ VDECODE_RESULT_CONTINUE = 2,
+ VDECODE_RESULT_KEYFRAME_DECODED = 3,
+ VDECODE_RESULT_NO_FRAME_BUFFER = 4,
+ VDECODE_RESULT_NO_BITSTREAM = 5,
+ VDECODE_RESULT_RESOLUTION_CHANGE = 6,
+
+ VDECODE_RESULT_MIN = VDECODE_RESULT_UNSUPPORTED,
+ VDECODE_RESULT_MAX = VDECODE_RESULT_RESOLUTION_CHANGE,
+};
+
+typedef struct VIDEOSTREAMINFO
+{
+ int eCodecFormat;
+ int nWidth;
+ int nHeight;
+ int nFrameRate;
+ int nFrameDuration;
+ int nAspectRatio;
+ int bIs3DStream;
+ int nCodecSpecificDataLen;
+ char* pCodecSpecificData;
+ int bSecureStreamFlag;
+ int bSecureStreamFlagLevel1;
+ int bIsFramePackage; /* 1: frame package; 0: stream package */
+ int h265ReferencePictureNum;
+ int bReOpenEngine;
+ int bIsFrameCtsTestFlag;
+}VideoStreamInfo;
+
+typedef struct VCONFIG
+{
+ int bScaleDownEn;
+ int bRotationEn;
+ int bSecOutputEn;
+ int nHorizonScaleDownRatio;
+ int nVerticalScaleDownRatio;
+ int nSDWidth;
+ int nSDHeight;
+ int bAnySizeSD;
+ int nSecHorizonScaleDownRatio;
+ int nSecVerticalScaleDownRatio;
+ int nRotateDegree;
+ int bThumbnailMode;
+ int eOutputPixelFormat;
+ int eSecOutputPixelFormat;
+ int bNoBFrames;
+ int bDisable3D;
+ int bSupportMaf; //not used
+ int bDispErrorFrame;
+ int nVbvBufferSize;
+ int nFrameBufferNum;
+ int bSecureosEn;
+ int bGpuBufValid;
+ int nAlignStride;
+ int bIsSoftDecoderFlag;
+ int bVirMallocSbm;
+ int bSupportPallocBufBeforeDecode;
+ //only used for xuqi, set this flag to 1 meaning palloc the fbm buffer before
+ // decoding the sequence, to shorten the first-frame decoding time
+ int nDeInterlaceHoldingFrameBufferNum;
+ int nDisplayHoldingFrameBufferNum;
+ int nRotateHoldingFrameBufferNum;
+ int nDecodeSmoothFrameBufferNum;
+ int bIsTvStream;
+ int nLbcLossyComMod; //1:1.5x; 2:2x; 3:2.5x;
+ unsigned int bIsLossy; //lossy compression or not
+ unsigned int bRcEn; //compact storage or not
+
+ struct ScMemOpsS *memops;
+ eControlAfbcMode eCtlAfbcMode;
+ eControlIptvMode eCtlIptvMode;
+
+ VeOpsS* veOpsS;
+ void* pVeOpsSelf;
+ int bConvertVp910bitTo8bit;
+ unsigned int nVeFreq;
+
+ int bCalledByOmxFlag;
+
+ int bSetProcInfoEnable; //* for checking the decoder info by cat device-node
+ int nSetProcInfoFreq;
+ int nChannelNum;
+ int nSupportMaxWidth; //the max width of mjpeg continue decode
+ int nSupportMaxHeight; //the max height of mjpeg continue decode
+ eCommonConfigFlag commonConfigFlag;
+ int bATMFlag;
+}VConfig;
+
+typedef enum eVeLbcMode
+{
+ LBC_MODE_DISABLE = 0,
+ LBC_MODE_1_5X = 1,
+ LBC_MODE_2_0X = 2,
+ LBC_MODE_2_5X = 3,
+ LBC_MODE_NO_LOSSY = 4,
+}eVeLbcMode;
+
+typedef struct VCONFIG_v85x
+{
+ int bScaleDownEn;
+ int bRotationEn;
+ int bSecOutputEn;
+ int nHorizonScaleDownRatio;
+ int nVerticalScaleDownRatio;
+ int nSecHorizonScaleDownRatio;
+ int nSecVerticalScaleDownRatio;
+ int nRotateDegree;
+ int bThumbnailMode;
+ int eOutputPixelFormat;
+ int eSecOutputPixelFormat;
+ int bNoBFrames;
+ int bDisable3D;
+ int bSupportMaf; //not used
+ int bDispErrorFrame;
+ int nVbvBufferSize;
+ int nFrameBufferNum;
+ int bSecureosEn;
+ int bGpuBufValid;
+ int nAlignStride;
+ int bIsSoftDecoderFlag;
+ int bVirMallocSbm;
+ int bSupportPallocBufBeforeDecode;
+ //only used for xuqi, set this flag to 1 meaning palloc the fbm buffer before
+ // decoding the sequence, to shorten the first-frame decoding time
+ int nDeInterlaceHoldingFrameBufferNum;
+ int nDisplayHoldingFrameBufferNum;
+ int nRotateHoldingFrameBufferNum;
+ int nDecodeSmoothFrameBufferNum;
+ int bIsTvStream;
+ eVeLbcMode nLbcLossyComMod;//1:1.5x; 2:2x; 3:2.5x;
+
+ struct ScMemOpsS *memops;
+ eControlAfbcMode eCtlAfbcMode;
+ eControlIptvMode eCtlIptvMode;
+
+ VeOpsS* veOpsS;
+ void* pVeOpsSelf;
+ int bConvertVp910bitTo8bit;
+ unsigned int nVeFreq;
+
+ int bCalledByOmxFlag;
+
+ int bSetProcInfoEnable; //* for checking the decoder info by cat device-node
+ int nSetProcInfoFreq;
+ int nChannelNum;
+ int nSupportMaxWidth; //the max width of mjpeg continue decode
+ int nSupportMaxHeight; //the max height of mjpeg continue decode
+
+ unsigned int bIsLossy; //lossy compression or not
+ unsigned int bRcEn; //compact storage or not
+}VConfig_v85x;
+
+typedef struct VIDEOSTREAMDATAINFO
+{
+ char* pData;
+ int nLength;
+ int64_t nPts;
+ int64_t nPcr;
+ int bIsFirstPart;
+ int bIsLastPart;
+ int nID;
+ int nStreamIndex;
+ int bValid;
+ unsigned int bVideoInfoFlag;
+ void* pVideoInfo;
+}VideoStreamDataInfo;
+
+typedef enum VIDEO_TRANSFER
+{
+ VIDEO_TRANSFER_RESERVED_0 = 0,
+ VIDEO_TRANSFER_BT1361 = 1,
+ VIDEO_TRANSFER_UNSPECIFIED = 2,
+ VIDEO_TRANSFER_RESERVED_1 = 3,
+ VIDEO_TRANSFER_GAMMA2_2 = 4,
+ VIDEO_TRANSFER_GAMMA2_8 = 5,
+ VIDEO_TRANSFER_SMPTE_170M = 6,
+ VIDEO_TRANSFER_SMPTE_240M = 7,
+ VIDEO_TRANSFER_LINEAR = 8,
+ VIDEO_TRANSFER_LOGARITHMIC_0 = 9,
+ VIDEO_TRANSFER_LOGARITHMIC_1 = 10,
+ VIDEO_TRANSFER_IEC61966 = 11,
+ VIDEO_TRANSFER_BT1361_EXTENDED = 12,
+ VIDEO_TRANSFER_SRGB = 13,
+ VIDEO_TRANSFER_BT2020_0 = 14,
+ VIDEO_TRANSFER_BT2020_1 = 15,
+ VIDEO_TRANSFER_ST2084 = 16,
+ VIDEO_TRANSFER_ST428_1 = 17,
+ VIDEO_TRANSFER_HLG = 18,
+ VIDEO_TRANSFER_RESERVED = 19, //* 19~255
+}VIDEO_TRANSFER;
+
+typedef enum VIDEO_MATRIX_COEFFS
+{
+ VIDEO_MATRIX_COEFFS_IDENTITY = 0,
+ VIDEO_MATRIX_COEFFS_BT709 = 1,
+ VIDEO_MATRIX_COEFFS_UNSPECIFIED_0 = 2,
+ VIDEO_MATRIX_COEFFS_RESERVED_0 = 3,
+ VIDEO_MATRIX_COEFFS_BT470M = 4,
+ VIDEO_MATRIX_COEFFS_BT601_625_0 = 5,
+ VIDEO_MATRIX_COEFFS_BT601_625_1 = 6,
+ VIDEO_MATRIX_COEFFS_SMPTE_240M = 7,
+ VIDEO_MATRIX_COEFFS_YCGCO = 8,
+ VIDEO_MATRIX_COEFFS_BT2020 = 9,
+ VIDEO_MATRIX_COEFFS_BT2020_CONSTANT_LUMINANCE = 10,
+ VIDEO_MATRIX_COEFFS_SOMPATE = 11,
+ VIDEO_MATRIX_COEFFS_CD_NON_CONSTANT_LUMINANCE = 12,
+ VIDEO_MATRIX_COEFFS_CD_CONSTANT_LUMINANCE = 13,
+ VIDEO_MATRIX_COEFFS_BTICC = 14,
+ VIDEO_MATRIX_COEFFS_RESERVED = 15, //* 15~255
+}VIDEO_MATRIX_COEFFS;
+
+typedef enum VIDEO_FULL_RANGE_FLAG
+{
+ VIDEO_FULL_RANGE_LIMITED = 0,
+ VIDEO_FULL_RANGE_FULL = 1,
+}VIDEO_FULL_RANGE_FLAG;
+
+typedef struct VIDEO_FRM_MV_INFO
+{
+ s16 nMaxMv_x;
+ s16 nMinMv_x;
+ s16 nAvgMv_x;
+ s16 nMaxMv_y;
+ s16 nMinMv_y;
+ s16 nAvgMv_y;
+ s16 nMaxMv;
+ s16 nMinMv;
+ s16 nAvgMv;
+ s16 SkipRatio;
+}VIDEO_FRM_MV_INFO;
+
+typedef enum VID_FRAME_TYPE
+{
+ VIDEO_FORMAT_TYPE_UNKONWN = 0,
+ VIDEO_FORMAT_TYPE_I,
+ VIDEO_FORMAT_TYPE_P,
+ VIDEO_FORMAT_TYPE_B,
+ VIDEO_FORMAT_TYPE_IDR,
+ VIDEO_FORMAT_TYPE_BUTT,
+}VID_FRAME_TYPE;
+
+typedef struct VIDEO_FRM_STATUS_INFO
+{
+ VID_FRAME_TYPE enVidFrmType;
+ int nVidFrmSize;
+ int nVidFrmDisW;
+ int nVidFrmDisH;
+ int nVidFrmQP;
+ double nAverBitRate;
+ double nFrameRate;
+ int64_t nVidFrmPTS;
+ VIDEO_FRM_MV_INFO nMvInfo;
+ int bDropPreFrame;
+}VIDEO_FRM_STATUS_INFO;
+
+typedef struct VIDEOPICTURE
+{
+ int nID;
+ int nStreamIndex;
+ int ePixelFormat;
+ int nWidth;
+ int nHeight;
+ int nLineStride;
+ int nTopOffset;
+ int nLeftOffset;
+ int nBottomOffset;
+ int nRightOffset;
+ int nFrameRate;
+ int nAspectRatio;
+ int bIsProgressive;
+ int bTopFieldFirst;
+ int bRepeatTopField;
+ int64_t nPts;
+ int64_t nPcr;
+ char* pData0;
+ char* pData1;
+ char* pData2;
+ char* pData3;
+ int bMafValid;
+ char* pMafData;
+ int nMafFlagStride;
+ int bPreFrmValid;
+ int nBufId;
+ size_addr phyYBufAddr;
+ size_addr phyCBufAddr;
+ void* pPrivate;
+ int nBufFd;
+ int nBufStatus;
+ int bTopFieldError;
+ int bBottomFieldError;
+ int nColorPrimary; // default value is 0xffffffff, valid value is 0x0000xxyy
+ // xx: is video full range code
+ // yy: is matrix coefficient
+ int bFrameErrorFlag;
+
+ //* to save hdr info and afbc header info
+ void* pMetaData;
+
+ //*display related parameter
+ VIDEO_FULL_RANGE_FLAG video_full_range_flag;
+ VIDEO_TRANSFER transfer_characteristics;
+ VIDEO_MATRIX_COEFFS matrix_coeffs;
+ u8 colour_primaries;
+ //*end of display related parameter defined
+ //size_addr nLower2BitPhyAddr;
+ int nLower2BitBufSize;
+ int nLower2BitBufOffset;
+ int nLower2BitBufStride;
+ int b10BitPicFlag;
+ int bEnableAfbcFlag;
+ int nLbcLossyComMod;//1:1.5x; 2:2x; 3:2.5x;
+ unsigned int bIsLossy; //lossy compression or not
+ unsigned int bRcEn; //compact storage or not
+
+ int nBufSize;
+ int nAfbcSize;
+ int nLbcSize;
+ int nDebugCount;
+ VIDEO_FRM_STATUS_INFO nCurFrameInfo;
+}VideoPicture;
+
+typedef void* VideoDecoder;
+
+typedef VideoDecoder* (*PFN_CreateVideoDecoder)();
+typedef void (*PFN_DestroyVideoDecoder)(VideoDecoder* pDecoder);
+typedef int (*PFN_InitializeVideoDecoder)(VideoDecoder* pDecoder, VideoStreamInfo* pVideoInfo, VConfig* pVconfig);
+typedef int (*PFN_RequestVideoStreamBuffer)(VideoDecoder* pDecoder, int nRequireSize, char** ppBuf, int* pBufSize, char** ppRingBuf, int* pRingBufSize, int nStreamBufIndex);
+typedef int (*PFN_SubmitVideoStreamData)(VideoDecoder* pDecoder, VideoStreamDataInfo* pDataInfo, int nStreamBufIndex);
+typedef int (*PFN_DecodeVideoStream)(VideoDecoder* pDecoder, int bEndOfStream, int bDecodeKeyFrameOnly, int bDropBFrameIfDelay, int64_t nCurrentTimeUs);
+typedef VideoPicture* (*PFN_RequestPicture)(VideoDecoder* pDecoder, int nStreamIndex);
+typedef int (*PFN_ReturnPicture)(VideoDecoder* pDecoder, VideoPicture* pPicture);
+
+}
+
+
+static void* libvdecoder = 0;
+
+static PFN_CreateVideoDecoder CreateVideoDecoder = 0;
+static PFN_DestroyVideoDecoder DestroyVideoDecoder = 0;
+static PFN_InitializeVideoDecoder InitializeVideoDecoder = 0;
+static PFN_RequestVideoStreamBuffer RequestVideoStreamBuffer = 0;
+static PFN_SubmitVideoStreamData SubmitVideoStreamData = 0;
+static PFN_DecodeVideoStream DecodeVideoStream = 0;
+static PFN_RequestPicture RequestPicture = 0;
+static PFN_ReturnPicture ReturnPicture = 0;
+
+static int load_vdecoder_library()
+{
+ if (libvdecoder)
+ return 0;
+
+ // check device whitelist
+ bool whitelisted = is_device_whitelisted();
+ if (!whitelisted)
+ {
+ fprintf(stderr, "this device is not whitelisted for jpeg decoder aw cedarc\n");
+ return -1;
+ }
+
+ libvdecoder = dlopen("libvdecoder.so", RTLD_LOCAL | RTLD_NOW);
+ if (!libvdecoder)
+ {
+ libvdecoder = dlopen("/usr/lib/libvdecoder.so", RTLD_LOCAL | RTLD_NOW);
+ }
+ if (!libvdecoder)
+ {
+ return -1;
+ }
+
+ CreateVideoDecoder = (PFN_CreateVideoDecoder)dlsym(libvdecoder, "CreateVideoDecoder");
+ DestroyVideoDecoder = (PFN_DestroyVideoDecoder)dlsym(libvdecoder, "DestroyVideoDecoder");
+ InitializeVideoDecoder = (PFN_InitializeVideoDecoder)dlsym(libvdecoder, "InitializeVideoDecoder");
+ RequestVideoStreamBuffer = (PFN_RequestVideoStreamBuffer)dlsym(libvdecoder, "RequestVideoStreamBuffer");
+ SubmitVideoStreamData = (PFN_SubmitVideoStreamData)dlsym(libvdecoder, "SubmitVideoStreamData");
+ DecodeVideoStream = (PFN_DecodeVideoStream)dlsym(libvdecoder, "DecodeVideoStream");
+ RequestPicture = (PFN_RequestPicture)dlsym(libvdecoder, "RequestPicture");
+ ReturnPicture = (PFN_ReturnPicture)dlsym(libvdecoder, "ReturnPicture");
+
+ return 0;
+}
+
+static int unload_vdecoder_library()
+{
+ if (!libvdecoder)
+ return 0;
+
+ dlclose(libvdecoder);
+ libvdecoder = 0;
+
+ CreateVideoDecoder = 0;
+ DestroyVideoDecoder = 0;
+ InitializeVideoDecoder = 0;
+ RequestVideoStreamBuffer = 0;
+ SubmitVideoStreamData = 0;
+ DecodeVideoStream = 0;
+ RequestPicture = 0;
+ ReturnPicture = 0;
+
+ return 0;
+}
+
+class vdecoder_library_loader
+{
+public:
+ bool ready;
+
+ vdecoder_library_loader()
+ {
+ ready = (load_vdecoder_library() == 0);
+ }
+
+ ~vdecoder_library_loader()
+ {
+ unload_vdecoder_library();
+ }
+};
+
+static vdecoder_library_loader vdecoder;
+
+
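+// convert NV21 (luma plane + interleaved VU plane) to packed BGR
+// with Q6 fixed-point coefficients, processing two rows per pass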
+static void yuv420sp2bgr_neon(const unsigned char* yptr, const unsigned char* vuptr, int w, int h, int stride, unsigned char* bgr)
+{
+#if __ARM_NEON
+ uint8x8_t _v128 = vdup_n_u8(128);
+ int8x8_t _v90 = vdup_n_s8(90);
+ int8x8_t _v46 = vdup_n_s8(46);
+ int8x8_t _v22 = vdup_n_s8(22);
+ int8x8_t _v113 = vdup_n_s8(113);
+#endif // __ARM_NEON
+
+ for (int y = 0; y < h; y += 2)
+ {
+ const unsigned char* yptr0 = yptr;
+ const unsigned char* yptr1 = yptr + stride;
+ unsigned char* bgr0 = bgr;
+ unsigned char* bgr1 = bgr + w * 3;
+
+#if __ARM_NEON
+ int nn = w >> 3;
+ int remain = w - (nn << 3);
+#else
+ int remain = w;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+ for (; nn > 0; nn--)
+ {
+ int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
+ int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));
+
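+ // NV21 interleaves V before U; the self-transpose duplicates each chroma sample across two luma columns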
+ int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128));
+ int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
+ int8x8_t _vv = _vvvvuuuu.val[0];
+ int8x8_t _uu = _vvvvuuuu.val[1];
+
+ int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
+ int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
+ _g0 = vmlsl_s8(_g0, _uu, _v22);
+ int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);
+
+ int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
+ int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
+ _g1 = vmlsl_s8(_g1, _uu, _v22);
+ int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);
+
+ uint8x8x3_t _bgr0;
+ _bgr0.val[0] = vqshrun_n_s16(_b0, 6);
+ _bgr0.val[1] = vqshrun_n_s16(_g0, 6);
+ _bgr0.val[2] = vqshrun_n_s16(_r0, 6);
+
+ uint8x8x3_t _bgr1;
+ _bgr1.val[0] = vqshrun_n_s16(_b1, 6);
+ _bgr1.val[1] = vqshrun_n_s16(_g1, 6);
+ _bgr1.val[2] = vqshrun_n_s16(_r1, 6);
+
+ vst3_u8(bgr0, _bgr0);
+ vst3_u8(bgr1, _bgr1);
+
+ yptr0 += 8;
+ yptr1 += 8;
+ vuptr += 8;
+ bgr0 += 24;
+ bgr1 += 24;
+ }
+#endif // __ARM_NEON
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
+ for (; remain > 0; remain -= 2)
+ {
+ // R = 1.164 * yy + 1.596 * vv
+ // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
+ // B = 1.164 * yy + 2.018 * uu
+
+ // R = Y + (1.370705 * (V-128))
+ // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
+ // B = Y + (1.732446 * (U-128))
+
+ // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
+ // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
+ // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
+
+ // R = ((Y << 6) + 90 * (V-128)) >> 6
+ // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
+ // B = ((Y << 6) + 113 * (U-128)) >> 6
+
+ // R = (yy + 90 * vv) >> 6
+ // G = (yy - 46 * vv - 22 * uu) >> 6
+ // B = (yy + 113 * uu) >> 6
+
+ int v = vuptr[0] - 128;
+ int u = vuptr[1] - 128;
+
+ int ruv = 90 * v;
+ int guv = -46 * v + -22 * u;
+ int buv = 113 * u;
+
+ int y00 = yptr0[0] << 6;
+ bgr0[0] = SATURATE_CAST_UCHAR((y00 + buv) >> 6);
+ bgr0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6);
+ bgr0[2] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6);
+
+ int y01 = yptr0[1] << 6;
+ bgr0[3] = SATURATE_CAST_UCHAR((y01 + buv) >> 6);
+ bgr0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6);
+ bgr0[5] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6);
+
+ int y10 = yptr1[0] << 6;
+ bgr1[0] = SATURATE_CAST_UCHAR((y10 + buv) >> 6);
+ bgr1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6);
+ bgr1[2] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6);
+
+ int y11 = yptr1[1] << 6;
+ bgr1[3] = SATURATE_CAST_UCHAR((y11 + buv) >> 6);
+ bgr1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6);
+ bgr1[5] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6);
+
+ yptr0 += 2;
+ yptr1 += 2;
+ vuptr += 2;
+ bgr0 += 6;
+ bgr1 += 6;
+ }
+#undef SATURATE_CAST_UCHAR
+
+ yptr += 2 * stride;
+ vuptr += stride - w;
+ bgr += 2 * 3 * w;
+ }
+}
+
+class jpeg_decoder_aw_impl
+{
+public:
+ jpeg_decoder_aw_impl();
+ ~jpeg_decoder_aw_impl();
+
+ int init(const unsigned char* jpgdata, int size, int* width, int* height, int* ch);
+
+ int decode(const unsigned char* jpgdata, int size, unsigned char* outbgr) const;
+
+ int deinit();
+
+protected:
+ int corrupted; // 0=fine
+ int width;
+ int height;
+ int ch;
+ int components; // 1=gray 3=yuv
+ int sampling_factor; // 0=444 1=422h 2=422v 3=420 4=400
+ int progressive;
+ int orientation; // exif
+};
+
+jpeg_decoder_aw_impl::jpeg_decoder_aw_impl()
+{
+ corrupted = 1;
+ width = 0;
+ height = 0;
+ ch = 0;
+ components = 0;
+ sampling_factor = -1;
+ progressive = 0;
+ orientation = 0;
+}
+
+jpeg_decoder_aw_impl::~jpeg_decoder_aw_impl()
+{
+ deinit();
+}
+
+int jpeg_decoder_aw_impl::init(const unsigned char* jpgdata, int jpgsize, int* _width, int* _height, int* _ch)
+{
+ if (!jpgdata || jpgsize < 4)
+ return -1;
+
+ // jpg magic
+ if (jpgdata[0] != 0xFF || jpgdata[1] != 0xD8)
+ return -1;
+
+ // parse jpg for width height components sampling-factor progressive
+ const unsigned char* pbuf = jpgdata;
+ const unsigned char* pend = pbuf + jpgsize;
+ while (pbuf + 1 < pend)
+ {
+ unsigned char marker0 = pbuf[0];
+ unsigned char marker1 = pbuf[1];
+ pbuf += 2;
+
+ if (marker0 != 0xFF)
+ break;
+
+ // SOI EOI
+ if (marker1 == 0xD8 || marker1 == 0xD9)
+ continue;
+
+ if (marker1 != 0xC0 && marker1 != 0xC2)
+ {
+ unsigned int skipsize = (pbuf[0] << 8) + pbuf[1];
+ pbuf += skipsize;
+ continue;
+ }
+
+ // SOF0 SOF2
+ unsigned int skipsize = (pbuf[0] << 8) + pbuf[1];
+ if (pbuf + skipsize > pend)
+ break;
+
+ // only 8bit supported
+ if (pbuf[2] != 8)
+ break;
+
+ height = (pbuf[3] << 8) + pbuf[4];
+ width = (pbuf[5] << 8) + pbuf[6];
+ if (height == 0 || width == 0)
+ break;
+
+ components = pbuf[7];
+ if (components != 1 && components != 3)
+ break;
+
+ pbuf += 8;
+
+ unsigned char phv[3][2];
+ for (int c = 0; c < components; c++)
+ {
+ unsigned char q = pbuf[1];
+ phv[c][0] = (q >> 4); // 2 1 1 2 1 1 1 1 1 1 1 1
+ phv[c][1] = (q & 15); // 2 1 1 1 1 1 2 1 1 1 1 1
+ pbuf += 3;
+ }
+
+ if (components == 3 && phv[1][0] == 1 && phv[1][1] == 1 && phv[2][0] == 1 && phv[2][1] == 1)
+ {
+ if (phv[0][0] == 1 && phv[0][1] == 1) sampling_factor = 0;
+ if (phv[0][0] == 2 && phv[0][1] == 1) sampling_factor = 1;
+ if (phv[0][0] == 1 && phv[0][1] == 2) sampling_factor = 2;
+ if (phv[0][0] == 2 && phv[0][1] == 2) sampling_factor = 3;
+ }
+ if (components == 1 && phv[0][0] == 1 && phv[0][1] == 1)
+ {
+ sampling_factor = 4;
+ }
+
+ // unsupported sampling factor
+ if (sampling_factor == -1)
+ break;
+
+ // jpg is fine
+ corrupted = 0;
+
+ if (marker1 == 0xC2)
+ progressive = 1;
+
+ break;
+ }
+
+ // resolve exif orientation
+ {
+ std::string s((const char*)jpgdata, jpgsize);
+ std::istringstream iss(s);
+
+ cv::ExifReader exif_reader(iss);
+ if (exif_reader.parse())
+ {
+ cv::ExifEntry_t e = exif_reader.getTag(cv::ORIENTATION);
+ orientation = e.field_u16;
+ if (orientation < 1 || orientation > 8)
+ orientation = 1;
+ }
+ }
+
+ if (corrupted)
+ return -1;
+
+ // progressive not supported
+ if (progressive)
+ return -1;
+
+ // grayscale not supported
+ if (sampling_factor == 4)
+ return -1;
+
+ if (width % 2 != 0 || height % 2 != 0)
+ return -1;
+
+ if (width < 8 && height < 8)
+ return -1;
+
+ ch = *_ch;
+ if (ch == 0)
+ ch = components;
+
+ if (orientation > 4)
+ {
+ // swap width height
+ int tmp = height;
+ height = width;
+ width = tmp;
+ }
+
+ *_width = width;
+ *_height = height;
+ *_ch = ch;
+
+ return 0;
+}
+
+int jpeg_decoder_aw_impl::decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const
+{
+ if (!outbgr)
+ return -1;
+
+ // corrupted file
+ if (corrupted)
+ return -1;
+
+ // progressive not supported
+ if (progressive)
+ return -1;
+
+ // grayscale not supported
+ if (sampling_factor == 4)
+ return -1;
+
+ if (width % 2 != 0 || height % 2 != 0)
+ return -1;
+
+ if (width < 8 && height < 8)
+ return -1;
+
+ const int src_width = orientation > 4 ? height : width;
+ const int src_height = orientation > 4 ? width : height;
+
+ // flag
+ int ret_val = 0;
+
+ VideoDecoder* vdec = 0;
+ VideoPicture* vpic = 0;
+
+ char* pBuf = 0;
+ int bufSize = 0;
+ char* pRingBuf = 0;
+ int ringBufSize = 0;
+
+ {
+ vdec = CreateVideoDecoder();
+ if (!vdec)
+ {
+ fprintf(stderr, "CreateVideoDecoder failed\n");
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ VideoStreamInfo videoInfo;
+ memset(&videoInfo, 0, sizeof(videoInfo));
+ videoInfo.eCodecFormat = VIDEO_CODEC_FORMAT_MJPEG;
+ videoInfo.nWidth = src_width;
+ videoInfo.nHeight = src_height;
+
+ VConfig vconfig;
+ memset(&vconfig, 0, sizeof(vconfig));
+ vconfig.eOutputPixelFormat = PIXEL_FORMAT_NV21;
+ vconfig.eSecOutputPixelFormat = PIXEL_FORMAT_NV21;
+ vconfig.bSupportPallocBufBeforeDecode = 1;
+ vconfig.nDeInterlaceHoldingFrameBufferNum = 1;
+ vconfig.nDisplayHoldingFrameBufferNum = 1;
+ vconfig.nRotateHoldingFrameBufferNum = 0;
+ vconfig.nDecodeSmoothFrameBufferNum = 1;
+
+ VConfig_v85x vconfig_v85x;
+ memset(&vconfig_v85x, 0, sizeof(vconfig_v85x));
+ vconfig_v85x.eOutputPixelFormat = PIXEL_FORMAT_NV21;
+ vconfig_v85x.eSecOutputPixelFormat = PIXEL_FORMAT_NV21;
+ vconfig_v85x.bSupportPallocBufBeforeDecode = 1;
+ vconfig_v85x.nDeInterlaceHoldingFrameBufferNum = 1;
+ vconfig_v85x.nDisplayHoldingFrameBufferNum = 1;
+ vconfig_v85x.nRotateHoldingFrameBufferNum = 0;
+ vconfig_v85x.nDecodeSmoothFrameBufferNum = 1;
+
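+ // the tinyvision (v85x) cedarc build expects a different VConfig layout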
+ VConfig* p_vconfig = get_device_model() == 2 ? (VConfig*)&vconfig_v85x : &vconfig;
+
+ int ret = InitializeVideoDecoder(vdec, &videoInfo, p_vconfig);
+ if (ret != 0)
+ {
+ fprintf(stderr, "InitializeVideoDecoder failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = RequestVideoStreamBuffer(vdec, jpgsize, &pBuf, &bufSize, &pRingBuf, &ringBufSize, 0);
+ if (ret != 0)
+ {
+ fprintf(stderr, "RequestVideoStreamBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ if (bufSize + ringBufSize < jpgsize)
+ {
+ fprintf(stderr, "RequestVideoStreamBuffer too small %d + %d < %d\n", bufSize, ringBufSize, jpgsize);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ // copy to vdec sbm
+ if (bufSize >= jpgsize)
+ {
+ memcpy(pBuf, jpgdata, jpgsize);
+ }
+ else
+ {
+ memcpy(pBuf, jpgdata, bufSize);
+ memcpy(pRingBuf, jpgdata + bufSize, jpgsize - bufSize);
+ }
+ }
+
+ {
+ VideoStreamDataInfo dataInfo;
+ memset(&dataInfo, 0, sizeof(dataInfo));
+ dataInfo.pData = pBuf;
+ dataInfo.nLength = jpgsize;
+ dataInfo.bIsFirstPart = 1;
+ dataInfo.bIsLastPart = 1;
+
+ int ret = SubmitVideoStreamData(vdec, &dataInfo, 0);
+ if (ret != 0)
+ {
+ fprintf(stderr, "SubmitVideoStreamData failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int endofstream = 1;
+ int ret = DecodeVideoStream(vdec, endofstream, 0, 0, 0);
+ if (ret != VDECODE_RESULT_KEYFRAME_DECODED)
+ {
+ fprintf(stderr, "DecodeVideoStream failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ vpic = RequestPicture(vdec, 0);
+ if (!vpic)
+ {
+ fprintf(stderr, "RequestPicture failed\n");
+ ret_val = -1;
+ goto OUT;
+ }
+
+ // fprintf(stderr, "nID = %d\n", vpic->nID);
+ // fprintf(stderr, "nStreamIndex = %d\n", vpic->nStreamIndex);
+ // fprintf(stderr, "ePixelFormat = %d\n", vpic->ePixelFormat);
+ // fprintf(stderr, "nWidth = %d\n", vpic->nWidth);
+ // fprintf(stderr, "nHeight = %d\n", vpic->nHeight);
+ // fprintf(stderr, "nLineStride = %d\n", vpic->nLineStride);
+ // fprintf(stderr, "nTopOffset = %d\n", vpic->nTopOffset);
+ // fprintf(stderr, "nLeftOffset = %d\n", vpic->nLeftOffset);
+ // fprintf(stderr, "nBottomOffset = %d\n", vpic->nBottomOffset);
+ // fprintf(stderr, "nRightOffset = %d\n", vpic->nRightOffset);
+ // fprintf(stderr, "nFrameRate = %d\n", vpic->nFrameRate);
+ // fprintf(stderr, "nAspectRatio = %d\n", vpic->nAspectRatio);
+ // fprintf(stderr, "bIsProgressive = %d\n", vpic->bIsProgressive);
+ // fprintf(stderr, "bTopFieldFirst = %d\n", vpic->bTopFieldFirst);
+ // fprintf(stderr, "bRepeatTopField = %d\n", vpic->bRepeatTopField);
+ // fprintf(stderr, "nPts = %d\n", vpic->nPts);
+ // fprintf(stderr, "nPcr = %d\n", vpic->nPcr);
+
+ if (vpic->ePixelFormat != PIXEL_FORMAT_NV21)
+ {
+ fprintf(stderr, "unsupported ePixelFormat %d\n", vpic->ePixelFormat);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ {
+ const unsigned char* yptr = (const unsigned char*)vpic->pData0;
+ const unsigned char* vuptr = (const unsigned char*)vpic->pData1;
+
+ if (orientation == 0 || orientation == 1)
+ {
+ // no rotate
+ yuv420sp2bgr_neon(yptr, vuptr, width, height, vpic->nLineStride, outbgr);
+ }
+ else
+ {
+ // rotate
+ std::vector<unsigned char> yuv_rotated;
+ yuv_rotated.resize(width * height / 2 * 3);
+
+ unsigned char* dstY = (unsigned char*)yuv_rotated.data();
+ unsigned char* dstUV = (unsigned char*)yuv_rotated.data() + width * height;
+
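+ // rotate luma as 1-channel and the interleaved VU plane as 2-channel at half resolution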
+ kanna_rotate_c1(yptr, src_width, src_height, vpic->nLineStride, dstY, width, height, width, orientation);
+ kanna_rotate_c2(vuptr, src_width / 2, src_height / 2, vpic->nLineStride, dstUV, width / 2, height / 2, width, orientation);
+
+ yuv420sp2bgr_neon(dstY, dstUV, width, height, width, outbgr);
+ }
+ }
+
+ }
+
+OUT:
+
+ if (vpic)
+ {
+ ReturnPicture(vdec, vpic);
+ }
+
+ if (vdec)
+ {
+ DestroyVideoDecoder(vdec);
+ }
+
+ return ret_val;
+}
+
+int jpeg_decoder_aw_impl::deinit()
+{
+ corrupted = 1;
+ width = 0;
+ height = 0;
+ ch = 0;
+ components = 0;
+ sampling_factor = -1;
+ progressive = 0;
+ orientation = 0;
+
+ return 0;
+}
+
+bool jpeg_decoder_aw::supported(const unsigned char* jpgdata, int jpgsize)
+{
+ if (!jpgdata || jpgsize < 4)
+ return false;
+
+ // jpg magic
+ if (jpgdata[0] != 0xFF || jpgdata[1] != 0xD8)
+ return false;
+
+ if (!videoengine.ready)
+ return false;
+
+ if (!vdecoder.ready)
+ return false;
+
+ return true;
+}
+
+jpeg_decoder_aw::jpeg_decoder_aw() : d(new jpeg_decoder_aw_impl)
+{
+}
+
+jpeg_decoder_aw::~jpeg_decoder_aw()
+{
+ delete d;
+}
+
+int jpeg_decoder_aw::init(const unsigned char* jpgdata, int jpgsize, int* width, int* height, int* ch)
+{
+ return d->init(jpgdata, jpgsize, width, height, ch);
+}
+
+int jpeg_decoder_aw::decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const
+{
+ return d->decode(jpgdata, jpgsize, outbgr);
+}
+
+int jpeg_decoder_aw::deinit()
+{
+ return d->deinit();
+}
+
+#else // defined __linux__
+
+bool jpeg_decoder_aw::supported(const unsigned char* /*jpgdata*/, int /*jpgsize*/)
+{
+ return false;
+}
+
+jpeg_decoder_aw::jpeg_decoder_aw() : d(0)
+{
+}
+
+jpeg_decoder_aw::~jpeg_decoder_aw()
+{
+}
+
+int jpeg_decoder_aw::init(const unsigned char* /*jpgdata*/, int /*jpgsize*/, int* /*width*/, int* /*height*/, int* /*ch*/)
+{
+ return -1;
+}
+
+int jpeg_decoder_aw::decode(const unsigned char* /*jpgdata*/, int /*jpgsize*/, unsigned char* /*outbgr*/) const
+{
+ return -1;
+}
+
+int jpeg_decoder_aw::deinit()
+{
+ return -1;
+}
+
+#endif // defined __linux__
diff --git a/highgui/src/jpeg_decoder_aw.h b/highgui/src/jpeg_decoder_aw.h
new file mode 100644
index 00000000..f3a6d2a0
--- /dev/null
+++ b/highgui/src/jpeg_decoder_aw.h
@@ -0,0 +1,39 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef JPEG_DECODER_AW_H
+#define JPEG_DECODER_AW_H
+
+class jpeg_decoder_aw_impl;
+class jpeg_decoder_aw
+{
+public:
+ static bool supported(const unsigned char* jpgdata, int jpgsize);
+
+ jpeg_decoder_aw();
+ ~jpeg_decoder_aw();
+
+ int init(const unsigned char* jpgdata, int jpgsize, int* width, int* height, int* ch);
+
+ int decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const;
+
+ int deinit();
+
+private:
+ jpeg_decoder_aw_impl* const d;
+};
+
+#endif // JPEG_DECODER_AW_H
diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
new file mode 100644
index 00000000..476e15b5
--- /dev/null
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -0,0 +1,1726 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "jpeg_encoder_aw.h"
+
+#if defined __linux__
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h>
+#include <stdint.h>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+// 0 = unknown
+// 1 = t113-i
+// 2 = tinyvision
+static int get_device_model()
+{
+ static int device_model = -1;
+
+ if (device_model >= 0)
+ return device_model;
+
+ device_model = 0;
+
+ FILE* fp = fopen("/proc/device-tree/model", "rb");
+ if (fp)
+ {
char buf[1024] = {0};
fgets(buf, 1024, fp);
+ fclose(fp);
+
+ if (strncmp(buf, "sun8iw20", 8) == 0)
+ {
+ // t113-i
+ device_model = 1;
+ }
+ if (strncmp(buf, "sun8iw21", 8) == 0)
+ {
+ // tinyvision
+ device_model = 2;
+ }
+ }
+
+ return device_model;
+}
+
+static bool is_device_whitelisted()
+{
+ const int device_model = get_device_model();
+
+ if (device_model == 1)
+ {
+ // t113-i
+ return true;
+ }
+ if (device_model == 2)
+ {
+ // tinyvision
+ return true;
+ }
+
+ return false;
+}
+
+
+
+
+extern "C" {
+
+typedef enum VENC_CODEC_TYPE {
+ VENC_CODEC_H264,
+ VENC_CODEC_JPEG,
+ VENC_CODEC_H264_VER2,
+ VENC_CODEC_H265,
+ VENC_CODEC_VP8,
+} VENC_CODEC_TYPE;
+
+typedef enum VENC_PIXEL_FMT {
+ VENC_PIXEL_YUV420SP,
+ VENC_PIXEL_YVU420SP,
+ VENC_PIXEL_YUV420P,
+ VENC_PIXEL_YVU420P,
+ VENC_PIXEL_YUV422SP,
+ VENC_PIXEL_YVU422SP,
+ VENC_PIXEL_YUV422P,
+ VENC_PIXEL_YVU422P,
+ VENC_PIXEL_YUYV422,
+ VENC_PIXEL_UYVY422,
+ VENC_PIXEL_YVYU422,
+ VENC_PIXEL_VYUY422,
+ VENC_PIXEL_ARGB,
+ VENC_PIXEL_RGBA,
+ VENC_PIXEL_ABGR,
+ VENC_PIXEL_BGRA,
+ VENC_PIXEL_TILE_32X32,
+ VENC_PIXEL_TILE_128X32,
+ VENC_PIXEL_AFBC_AW,
+ VENC_PIXEL_LBC_AW, //* for v5v200 and newer ic
+} VENC_PIXEL_FMT;
+
+typedef enum VENC_INDEXTYPE {
+ VENC_IndexParamBitrate = 0x0,
+ /**< reference type: int */
+ VENC_IndexParamFramerate,
+ /**< reference type: int */
+ VENC_IndexParamMaxKeyInterval,
+ /**< reference type: int */
+ VENC_IndexParamIfilter,
+ /**< reference type: int */
+ VENC_IndexParamRotation,
+ /**< reference type: int */
+ VENC_IndexParamSliceHeight,
+ /**< reference type: int */
+ VENC_IndexParamForceKeyFrame,
+ /**< reference type: int (write only)*/
+ VENC_IndexParamMotionDetectEnable,
+ /**< reference type: MotionParam(write only) */
+ VENC_IndexParamMotionDetectStatus,
+ /**< reference type: int(read only) */
+ VENC_IndexParamRgb2Yuv,
+ /**< reference type: VENC_COLOR_SPACE */
+ VENC_IndexParamYuv2Yuv,
+ /**< reference type: VENC_YUV2YUV */
+ VENC_IndexParamROIConfig,
+ /**< reference type: VencROIConfig */
+ VENC_IndexParamStride,
+ /**< reference type: int */
+ VENC_IndexParamColorFormat,
+ /**< reference type: VENC_PIXEL_FMT */
+ VENC_IndexParamSize,
+ /**< reference type: VencSize(read only) */
+ VENC_IndexParamSetVbvSize,
+ /**< reference type: setVbvSize(write only) */
+ VENC_IndexParamVbvInfo,
+ /**< reference type: getVbvInfo(read only) */
+ VENC_IndexParamSuperFrameConfig,
+ /**< reference type: VencSuperFrameConfig */
+ VENC_IndexParamSetPSkip,
+ /**< reference type: unsigned int */
+ VENC_IndexParamResetEnc,
+ /**< reference type: */
+ VENC_IndexParamSaveBSFile,
+ /**< reference type: VencSaveBSFile */
+ VENC_IndexParamHorizonFlip,
+ /**< reference type: unsigned int */
+
+ /* check capability */
+ VENC_IndexParamMAXSupportSize,
+ /**< reference type: VencSize(read only) */
+ VENC_IndexParamCheckColorFormat,
+ /**< reference type: VencCheckFormat(read only) */
+
+ /* H264 param */
+ VENC_IndexParamH264Param = 0x100,
+ /**< reference type: VencH264Param */
+ VENC_IndexParamH264SPSPPS,
+ /**< reference type: VencHeaderData (read only)*/
+ VENC_IndexParamH264QPRange,
+ /**< reference type: VencQPRange */
+ VENC_IndexParamH264ProfileLevel,
+ /**< reference type: VencProfileLevel */
+ VENC_IndexParamH264EntropyCodingCABAC,
+ /**< reference type: int(0:CAVLC 1:CABAC) */
+ VENC_IndexParamH264CyclicIntraRefresh,
+ /**< reference type: VencCyclicIntraRefresh */
+ VENC_IndexParamH264FixQP,
+ /**< reference type: VencFixQP */
+ VENC_IndexParamH264SVCSkip,
+ /**< reference type: VencH264SVCSkip */
+ VENC_IndexParamH264AspectRatio,
+ /**< reference type: VencH264AspectRatio */
+ VENC_IndexParamFastEnc,
+ /**< reference type: int */
+ VENC_IndexParamH264VideoSignal,
+ /**< reference type: VencH264VideoSignal */
+ VENC_IndexParamH264VideoTiming,
+ /**< reference type: VencH264VideoTiming */
+ VENC_IndexParamChmoraGray,
+ /**< reference type: unsigned char */
+ VENC_IndexParamIQpOffset,
+ /**< reference type: constant QP */
+ VENC_IndexParamH264ConstantQP,
+ /**< reference type: int */
+ /* jpeg param */
+ VENC_IndexParamJpegQuality = 0x200,
+ /**< reference type: int (1~100) */
+ VENC_IndexParamJpegExifInfo,
+ /**< reference type: EXIFInfo */
+ VENC_IndexParamJpegEncMode,
+ /**< reference type: 0:jpeg; 1:motion_jpeg */
+ VENC_IndexParamJpegVideoSignal,
+ /**< reference type: VencJpegVideoSignal */
+
+ /* VP8 param */
+ VENC_IndexParamVP8Param,
+ /* max one frame length */
+ VENC_IndexParamSetFrameLenThreshold,
+ /**< reference type: int */
+ /* decrease the a20 dram bands */
+ VENC_IndexParamSetA20LowBands,
+ /**< reference type: 0:disable; 1:enable */
+ VENC_IndexParamSetBitRateRange,
+ /**< reference type: VencBitRateRange */
+ VENC_IndexParamLongTermReference,
+ /**< reference type: 0:disable; 1:enable, default:enable */
+
+ /* h265 param */
+ VENC_IndexParamH265Param = 0x300,
+ VENC_IndexParamH265Gop,
+ VENC_IndexParamH265ToalFramesNum,
+ VENC_IndexParamH26xUpdateLTRef,
+ VENC_IndexParamH265Header,
+ VENC_IndexParamH265TendRatioCoef,
+ VENC_IndexParamH265Trans,
+ /**< reference type: VencH265TranS */
+ VENC_IndexParamH265Sao,
+ /**< reference type: VencH265SaoS */
+ VENC_IndexParamH265Dblk,
+ /**< reference type: VencH265DblkS */
+ VENC_IndexParamH265Timing,
+ /**< reference type: VencH265TimingS */
+ VENC_IndexParamIntraPeriod,
+ VENC_IndexParamMBModeCtrl,
+ VENC_IndexParamMBSumInfoOutput,
+ VENC_IndexParamMBInfoOutput,
+ VENC_IndexParamVUIAspectRatio,
+ VENC_IndexParamVUIVideoSignal,
+ VENC_IndexParamVUIChromaLoc,
+ VENC_IndexParamVUIDisplayWindow,
+ VENC_IndexParamVUIBitstreamRestriction,
+
+ VENC_IndexParamAlterFrame = 0x400,
+ /**< reference type: unsigned int */
+ VENC_IndexParamVirtualIFrame,
+ VENC_IndexParamChannelNum,
+ VENC_IndexParamProcSet,
+ /**< reference type: VencOverlayInfoS */
+ VENC_IndexParamSetOverlay,
+ /**< reference type: unsigned char */
+ VENC_IndexParamAllParams,
+ /**< reference type:VencBrightnessS */
+ VENC_IndexParamBright,
+ /**< reference type:VencSmartFun */
+ VENC_IndexParamSmartFuntion,
+ /**< reference type: VencHVS */
+ VENC_IndexParamHVS,
+ /**< reference type: unsigned char */
+ VENC_IndexParamSkipTend,
+ /**< reference type: unsigned char */
+ VENC_IndexParamHighPassFilter,
+ /**< reference type: unsigned char */
+ VENC_IndexParamPFrameIntraEn,
+ /**< reference type: unsigned char */
+ VENC_IndexParamEncodeTimeEn,
+ /**< reference type: VencEncodeTimeS */
+ VENC_IndexParamGetEncodeTime,
+ /**< reference type: unsigned char */
+ VENC_IndexParam3DFilter,
+ /**< reference type: unsigned char */
+ VENC_IndexParamIntra4x4En,
+
+ /**< reference type: unsigned int */
+ VENC_IndexParamSetNullFrame = 0x500,
+ /**< reference type: VencThumbInfo */
+ VENC_IndexParamGetThumbYUV,
+ /**< reference type: E_ISP_SCALER_RATIO */
+ VENC_IndexParamSetThumbScaler,
+ /**< reference type: unsigned char */
+ VENC_IndexParamAdaptiveIntraInP,
+ /**< reference type: VencBaseConfig */
+ VENC_IndexParamUpdateBaseInfo,
+
+ /**< reference type: unsigned char */
+ VENC_IndexParamFillingCbr,
+
+ /**< reference type: unsigned char */
+ VENC_IndexParamRoi,
+
+ /**< reference type: unsigned int */
+ /* drop the frame that bitstreamLen exceed vbv-valid-size */
+ VENC_IndexParamDropOverflowFrame,
+
+ /**< reference type: unsigned int; 0: day, 1: night*/
+ VENC_IndexParamIsNightCaseFlag,
+
+ /**< reference type: unsigned int; 0: normal case, 1: ipc case*/
+ VENC_IndexParamProductCase,
+
+ /**< reference type: VencWatermarkInfoS */
+ VENC_IndexParamSetOverlayByWatermark,
+} VENC_INDEXTYPE;
+
+struct ScMemOpsS;
+struct VeOpsS;
+
+typedef enum eVeLbcMode
+{
+ LBC_MODE_DISABLE = 0,
+ LBC_MODE_1_5X = 1,
+ LBC_MODE_2_0X = 2,
+ LBC_MODE_2_5X = 3,
+ LBC_MODE_NO_LOSSY = 4,
+}eVeLbcMode;
+
+typedef struct VencBaseConfig {
+ unsigned char bEncH264Nalu;
+ unsigned int nInputWidth;
+ unsigned int nInputHeight;
+ unsigned int nDstWidth;
+ unsigned int nDstHeight;
+ unsigned int nStride;
+ VENC_PIXEL_FMT eInputFormat;
+ struct ScMemOpsS *memops;
+ VeOpsS* veOpsS;
+ void* pVeOpsSelf;
+
+ unsigned char bOnlyWbFlag;
+
+ //* for v5v200 and newer ic
+ unsigned char bLbcLossyComEnFlag2x;
+ unsigned char bLbcLossyComEnFlag2_5x;
+ unsigned char bIsVbvNoCache;
+ //* end
+} VencBaseConfig;
+
+typedef struct VencBaseConfig_v85x {
+ unsigned char bEncH264Nalu;
+ unsigned int nInputWidth;
+ unsigned int nInputHeight;
+ unsigned int nDstWidth;
+ unsigned int nDstHeight;
+ unsigned int nStride;
+ VENC_PIXEL_FMT eInputFormat;
+ struct ScMemOpsS *memops;
+ VeOpsS* veOpsS;
+ void* pVeOpsSelf;
+
+ unsigned char bOnlyWbFlag;
+
+ //* for v5v200 and newer ic
+ unsigned char bLbcLossyComEnFlag1_5x;
+ unsigned char bLbcLossyComEnFlag2x;
+ unsigned char bLbcLossyComEnFlag2_5x;
+ unsigned char bIsVbvNoCache;
+ //* end
+
+ unsigned int bOnlineMode; //* 1: online mode, 0: offline mode;
+ unsigned int bOnlineChannel; //* 1: online channel, 0: offline channel;
+ unsigned int nOnlineShareBufNum; //* share buffer num
+
+ //*for debug
+ unsigned int extend_flag; //* flag&0x1: printf reg before interrupt
+ //* flag&0x2: printf reg after interrupt
+ eVeLbcMode rec_lbc_mode; //*0: disable, 1:1.5x , 2: 2.0x, 3: 2.5x, 4: no_lossy
+ //*for debug(end)
+} VencBaseConfig_v85x;
+
+typedef struct VencAllocateBufferParam {
+ unsigned int nBufferNum;
+ unsigned int nSizeY;
+ unsigned int nSizeC;
+} VencAllocateBufferParam;
+
+typedef struct VencRect {
+ int nLeft;
+ int nTop;
+ int nWidth;
+ int nHeight;
+} VencRect;
+
+/* support 4 ROI region */
+typedef struct VencROIConfig {
+ int bEnable;
+ int index; /* (0~3) */
+ int nQPoffset;
+ unsigned char roi_abs_flag;
+ VencRect sRect;
+} VencROIConfig;
+
+typedef struct VencInputBuffer {
+ unsigned long nID;
+ long long nPts;
+ unsigned int nFlag;
+ unsigned char* pAddrPhyY;
+ unsigned char* pAddrPhyC;
+ unsigned char* pAddrVirY;
+ unsigned char* pAddrVirC;
+ int nWidth;
+ int nHeight;
+ int nAlign;
+ int bEnableCorp;
+ VencRect sCropInfo;
+
+ int ispPicVar;
+ int ispPicVarChroma; //chroma filter coef[0-63], from isp
+ int bUseInputBufferRoi;
+ VencROIConfig roi_param[8];
+ int bAllocMemSelf;
+ int nShareBufFd;
+ unsigned char bUseCsiColorFormat;
+ VENC_PIXEL_FMT eCsiColorFormat;
+
+ int envLV;
+} VencInputBuffer;
+
+typedef struct VencInputBuffer_v85x {
+ unsigned long nID;
+ long long nPts;
+ unsigned int nFlag;
+ unsigned char* pAddrPhyY;
+ unsigned char* pAddrPhyC;
+ unsigned char* pAddrVirY;
+ unsigned char* pAddrVirC;
+ int bEnableCorp;
+ VencRect sCropInfo;
+
+ int ispPicVar;
+ int ispPicVarChroma; //chroma filter coef[0-63], from isp
+ int bUseInputBufferRoi;
+ VencROIConfig roi_param[8];
+ int bAllocMemSelf;
+ int nShareBufFd;
+ unsigned char bUseCsiColorFormat;
+ VENC_PIXEL_FMT eCsiColorFormat;
+
+ int envLV;
+ int bNeedFlushCache;
+}VencInputBuffer_v85x;
+
+typedef struct FrameInfo {
+ int CurrQp;
+ int avQp;
+ int nGopIndex;
+ int nFrameIndex;
+ int nTotalIndex;
+} FrameInfo;
+
+typedef struct VencOutputBuffer {
+ int nID;
+ long long nPts;
+ unsigned int nFlag;
+ unsigned int nSize0;
+ unsigned int nSize1;
+ unsigned char* pData0;
+ unsigned char* pData1;
+
+ FrameInfo frame_info;
+ unsigned int nSize2;
+ unsigned char* pData2;
+} VencOutputBuffer;
+
+typedef void* VideoEncoder;
+
+typedef VideoEncoder* (*PFN_VideoEncCreate)(VENC_CODEC_TYPE eCodecType);
+typedef void (*PFN_VideoEncDestroy)(VideoEncoder* pEncoder);
+typedef int (*PFN_VideoEncInit)(VideoEncoder* pEncoder, VencBaseConfig* pConfig);
+typedef int (*PFN_VideoEncUnInit)(VideoEncoder* pEncoder);
+typedef int (*PFN_AllocInputBuffer)(VideoEncoder* pEncoder, VencAllocateBufferParam* pBufferParam);
+typedef int (*PFN_GetOneAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer);
+typedef int (*PFN_FlushCacheAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer);
+typedef int (*PFN_ReturnOneAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer);
+typedef int (*PFN_ReleaseAllocInputBuffer)(VideoEncoder* pEncoder);
+typedef int (*PFN_AddOneInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer);
+typedef int (*PFN_VideoEncodeOneFrame)(VideoEncoder* pEncoder);
+typedef int (*PFN_AlreadyUsedInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pBuffer);
+typedef int (*PFN_ValidBitstreamFrameNum)(VideoEncoder* pEncoder);
+typedef int (*PFN_GetOneBitstreamFrame)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_FreeOneBitStreamFrame)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_VideoEncGetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+typedef int (*PFN_VideoEncSetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+
+// v85x
+typedef VideoEncoder* (*PFN_VencCreate)(VENC_CODEC_TYPE eCodecType);
+typedef void (*PFN_VencDestroy)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencInit)(VideoEncoder* pEncoder, VencBaseConfig_v85x* pConfig);
+typedef int (*PFN_VencStart)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencPause)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencReset)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencAllocateInputBuf)(VideoEncoder* pEncoder, VencAllocateBufferParam *pBufferParam, VencInputBuffer_v85x* dst_inputBuf);
+typedef int (*PFN_VencGetValidInputBufNum)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencQueueInputBuf)(VideoEncoder* pEncoder, VencInputBuffer_v85x* inputbuffer);
+typedef int (*PFN_VencGetValidOutputBufNum)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencDequeueOutputBuf)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_VencQueueOutputBuf)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_VencGetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+typedef int (*PFN_VencSetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+
+typedef enum
+{
+ VencEvent_FrameFormatNotMatch = 0, // frame format does not match the initial setting.
+ VencEvent_UpdateMbModeInfo = 1,
+ VencEvent_UpdateMbStatInfo = 2,
+ VencEvent_UpdateSharpParam = 3,
+ VencEvent_UpdateIspMotionParam = 4,
+ VencEvent_UpdateVeToIspParam = 5,
+ VencEvent_Max = 0x7FFFFFFF
+} VencEventType;
+
+typedef struct
+{
+ int nResult;
+ VencInputBuffer *pInputBuffer;
+ //other information about this frame encoding can be added below.
+
+} VencCbInputBufferDoneInfo;
+
+typedef struct
+{
+ int (*EventHandler)(
+ VideoEncoder* pEncoder,
+ void* pAppData,
+ VencEventType eEvent,
+ unsigned int nData1,
+ unsigned int nData2,
+ void* pEventData);
+
+ int (*InputBufferDone)(
+ VideoEncoder* pEncoder,
+ void* pAppData,
+ VencCbInputBufferDoneInfo* pBufferDoneInfo);
+} VencCbType;
+
+typedef int (*PFN_VencSetCallbacks)(VideoEncoder* pEncoder, VencCbType* pCallbacks, void* pAppData);
+
+}
+
+static void* libvencoder = 0;
+
+static PFN_VideoEncCreate VideoEncCreate = 0;
+static PFN_VideoEncDestroy VideoEncDestroy = 0;
+static PFN_VideoEncInit VideoEncInit = 0;
+static PFN_VideoEncUnInit VideoEncUnInit = 0;
+static PFN_AllocInputBuffer AllocInputBuffer = 0;
+static PFN_GetOneAllocInputBuffer GetOneAllocInputBuffer = 0;
+static PFN_FlushCacheAllocInputBuffer FlushCacheAllocInputBuffer = 0;
+static PFN_ReturnOneAllocInputBuffer ReturnOneAllocInputBuffer = 0;
+static PFN_ReleaseAllocInputBuffer ReleaseAllocInputBuffer = 0;
+static PFN_AddOneInputBuffer AddOneInputBuffer = 0;
+static PFN_VideoEncodeOneFrame VideoEncodeOneFrame = 0;
+static PFN_AlreadyUsedInputBuffer AlreadyUsedInputBuffer = 0;
+static PFN_ValidBitstreamFrameNum ValidBitstreamFrameNum = 0;
+static PFN_GetOneBitstreamFrame GetOneBitstreamFrame = 0;
+static PFN_FreeOneBitStreamFrame FreeOneBitStreamFrame = 0;
+static PFN_VideoEncGetParameter VideoEncGetParameter = 0;
+static PFN_VideoEncSetParameter VideoEncSetParameter = 0;
+
+// v85x
+static PFN_VencCreate VencCreate = 0;
+static PFN_VencDestroy VencDestroy = 0;
+static PFN_VencInit VencInit = 0;
+static PFN_VencStart VencStart = 0;
+static PFN_VencPause VencPause = 0;
+static PFN_VencReset VencReset = 0;
+static PFN_VencAllocateInputBuf VencAllocateInputBuf = 0;
+static PFN_VencGetValidInputBufNum VencGetValidInputBufNum = 0;
+static PFN_VencQueueInputBuf VencQueueInputBuf = 0;
+static PFN_VencGetValidOutputBufNum VencGetValidOutputBufNum = 0;
+static PFN_VencDequeueOutputBuf VencDequeueOutputBuf = 0;
+static PFN_VencQueueOutputBuf VencQueueOutputBuf = 0;
+static PFN_VencGetParameter VencGetParameter = 0;
+static PFN_VencSetParameter VencSetParameter = 0;
+static PFN_VencSetCallbacks VencSetCallbacks = 0;
+
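+// the cedarc encoder symbols are resolved at runtime with dlopen/dlsym, so
+// this file builds without the Allwinner SDK headers or libraries.
+// get_device_model() == 2 selects the newer v85x-style Venc* API; other
+// whitelisted devices use the classic VideoEnc* API.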
+static int load_vencoder_library()
+{
+ if (libvencoder)
+ return 0;
+
+ // check device whitelist
+ bool whitelisted = is_device_whitelisted();
+ if (!whitelisted)
+ {
+ fprintf(stderr, "this device is not whitelisted for jpeg encoder aw cedarc\n");
+ return -1;
+ }
+
+ libvencoder = dlopen("libvencoder.so", RTLD_LOCAL | RTLD_NOW);
+ if (!libvencoder)
+ {
+ libvencoder = dlopen("/usr/lib/libvencoder.so", RTLD_LOCAL | RTLD_NOW);
+ }
+ if (!libvencoder)
+ {
+ return -1;
+ }
+
+ if (get_device_model() == 2)
+ {
+ VencCreate = (PFN_VencCreate)dlsym(libvencoder, "VencCreate");
+ VencDestroy = (PFN_VencDestroy)dlsym(libvencoder, "VencDestroy");
+ VencInit = (PFN_VencInit)dlsym(libvencoder, "VencInit");
+ VencStart = (PFN_VencStart)dlsym(libvencoder, "VencStart");
+ VencPause = (PFN_VencPause)dlsym(libvencoder, "VencPause");
+ VencReset = (PFN_VencReset)dlsym(libvencoder, "VencReset");
+ VencAllocateInputBuf = (PFN_VencAllocateInputBuf)dlsym(libvencoder, "VencAllocateInputBuf");
+ VencGetValidInputBufNum = (PFN_VencGetValidInputBufNum)dlsym(libvencoder, "VencGetValidInputBufNum");
+ VencQueueInputBuf = (PFN_VencQueueInputBuf)dlsym(libvencoder, "VencQueueInputBuf");
+ VencGetValidOutputBufNum = (PFN_VencGetValidOutputBufNum)dlsym(libvencoder, "VencGetValidOutputBufNum");
+ VencDequeueOutputBuf = (PFN_VencDequeueOutputBuf)dlsym(libvencoder, "VencDequeueOutputBuf");
+ VencQueueOutputBuf = (PFN_VencQueueOutputBuf)dlsym(libvencoder, "VencQueueOutputBuf");
+ VencGetParameter = (PFN_VencGetParameter)dlsym(libvencoder, "VencGetParameter");
+ VencSetParameter = (PFN_VencSetParameter)dlsym(libvencoder, "VencSetParameter");
+ VencSetCallbacks = (PFN_VencSetCallbacks)dlsym(libvencoder, "VencSetCallbacks");
+ }
+ else
+ {
+ VideoEncCreate = (PFN_VideoEncCreate)dlsym(libvencoder, "VideoEncCreate");
+ VideoEncDestroy = (PFN_VideoEncDestroy)dlsym(libvencoder, "VideoEncDestroy");
+ VideoEncInit = (PFN_VideoEncInit)dlsym(libvencoder, "VideoEncInit");
+ VideoEncUnInit = (PFN_VideoEncUnInit)dlsym(libvencoder, "VideoEncUnInit");
+ AllocInputBuffer = (PFN_AllocInputBuffer)dlsym(libvencoder, "AllocInputBuffer");
+ GetOneAllocInputBuffer = (PFN_GetOneAllocInputBuffer)dlsym(libvencoder, "GetOneAllocInputBuffer");
+ FlushCacheAllocInputBuffer = (PFN_FlushCacheAllocInputBuffer)dlsym(libvencoder, "FlushCacheAllocInputBuffer");
+ ReturnOneAllocInputBuffer = (PFN_ReturnOneAllocInputBuffer)dlsym(libvencoder, "ReturnOneAllocInputBuffer");
+ ReleaseAllocInputBuffer = (PFN_ReleaseAllocInputBuffer)dlsym(libvencoder, "ReleaseAllocInputBuffer");
+ AddOneInputBuffer = (PFN_AddOneInputBuffer)dlsym(libvencoder, "AddOneInputBuffer");
+ VideoEncodeOneFrame = (PFN_VideoEncodeOneFrame)dlsym(libvencoder, "VideoEncodeOneFrame");
+ AlreadyUsedInputBuffer = (PFN_AlreadyUsedInputBuffer)dlsym(libvencoder, "AlreadyUsedInputBuffer");
+ ValidBitstreamFrameNum = (PFN_ValidBitstreamFrameNum)dlsym(libvencoder, "ValidBitstreamFrameNum");
+ GetOneBitstreamFrame = (PFN_GetOneBitstreamFrame)dlsym(libvencoder, "GetOneBitstreamFrame");
+ FreeOneBitStreamFrame = (PFN_FreeOneBitStreamFrame)dlsym(libvencoder, "FreeOneBitStreamFrame");
+ VideoEncGetParameter = (PFN_VideoEncGetParameter)dlsym(libvencoder, "VideoEncGetParameter");
+ VideoEncSetParameter = (PFN_VideoEncSetParameter)dlsym(libvencoder, "VideoEncSetParameter");
+ }
+
+ return 0;
+}
+
+static int unload_vencoder_library()
+{
+ if (!libvencoder)
+ return 0;
+
+ dlclose(libvencoder);
+ libvencoder = 0;
+
+ VideoEncCreate = 0;
+ VideoEncDestroy = 0;
+ VideoEncInit = 0;
+ VideoEncUnInit = 0;
+ AllocInputBuffer = 0;
+ GetOneAllocInputBuffer = 0;
+ FlushCacheAllocInputBuffer = 0;
+ ReturnOneAllocInputBuffer = 0;
+ ReleaseAllocInputBuffer = 0;
+ AddOneInputBuffer = 0;
+ VideoEncodeOneFrame = 0;
+ AlreadyUsedInputBuffer = 0;
+ ValidBitstreamFrameNum = 0;
+ GetOneBitstreamFrame = 0;
+ FreeOneBitStreamFrame = 0;
+ VideoEncGetParameter = 0;
+ VideoEncSetParameter = 0;
+
+ VencCreate = 0;
+ VencDestroy = 0;
+ VencInit = 0;
+ VencStart = 0;
+ VencPause = 0;
+ VencReset = 0;
+ VencAllocateInputBuf = 0;
+ VencGetValidInputBufNum = 0;
+ VencQueueInputBuf = 0;
+ VencGetValidOutputBufNum = 0;
+ VencDequeueOutputBuf = 0;
+ VencQueueOutputBuf = 0;
+ VencGetParameter = 0;
+ VencSetParameter = 0;
+ VencSetCallbacks = 0;
+
+ return 0;
+}
+
+class vencoder_library_loader
+{
+public:
+ bool ready;
+
+ vencoder_library_loader()
+ {
+ ready = (load_vencoder_library() == 0);
+ }
+
+ ~vencoder_library_loader()
+ {
+ unload_vencoder_library();
+ }
+};
+
+static vencoder_library_loader vencoder;
+
+
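+// the converters below fill the encoder input buffer in YUV420SP (NV12)
+// layout: a full-resolution Y plane plus an interleaved UV plane at half
+// resolution, both using the aligned stride. gray input simply gets
+// neutral chroma (128).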
+static void gray2yuv420sp(const unsigned char* graydata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride)
+{
+ for (int y = 0; y + 1 < height; y += 2)
+ {
+ const unsigned char* p0 = graydata + y * width;
+ const unsigned char* p1 = graydata + (y + 1) * width;
+ unsigned char* yptr0 = yptr + y * stride;
+ unsigned char* yptr1 = yptr + (y + 1) * stride;
+ unsigned char* uvptr0 = uvptr + (y / 2) * stride;
+
+ memcpy(yptr0, p0, width);
+ memcpy(yptr1, p1, width);
+ memset(uvptr0, 128, width);
+ }
+}
+
+static void bgr2yuv420sp(const unsigned char* bgrdata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride)
+{
+#if __ARM_NEON
+ uint8x8_t _v38 = vdup_n_u8(38);
+ uint8x8_t _v75 = vdup_n_u8(75);
+ uint8x8_t _v15 = vdup_n_u8(15);
+
+ uint8x8_t _v127 = vdup_n_u8(127);
+ uint8x8_t _v84_107 = vzip_u8(vdup_n_u8(84), vdup_n_u8(107)).val[0];
+ uint8x8_t _v43_20 = vzip_u8(vdup_n_u8(43), vdup_n_u8(20)).val[0];
+ uint16x8_t _v128 = vdupq_n_u16((128 << 8) + 128);
+#endif // __ARM_NEON
+
+ for (int y = 0; y + 1 < height; y += 2)
+ {
+ const unsigned char* p0 = bgrdata + y * width * 3;
+ const unsigned char* p1 = bgrdata + (y + 1) * width * 3;
+ unsigned char* yptr0 = yptr + y * stride;
+ unsigned char* yptr1 = yptr + (y + 1) * stride;
+ unsigned char* uvptr0 = uvptr + (y / 2) * stride;
+
+ int x = 0;
+#if __ARM_NEON
+ for (; x + 7 < width; x += 8)
+ {
+ uint8x8x3_t _bgr0 = vld3_u8(p0);
+ uint8x8x3_t _bgr1 = vld3_u8(p1);
+
+ uint16x8_t _y0 = vmull_u8(_bgr0.val[0], _v15);
+ uint16x8_t _y1 = vmull_u8(_bgr1.val[0], _v15);
+ _y0 = vmlal_u8(_y0, _bgr0.val[1], _v75);
+ _y1 = vmlal_u8(_y1, _bgr1.val[1], _v75);
+ _y0 = vmlal_u8(_y0, _bgr0.val[2], _v38);
+ _y1 = vmlal_u8(_y1, _bgr1.val[2], _v38);
+ uint8x8_t _y0_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y0), 7);
+ uint8x8_t _y1_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y1), 7);
+
+ uint16x4_t _b4 = vpaddl_u8(_bgr0.val[0]);
+ uint16x4_t _g4 = vpaddl_u8(_bgr0.val[1]);
+ uint16x4_t _r4 = vpaddl_u8(_bgr0.val[2]);
+ _b4 = vpadal_u8(_b4, _bgr1.val[0]);
+ _g4 = vpadal_u8(_g4, _bgr1.val[1]);
+ _r4 = vpadal_u8(_r4, _bgr1.val[2]);
+ uint16x4x2_t _brbr = vzip_u16(_b4, _r4);
+ uint16x4x2_t _gggg = vzip_u16(_g4, _g4);
+ uint16x4x2_t _rbrb = vzip_u16(_r4, _b4);
+ uint8x8_t _br = vshrn_n_u16(vcombine_u16(_brbr.val[0], _brbr.val[1]), 2);
+ uint8x8_t _gg = vshrn_n_u16(vcombine_u16(_gggg.val[0], _gggg.val[1]), 2);
+ uint8x8_t _rb = vshrn_n_u16(vcombine_u16(_rbrb.val[0], _rbrb.val[1]), 2);
+
+ // uint8x8_t _br = vtrn_u8(_bgr0.val[0], _bgr0.val[2]).val[0];
+ // uint8x8_t _gg = vtrn_u8(_bgr0.val[1], _bgr0.val[1]).val[0];
+ // uint8x8_t _rb = vtrn_u8(_bgr0.val[2], _bgr0.val[0]).val[0];
+
+ uint16x8_t _uv = vmlal_u8(_v128, _br, _v127);
+ _uv = vmlsl_u8(_uv, _gg, _v84_107);
+ _uv = vmlsl_u8(_uv, _rb, _v43_20);
+ uint8x8_t _uv_u8 = vqshrn_n_u16(_uv, 8);
+
+ vst1_u8(yptr0, _y0_u8);
+ vst1_u8(yptr1, _y1_u8);
+ vst1_u8(uvptr0, _uv_u8);
+
+ p0 += 24;
+ p1 += 24;
+ yptr0 += 8;
+ yptr1 += 8;
+ uvptr0 += 8;
+ }
+#endif // __ARM_NEON
+ for (; x + 1 < width; x += 2)
+ {
+ unsigned char b00 = p0[0];
+ unsigned char g00 = p0[1];
+ unsigned char r00 = p0[2];
+
+ unsigned char b01 = p0[3];
+ unsigned char g01 = p0[4];
+ unsigned char r01 = p0[5];
+
+ unsigned char b10 = p1[0];
+ unsigned char g10 = p1[1];
+ unsigned char r10 = p1[2];
+
+ unsigned char b11 = p1[3];
+ unsigned char g11 = p1[4];
+ unsigned char r11 = p1[5];
+
+ // y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+ // u = -0.16874 * r - 0.33126 * g + 0.50000 * b + 128
+ // v = 0.50000 * r - 0.41869 * g - 0.08131 * b + 128
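+            // fixed-point approximation of the formulas above:
+            // y uses Q7 coefficients (38/128, 75/128, 15/128),
+            // u/v use Q8 coefficients (43/256, 84/256, 127/256, 107/256, 20/256)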
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
+ unsigned char y00 = SATURATE_CAST_UCHAR(( 38 * r00 + 75 * g00 + 15 * b00 + 64) >> 7);
+ unsigned char y01 = SATURATE_CAST_UCHAR(( 38 * r01 + 75 * g01 + 15 * b01 + 64) >> 7);
+ unsigned char y10 = SATURATE_CAST_UCHAR(( 38 * r10 + 75 * g10 + 15 * b10 + 64) >> 7);
+ unsigned char y11 = SATURATE_CAST_UCHAR(( 38 * r11 + 75 * g11 + 15 * b11 + 64) >> 7);
+
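+            // average the 2x2 block for 4:2:0 chroma subsampling,
+            // matching the NEON vpaddl/vpadal path above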
+ unsigned char b4 = (b00 + b01 + b10 + b11) / 4;
+ unsigned char g4 = (g00 + g01 + g10 + g11) / 4;
+ unsigned char r4 = (r00 + r01 + r10 + r11) / 4;
+
+ // unsigned char b4 = b00;
+ // unsigned char g4 = g00;
+ // unsigned char r4 = r00;
+
+ unsigned char u = SATURATE_CAST_UCHAR(((-43 * r4 - 84 * g4 + 127 * b4 + 128) >> 8) + 128);
+ unsigned char v = SATURATE_CAST_UCHAR(((127 * r4 - 107 * g4 - 20 * b4 + 128) >> 8) + 128);
+#undef SATURATE_CAST_UCHAR
+
+ yptr0[0] = y00;
+ yptr0[1] = y01;
+ yptr1[0] = y10;
+ yptr1[1] = y11;
+ uvptr0[0] = u;
+ uvptr0[1] = v;
+
+ p0 += 6;
+ p1 += 6;
+ yptr0 += 2;
+ yptr1 += 2;
+ uvptr0 += 2;
+ }
+ }
+}
+
+static void bgra2yuv420sp(const unsigned char* bgradata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride)
+{
+#if __ARM_NEON
+ uint8x8_t _v38 = vdup_n_u8(38);
+ uint8x8_t _v75 = vdup_n_u8(75);
+ uint8x8_t _v15 = vdup_n_u8(15);
+
+ uint8x8_t _v127 = vdup_n_u8(127);
+ uint8x8_t _v84_107 = vzip_u8(vdup_n_u8(84), vdup_n_u8(107)).val[0];
+ uint8x8_t _v43_20 = vzip_u8(vdup_n_u8(43), vdup_n_u8(20)).val[0];
+ uint16x8_t _v128 = vdupq_n_u16((128 << 8) + 128);
+#endif // __ARM_NEON
+
+ for (int y = 0; y + 1 < height; y += 2)
+ {
+ const unsigned char* p0 = bgradata + y * width * 4;
+ const unsigned char* p1 = bgradata + (y + 1) * width * 4;
+ unsigned char* yptr0 = yptr + y * stride;
+ unsigned char* yptr1 = yptr + (y + 1) * stride;
+ unsigned char* uvptr0 = uvptr + (y / 2) * stride;
+
+ int x = 0;
+#if __ARM_NEON
+ for (; x + 7 < width; x += 8)
+ {
+ uint8x8x4_t _bgr0 = vld4_u8(p0);
+ uint8x8x4_t _bgr1 = vld4_u8(p1);
+
+ uint16x8_t _y0 = vmull_u8(_bgr0.val[0], _v15);
+ uint16x8_t _y1 = vmull_u8(_bgr1.val[0], _v15);
+ _y0 = vmlal_u8(_y0, _bgr0.val[1], _v75);
+ _y1 = vmlal_u8(_y1, _bgr1.val[1], _v75);
+ _y0 = vmlal_u8(_y0, _bgr0.val[2], _v38);
+ _y1 = vmlal_u8(_y1, _bgr1.val[2], _v38);
+ uint8x8_t _y0_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y0), 7);
+ uint8x8_t _y1_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y1), 7);
+
+ uint16x4_t _b4 = vpaddl_u8(_bgr0.val[0]);
+ uint16x4_t _g4 = vpaddl_u8(_bgr0.val[1]);
+ uint16x4_t _r4 = vpaddl_u8(_bgr0.val[2]);
+ _b4 = vpadal_u8(_b4, _bgr1.val[0]);
+ _g4 = vpadal_u8(_g4, _bgr1.val[1]);
+ _r4 = vpadal_u8(_r4, _bgr1.val[2]);
+ uint16x4x2_t _brbr = vzip_u16(_b4, _r4);
+ uint16x4x2_t _gggg = vzip_u16(_g4, _g4);
+ uint16x4x2_t _rbrb = vzip_u16(_r4, _b4);
+ uint8x8_t _br = vshrn_n_u16(vcombine_u16(_brbr.val[0], _brbr.val[1]), 2);
+ uint8x8_t _gg = vshrn_n_u16(vcombine_u16(_gggg.val[0], _gggg.val[1]), 2);
+ uint8x8_t _rb = vshrn_n_u16(vcombine_u16(_rbrb.val[0], _rbrb.val[1]), 2);
+
+ // uint8x8_t _br = vtrn_u8(_bgr0.val[0], _bgr0.val[2]).val[0];
+ // uint8x8_t _gg = vtrn_u8(_bgr0.val[1], _bgr0.val[1]).val[0];
+ // uint8x8_t _rb = vtrn_u8(_bgr0.val[2], _bgr0.val[0]).val[0];
+
+ uint16x8_t _uv = vmlal_u8(_v128, _br, _v127);
+ _uv = vmlsl_u8(_uv, _gg, _v84_107);
+ _uv = vmlsl_u8(_uv, _rb, _v43_20);
+ uint8x8_t _uv_u8 = vqshrn_n_u16(_uv, 8);
+
+ vst1_u8(yptr0, _y0_u8);
+ vst1_u8(yptr1, _y1_u8);
+ vst1_u8(uvptr0, _uv_u8);
+
+ p0 += 32;
+ p1 += 32;
+ yptr0 += 8;
+ yptr1 += 8;
+ uvptr0 += 8;
+ }
+#endif // __ARM_NEON
+ for (; x + 1 < width; x += 2)
+ {
+ unsigned char b00 = p0[0];
+ unsigned char g00 = p0[1];
+ unsigned char r00 = p0[2];
+
+ unsigned char b01 = p0[4];
+ unsigned char g01 = p0[5];
+ unsigned char r01 = p0[6];
+
+ unsigned char b10 = p1[0];
+ unsigned char g10 = p1[1];
+ unsigned char r10 = p1[2];
+
+ unsigned char b11 = p1[4];
+ unsigned char g11 = p1[5];
+ unsigned char r11 = p1[6];
+
+ // y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+ // u = -0.16874 * r - 0.33126 * g + 0.50000 * b + 128
+ // v = 0.50000 * r - 0.41869 * g - 0.08131 * b + 128
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255)
+ unsigned char y00 = SATURATE_CAST_UCHAR(( 38 * r00 + 75 * g00 + 15 * b00 + 64) >> 7);
+ unsigned char y01 = SATURATE_CAST_UCHAR(( 38 * r01 + 75 * g01 + 15 * b01 + 64) >> 7);
+ unsigned char y10 = SATURATE_CAST_UCHAR(( 38 * r10 + 75 * g10 + 15 * b10 + 64) >> 7);
+ unsigned char y11 = SATURATE_CAST_UCHAR(( 38 * r11 + 75 * g11 + 15 * b11 + 64) >> 7);
+
+ unsigned char b4 = (b00 + b01 + b10 + b11) / 4;
+ unsigned char g4 = (g00 + g01 + g10 + g11) / 4;
+ unsigned char r4 = (r00 + r01 + r10 + r11) / 4;
+
+ // unsigned char b4 = b00;
+ // unsigned char g4 = g00;
+ // unsigned char r4 = r00;
+
+ unsigned char u = SATURATE_CAST_UCHAR(((-43 * r4 - 84 * g4 + 127 * b4 + 128) >> 8) + 128);
+ unsigned char v = SATURATE_CAST_UCHAR(((127 * r4 - 107 * g4 - 20 * b4 + 128) >> 8) + 128);
+#undef SATURATE_CAST_UCHAR
+
+ yptr0[0] = y00;
+ yptr0[1] = y01;
+ yptr1[0] = y10;
+ yptr1[1] = y11;
+ uvptr0[0] = u;
+ uvptr0[1] = v;
+
+ p0 += 8;
+ p1 += 8;
+ yptr0 += 2;
+ yptr1 += 2;
+ uvptr0 += 2;
+ }
+ }
+}
+
+class jpeg_encoder_aw_impl
+{
+public:
+ jpeg_encoder_aw_impl();
+ ~jpeg_encoder_aw_impl();
+
+ int init(int width, int height, int ch, int quality);
+
+    int encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const;
+
+ int encode(const unsigned char* bgrdata, const char* outfilepath) const;
+
+ int deinit();
+
+public:
+ int inited;
+ int width;
+ int height;
+ int ch;
+
+ VideoEncoder* venc;
+
+ mutable VencInputBuffer input_buffer;
+ mutable VencInputBuffer_v85x input_buffer_v85x;
+ int b_input_buffer_got;
+};
+
+jpeg_encoder_aw_impl::jpeg_encoder_aw_impl()
+{
+ inited = 0;
+ width = 0;
+ height = 0;
+ ch = 0;
+
+ venc = 0;
+
+ b_input_buffer_got = 0;
+}
+
+jpeg_encoder_aw_impl::~jpeg_encoder_aw_impl()
+{
+ deinit();
+}
+
+static int EventHandler(VideoEncoder* /*pEncoder*/, void* /*pAppData*/, VencEventType /*eEvent*/, unsigned int /*nData1*/, unsigned int /*nData2*/, void* /*pEventData*/)
+{
+ // fprintf(stderr, "EventHandler event = %d\n", eEvent);
+ return 0;
+}
+
+static int InputBufferDone(VideoEncoder* /*pEncoder*/, void* pAppData, VencCbInputBufferDoneInfo* pBufferDoneInfo)
+{
+ // fprintf(stderr, "InputBufferDone\n");
+ jpeg_encoder_aw_impl* pthis = (jpeg_encoder_aw_impl*)pAppData;
+
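+    // the encoder hands the consumed input buffer back through this callback;
+    // stash it so the next encode() can queue the same buffer again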
+ memcpy(&pthis->input_buffer_v85x, pBufferDoneInfo->pInputBuffer, sizeof(VencInputBuffer_v85x));
+
+ return 0;
+}
+
+int jpeg_encoder_aw_impl::init(int _width, int _height, int _ch, int quality)
+{
+ if (!vencoder.ready)
+ {
+ fprintf(stderr, "vencoder not ready\n");
+ return -1;
+ }
+
+ if (inited)
+ {
+ int ret = deinit();
+ if (ret != 0)
+ {
+ fprintf(stderr, "deinit failed before re-init\n");
+ return -1;
+ }
+ }
+
+ width = _width;
+ height = _height;
+ ch = _ch;
+
+ // fprintf(stderr, "width = %d\n", width);
+ // fprintf(stderr, "height = %d\n", height);
+ // fprintf(stderr, "ch = %d\n", ch);
+
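+    // the video engine appears to require 16-pixel aligned strides and plane
+    // sizes, so the buffer geometry is rounded up while the real image size
+    // is kept for nInputWidth/nInputHeight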
+ const int aligned_width = (width + 15) / 16 * 16;
+ const int aligned_height = (height + 15) / 16 * 16;
+
+ if (get_device_model() == 2)
+ {
+ venc = VencCreate(VENC_CODEC_JPEG);
+ if (!venc)
+ {
+ fprintf(stderr, "VencCreate failed\n");
+ goto OUT;
+ }
+
+ // {
+ // int vbv_size = aligned_width * aligned_height * 3 / 2;
+ // int ret = VencSetParameter(venc, VENC_IndexParamSetVbvSize, (void*)&vbv_size);
+ // if (ret)
+ // {
+ // fprintf(stderr, "VencSetParameter VENC_IndexParamSetVbvSize failed %d\n", ret);
+ // goto OUT;
+ // }
+ // }
+
+ {
+ int ret = VencSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
+ if (ret)
+ {
+ fprintf(stderr, "VencSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ int enc_mode = 0;
+ int ret = VencSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
+ if (ret)
+ {
+ fprintf(stderr, "VencSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ VencBaseConfig_v85x config;
+ memset(&config, 0, sizeof(config));
+ config.nInputWidth = width;
+ config.nInputHeight = height;
+ config.nDstWidth = width;
+ config.nDstHeight = height;
+ config.nStride = aligned_width;
+ config.eInputFormat = VENC_PIXEL_YUV420SP;
+
+ int ret = VencInit(venc, &config);
+ if (ret)
+ {
+ fprintf(stderr, "VencInit failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ VencAllocateBufferParam bufferParam;
+ bufferParam.nSizeY = aligned_width * aligned_height;
+ bufferParam.nSizeC = aligned_width * aligned_height / 2;
+ bufferParam.nBufferNum = 1;
+
+ int ret = VencAllocateInputBuf(venc, &bufferParam, &input_buffer_v85x);
+ if (ret)
+ {
+ fprintf(stderr, "VencAllocateInputBuf failed %d\n", ret);
+ goto OUT;
+ }
+
+ b_input_buffer_got = 1;
+ }
+
+ {
+ VencCbType vencCallBack;
+ vencCallBack.EventHandler = EventHandler;
+ vencCallBack.InputBufferDone = InputBufferDone;
+
+ int ret = VencSetCallbacks(venc, &vencCallBack, this);
+ if (ret)
+ {
+ fprintf(stderr, "VencSetCallbacks failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = VencStart(venc);
+ if (ret)
+ {
+ fprintf(stderr, "VencStart failed %d\n", ret);
+ goto OUT;
+ }
+ }
+ }
+ else
+ {
+ venc = VideoEncCreate(VENC_CODEC_JPEG);
+ if (!venc)
+ {
+ fprintf(stderr, "VideoEncCreate failed\n");
+ goto OUT;
+ }
+
+ {
+ int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
+ if (ret)
+ {
+ fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ int enc_mode = 0;
+ int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
+ if (ret)
+ {
+ fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ VencBaseConfig config;
+ memset(&config, 0, sizeof(config));
+ config.nInputWidth = width;
+ config.nInputHeight = height;
+ config.nDstWidth = width;
+ config.nDstHeight = height;
+ config.nStride = aligned_width;
+ config.eInputFormat = VENC_PIXEL_YUV420SP;
+
+ int ret = VideoEncInit(venc, &config);
+ if (ret)
+ {
+ fprintf(stderr, "VideoEncInit failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ VencAllocateBufferParam bufferParam;
+ bufferParam.nSizeY = aligned_width * aligned_height;
+ bufferParam.nSizeC = aligned_width * aligned_height / 2;
+ bufferParam.nBufferNum = 1;
+
+ int ret = AllocInputBuffer(venc, &bufferParam);
+ if (ret)
+ {
+ fprintf(stderr, "AllocInputBuffer failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = GetOneAllocInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "GetOneAllocInputBuffer failed %d\n", ret);
+ goto OUT;
+ }
+
+ b_input_buffer_got = 1;
+ }
+ }
+
+ inited = 1;
+
+ return 0;
+
+OUT:
+ deinit();
+
+ return -1;
+}
+
+int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const
+{
+ outdata.clear();
+
+ if (!inited)
+ {
+ fprintf(stderr, "not inited\n");
+ return -1;
+ }
+
+ int ret_val = 0;
+
+ VencOutputBuffer output_buffer;
+ int b_output_buffer_got = 0;
+
+ const int aligned_width = (width + 15) / 16 * 16;
+
+ if (get_device_model() == 2)
+ {
+ if (ch == 1)
+ {
+ gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+ if (ch == 3)
+ {
+ bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+ if (ch == 4)
+ {
+ bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+
+ {
+ input_buffer_v85x.bNeedFlushCache = 1;
+
+ int ret = VencQueueInputBuf(venc, &input_buffer_v85x);
+ if (ret)
+ {
+ fprintf(stderr, "VencQueueInputBuf failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ while (1)
+ {
+ int ret = VencDequeueOutputBuf(venc, &output_buffer);
+ if (ret == 5)
+ {
+                // VENC_RESULT_BITSTREAM_IS_EMPTY
+                // no output yet, wait for the encoder to finish this frame
+ usleep(10*1000);
+ continue;
+ }
+ if (ret)
+ {
+ fprintf(stderr, "VencDequeueOutputBuf failed %d\n", ret);
+ goto OUT;
+ }
+
+ b_output_buffer_got = 1;
+ break;
+ }
+ }
+ else
+ {
+ if (ch == 1)
+ {
+ gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+ if (ch == 3)
+ {
+ bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+ if (ch == 4)
+ {
+ bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+
+ {
+ int ret = FlushCacheAllocInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "FlushCacheAllocInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = AddOneInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "AddOneInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = VideoEncodeOneFrame(venc);
+ if (ret)
+ {
+ fprintf(stderr, "VideoEncodeOneFrame failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = AlreadyUsedInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "AlreadyUsedInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = GetOneBitstreamFrame(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "GetOneBitstreamFrame failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ b_output_buffer_got = 1;
+ }
+ }
+
+ outdata.resize(output_buffer.nSize0 + output_buffer.nSize1);
+ memcpy(outdata.data(), output_buffer.pData0, output_buffer.nSize0);
+ if (output_buffer.nSize1)
+ {
+ memcpy(outdata.data() + output_buffer.nSize0, output_buffer.pData1, output_buffer.nSize1);
+ }
+
+OUT:
+ if (b_output_buffer_got)
+ {
+ if (get_device_model() == 2)
+ {
+ int ret = VencQueueOutputBuf(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "VencQueueOutputBuf failed %d\n", ret);
+ ret_val = -1;
+ }
+ }
+ else
+ {
+ int ret = FreeOneBitStreamFrame(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "FreeOneBitStreamFrame failed %d\n", ret);
+ ret_val = -1;
+ }
+ }
+ }
+
+ return ret_val;
+}
+
+int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, const char* outfilepath) const
+{
+ if (!inited)
+ {
+ fprintf(stderr, "not inited\n");
+ return -1;
+ }
+
+ int ret_val = 0;
+
+ VencOutputBuffer output_buffer;
+ int b_output_buffer_got = 0;
+
+ const int aligned_width = (width + 15) / 16 * 16;
+
+ FILE* fp = 0;
+
+ if (get_device_model() == 2)
+ {
+ // fprintf(stderr, "a\n");
+ if (ch == 1)
+ {
+ gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+ if (ch == 3)
+ {
+ bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+ if (ch == 4)
+ {
+ bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer_v85x.pAddrVirY, (unsigned char*)input_buffer_v85x.pAddrVirC, aligned_width);
+ }
+
+ {
+ input_buffer_v85x.bNeedFlushCache = 1;
+
+ int ret = VencQueueInputBuf(venc, &input_buffer_v85x);
+ if (ret)
+ {
+ fprintf(stderr, "VencQueueInputBuf failed %d\n", ret);
+ goto OUT;
+ }
+ }
+
+ while (1)
+ {
+ int ret = VencDequeueOutputBuf(venc, &output_buffer);
+ if (ret == 5)
+ {
+                // VENC_RESULT_BITSTREAM_IS_EMPTY
+                // no output yet, wait for the encoder to finish this frame
+ usleep(10*1000);
+ continue;
+ }
+ if (ret)
+ {
+ fprintf(stderr, "VencDequeueOutputBuf failed %d\n", ret);
+ goto OUT;
+ }
+
+ b_output_buffer_got = 1;
+ break;
+ }
+ }
+ else
+ {
+ if (ch == 1)
+ {
+ gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+ if (ch == 3)
+ {
+ bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+ if (ch == 4)
+ {
+ bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width);
+ }
+
+ {
+ int ret = FlushCacheAllocInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "FlushCacheAllocInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = AddOneInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "AddOneInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = VideoEncodeOneFrame(venc);
+ if (ret)
+ {
+ fprintf(stderr, "VideoEncodeOneFrame failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = AlreadyUsedInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "AlreadyUsedInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+ }
+
+ {
+ int ret = GetOneBitstreamFrame(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "GetOneBitstreamFrame failed %d\n", ret);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ b_output_buffer_got = 1;
+ }
+ }
+
+ fp = fopen(outfilepath, "wb");
+ if (!fp)
+ {
+ fprintf(stderr, "fopen %s failed\n", outfilepath);
+ ret_val = -1;
+ goto OUT;
+ }
+
+ fwrite(output_buffer.pData0, 1, output_buffer.nSize0, fp);
+ if (output_buffer.nSize1)
+ {
+ fwrite(output_buffer.pData1, 1, output_buffer.nSize1, fp);
+ }
+
+OUT:
+ if (b_output_buffer_got)
+ {
+ if (get_device_model() == 2)
+ {
+ int ret = VencQueueOutputBuf(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "VencQueueOutputBuf failed %d\n", ret);
+ ret_val = -1;
+ }
+ }
+ else
+ {
+ int ret = FreeOneBitStreamFrame(venc, &output_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "FreeOneBitStreamFrame failed %d\n", ret);
+ ret_val = -1;
+ }
+ }
+ }
+
+ if (fp)
+ {
+ fclose(fp);
+ }
+
+ return ret_val;
+}
+
+int jpeg_encoder_aw_impl::deinit()
+{
+    // no early return on !inited here: the error path of init() calls
+    // deinit() before inited is set and relies on it to release venc and
+    // any input buffer acquired so far
+
+ int ret_val = 0;
+
+ if (b_input_buffer_got)
+ {
+ if (get_device_model() == 2)
+ {
+ // free input_buffer_v85x ?
+ }
+ else
+ {
+ int ret = ReturnOneAllocInputBuffer(venc, &input_buffer);
+ if (ret)
+ {
+ fprintf(stderr, "ReturnOneAllocInputBuffer failed %d\n", ret);
+ ret_val = -1;
+ }
+ }
+
+ b_input_buffer_got = 0;
+ }
+
+ if (venc)
+ {
+ if (get_device_model() == 2)
+ {
+ VencPause(venc);
+ VencDestroy(venc);
+ }
+ else
+ {
+ VideoEncDestroy(venc);
+ }
+
+ venc = 0;
+ }
+
+ width = 0;
+ height = 0;
+ ch = 0;
+
+ inited = 0;
+
+ return ret_val;
+}
+
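+// accept gray/bgr/bgra input with even dimensions not smaller than 8x8 (the
+// yuv420sp converters work on 2x2 blocks) and cap the resolution at 16
+// megapixels, presumably a conservative hardware limit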
+bool jpeg_encoder_aw::supported(int width, int height, int ch)
+{
+ if (!vencoder.ready)
+ return false;
+
+ if (ch != 1 && ch != 3 && ch != 4)
+ return false;
+
+ if (width % 2 != 0 || height % 2 != 0)
+ return false;
+
+ if (width < 8 || height < 8)
+ return false;
+
+ if (width * height > 4000 * 4000)
+ return false;
+
+ return true;
+}
+
+jpeg_encoder_aw::jpeg_encoder_aw() : d(new jpeg_encoder_aw_impl)
+{
+}
+
+jpeg_encoder_aw::~jpeg_encoder_aw()
+{
+ delete d;
+}
+
+int jpeg_encoder_aw::init(int width, int height, int ch, int quality)
+{
+ return d->init(width, height, ch, quality);
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const
+{
+ return d->encode(bgrdata, outdata);
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* bgrdata, const char* outfilepath) const
+{
+ return d->encode(bgrdata, outfilepath);
+}
+
+int jpeg_encoder_aw::deinit()
+{
+ return d->deinit();
+}
+
+#else // defined __linux__
+
+bool jpeg_encoder_aw::supported(int /*width*/, int /*height*/, int /*ch*/)
+{
+ return false;
+}
+
+jpeg_encoder_aw::jpeg_encoder_aw() : d(0)
+{
+}
+
+jpeg_encoder_aw::~jpeg_encoder_aw()
+{
+}
+
+int jpeg_encoder_aw::init(int /*width*/, int /*height*/, int /*ch*/, int /*quality*/)
+{
+ return -1;
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* /*bgrdata*/, std::vector<unsigned char>& /*outdata*/) const
+{
+ return -1;
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* /*bgrdata*/, const char* /*outfilepath*/) const
+{
+ return -1;
+}
+
+int jpeg_encoder_aw::deinit()
+{
+ return -1;
+}
+
+#endif // defined __linux__
diff --git a/highgui/src/jpeg_encoder_aw.h b/highgui/src/jpeg_encoder_aw.h
new file mode 100644
index 00000000..c70dfa2a
--- /dev/null
+++ b/highgui/src/jpeg_encoder_aw.h
@@ -0,0 +1,43 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef JPEG_ENCODER_AW_H
+#define JPEG_ENCODER_AW_H
+
+#include <vector>
+
+class jpeg_encoder_aw_impl;
+class jpeg_encoder_aw
+{
+public:
+ static bool supported(int width, int height, int ch);
+
+ jpeg_encoder_aw();
+ ~jpeg_encoder_aw();
+
+ int init(int width, int height, int ch, int quality);
+
+    int encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const;
+
+ int encode(const unsigned char* bgrdata, const char* outfilepath) const;
+
+ int deinit();
+
+private:
+ jpeg_encoder_aw_impl* const d;
+};
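+
+// minimal usage sketch (error handling omitted):
+//   jpeg_encoder_aw e;
+//   if (jpeg_encoder_aw::supported(w, h, 3) && e.init(w, h, 3, 90) == 0)
+//   {
+//       e.encode(bgrdata, "/tmp/out.jpg");
+//       e.deinit();
+//   }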
+
+#endif // JPEG_ENCODER_AW_H
diff --git a/highgui/src/kanna_rotate.cpp b/highgui/src/kanna_rotate.cpp
new file mode 100644
index 00000000..dda97f5e
--- /dev/null
+++ b/highgui/src/kanna_rotate.cpp
@@ -0,0 +1,6112 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "kanna_rotate.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// a kanna ascii art should be here, as in my local branch,
+// but we shall ask the original art author for permission first ...
+// https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/
+
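+// each kanna_rotate_N_cX below handles one rotation type for X-channel
+// images; the N values appear to follow the JPEG EXIF orientation
+// convention: 1 = copy, 2 = horizontal flip, 3 = rotate 180, 4 = vertical flip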
+static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+ const int wgap = stride - w;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dst;
+ unsigned char* dst1 = dst + stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = srcw >> 5;
+ int remain = srcw - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 += wgap + stride;
+ dst1 += wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = srcw >> 5;
+ int remain = srcw - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+ const int wgap = stride - w * 2;
+
+ int size = srcw * 2;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dst;
+ unsigned char* dst1 = dst + stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 += wgap + stride;
+ dst1 += wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+ const int wgap = stride - w * 3;
+
+ int size = srcw * 3;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dst;
+ unsigned char* dst1 = dst + stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 += wgap + stride;
+ dst1 += wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+ const int wgap = stride - w * 4;
+
+ int size = srcw * 4;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dst;
+ unsigned char* dst1 = dst + stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 += wgap + stride;
+ dst1 += wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+ const int wgap = stride + w;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dst + w - 1;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 15;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src = vld1_u8(src0);
+ uint8x8_t _src2 = vld1_u8(src0 + 8);
+
+ _src = vrev64_u8(_src);
+ _src2 = vrev64_u8(_src2);
+
+ vst1_u8(dst0, _src2);
+ vst1_u8(dst0 + 8, _src);
+
+ src0 += 16;
+ dst0 -= 16;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-16 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.u8 {d0-d1}, [%1]! \n"
+ "vrev64.u8 d3, d0 \n"
+ "vrev64.u8 d2, d1 \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d2-d3}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 15;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 -= 1;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+ const int wgap = stride + w * 2;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dst + w * 2 - 2;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 2;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src = vld2_u8(src0);
+ uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+
+ vst2_u8(dst0, _src);
+ vst2_u8(dst0 - 8 * 2, _src2);
+
+ src0 += 16 * 2;
+ dst0 -= 16 * 2;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-16 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d2-d3}, [%1]! \n"
+ "vrev64.u8 d1, d1 \n"
+ "vrev64.u8 d2, d2 \n"
+ "vst2.u8 {d0-d1}, [%2], r4 \n"
+ "vrev64.u8 d3, d3 \n"
+ "subs %0, #1 \n"
+ "vst2.u8 {d2-d3}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 2;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 -= 2;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+ const int wgap = stride + w * 3;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dst + w * 3 - 3;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 3;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src = vld3_u8(src0);
+ uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+ _src.val[2] = vrev64_u8(_src.val[2]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+ _src2.val[2] = vrev64_u8(_src2.val[2]);
+
+ vst3_u8(dst0, _src);
+ vst3_u8(dst0 - 8 * 3, _src2);
+
+ src0 += 16 * 3;
+ dst0 -= 16 * 3;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-24 \n"
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "vrev64.u8 d1, d1 \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d4-d6}, [%1]! \n"
+ "vrev64.u8 d2, d2 \n"
+ "vrev64.u8 d4, d4 \n"
+ "vst3.u8 {d0-d2}, [%2], r4 \n"
+ "vrev64.u8 d5, d5 \n"
+ "vrev64.u8 d6, d6 \n"
+ "subs %0, #1 \n"
+ "vst3.u8 {d4-d6}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 3;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 -= 3;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+ const int wgap = stride + w * 4;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dst + w * 4 - 4;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 4;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src = vld4_u8(src0);
+ uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+ _src.val[2] = vrev64_u8(_src.val[2]);
+ _src.val[3] = vrev64_u8(_src.val[3]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+ _src2.val[2] = vrev64_u8(_src2.val[2]);
+ _src2.val[3] = vrev64_u8(_src2.val[3]);
+
+ vst4_u8(dst0, _src);
+ vst4_u8(dst0 - 8 * 4, _src2);
+
+ src0 += 16 * 4;
+ dst0 -= 16 * 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-32 \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "vrev64.u8 d1, d1 \n"
+ "vrev64.u8 d2, d2 \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d4-d7}, [%1]! \n"
+ "vrev64.u8 d3, d3 \n"
+ "vrev64.u8 d4, d4 \n"
+ "vrev64.u8 d5, d5 \n"
+ "vst4.u8 {d0-d3}, [%2], r4 \n"
+ "vrev64.u8 d6, d6 \n"
+ "vrev64.u8 d7, d7 \n"
+ "subs %0, #1 \n"
+ "vst4.u8 {d4-d7}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 4;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 -= 4;
+ }
+
+ src0 += srcwgap;
+ dst0 += wgap;
+ }
+}
+
+static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+ const int wgap = stride - w;
+
+ // point to the last dst pixel
+ unsigned char* dstend = dst + stride * h - wgap;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dstend - 1;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 15;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src = vld1_u8(src0);
+ uint8x8_t _src2 = vld1_u8(src0 + 8);
+
+ _src = vrev64_u8(_src);
+ _src2 = vrev64_u8(_src2);
+
+ vst1_u8(dst0, _src2);
+ vst1_u8(dst0 + 8, _src);
+
+ src0 += 16;
+ dst0 -= 16;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-16 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld1.u8 {d0-d1}, [%1]! \n"
+ "vrev64.u8 d3, d0 \n"
+ "vrev64.u8 d2, d1 \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d2-d3}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 15;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 -= 1;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
+static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+ const int wgap = stride - w * 2;
+
+ // point to the last dst pixel
+ unsigned char* dstend = dst + stride * h - wgap;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dstend - 2;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 2;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src = vld2_u8(src0);
+ uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+
+ vst2_u8(dst0, _src);
+ vst2_u8(dst0 - 8 * 2, _src2);
+
+ src0 += 16 * 2;
+ dst0 -= 16 * 2;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-16 \n"
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d2-d3}, [%1]! \n"
+ "vrev64.u8 d1, d1 \n"
+ "vrev64.u8 d2, d2 \n"
+ "vst2.u8 {d0-d1}, [%2], r4 \n"
+ "vrev64.u8 d3, d3 \n"
+ "subs %0, #1 \n"
+ "vst2.u8 {d2-d3}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 2;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 -= 2;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
+static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+ const int wgap = stride - w * 3;
+
+ // point to the last dst pixel
+ unsigned char* dstend = dst + stride * h - wgap;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dstend - 3;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 3;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src = vld3_u8(src0);
+ uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+ _src.val[2] = vrev64_u8(_src.val[2]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+ _src2.val[2] = vrev64_u8(_src2.val[2]);
+
+ vst3_u8(dst0, _src);
+ vst3_u8(dst0 - 8 * 3, _src2);
+
+ src0 += 16 * 3;
+ dst0 -= 16 * 3;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-24 \n"
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "vrev64.u8 d1, d1 \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d4-d6}, [%1]! \n"
+ "vrev64.u8 d2, d2 \n"
+ "vrev64.u8 d4, d4 \n"
+ "vst3.u8 {d0-d2}, [%2], r4 \n"
+ "vrev64.u8 d5, d5 \n"
+ "vrev64.u8 d6, d6 \n"
+ "subs %0, #1 \n"
+ "vst3.u8 {d4-d6}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 3;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 -= 3;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
+static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+ const int wgap = stride - w * 4;
+
+ // point to the last dst pixel
+ unsigned char* dstend = dst + stride * h - wgap;
+
+ const unsigned char* src0 = src;
+ unsigned char* dst0 = dstend - 4;
+
+ int y = 0;
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ dst0 -= 7 * 4;
+
+ int nn = srcw >> 4;
+ int remain = srcw - (nn << 4);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src = vld4_u8(src0);
+ uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
+
+ _src.val[0] = vrev64_u8(_src.val[0]);
+ _src.val[1] = vrev64_u8(_src.val[1]);
+ _src.val[2] = vrev64_u8(_src.val[2]);
+ _src.val[3] = vrev64_u8(_src.val[3]);
+
+ _src2.val[0] = vrev64_u8(_src2.val[0]);
+ _src2.val[1] = vrev64_u8(_src2.val[1]);
+ _src2.val[2] = vrev64_u8(_src2.val[2]);
+ _src2.val[3] = vrev64_u8(_src2.val[3]);
+
+ vst4_u8(dst0, _src);
+ vst4_u8(dst0 - 8 * 4, _src2);
+
+ src0 += 16 * 4;
+ dst0 -= 16 * 4;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "mov r4, #-32 \n"
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1]! \n"
+ "vrev64.u8 d0, d0 \n"
+ "vrev64.u8 d1, d1 \n"
+ "vrev64.u8 d2, d2 \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d4-d7}, [%1]! \n"
+ "vrev64.u8 d3, d3 \n"
+ "vrev64.u8 d4, d4 \n"
+ "vrev64.u8 d5, d5 \n"
+ "vst4.u8 {d0-d3}, [%2], r4 \n"
+ "vrev64.u8 d6, d6 \n"
+ "vrev64.u8 d7, d7 \n"
+ "subs %0, #1 \n"
+ "vst4.u8 {d4-d7}, [%2], r4 \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
+ }
+#endif // __aarch64__
+
+ dst0 += 7 * 4;
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 -= 4;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
+static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+ const int wgap = stride + w;
+
+ // point to the last dst pixel row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dstend;
+ unsigned char* dst1 = dstend - stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = srcw >> 5;
+ int remain = srcw - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 -= wgap + stride;
+ dst1 -= wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = srcw >> 5;
+ int remain = srcw - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = srcw;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
+static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+ const int wgap = stride + w * 2;
+
+ // point to the last dst pixel row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ int size = srcw * 2;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dstend;
+ unsigned char* dst1 = dstend - stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 -= wgap + stride;
+ dst1 -= wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
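+// kanna_rotate_4_c3: vertical flip for 3-channel images, a bottom-up row
+// copy over srcw * 3 bytes per row.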
+static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+ const int wgap = stride + w * 3;
+
+    // dstend points to the first pixel of the last dst row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ int size = srcw * 3;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dstend;
+ unsigned char* dst1 = dstend - stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 -= wgap + stride;
+ dst1 -= wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
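+// kanna_rotate_4_c4: vertical flip for 4-channel images, a bottom-up row
+// copy over srcw * 4 bytes per row.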
+static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+ const int wgap = stride + w * 4;
+
+    // dstend points to the first pixel of the last dst row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ int size = srcw * 4;
+
+ const unsigned char* src0 = src;
+ const unsigned char* src1 = src + srcstride;
+ unsigned char* dst0 = dstend;
+ unsigned char* dst1 = dstend - stride;
+
+ int y = 0;
+ for (; y + 1 < srch; y += 2)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src0 = vld1q_u8(src0);
+ uint8x16_t _src0n = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src0);
+ vst1q_u8(dst0 + 16, _src0n);
+
+ uint8x16_t _src1 = vld1q_u8(src1);
+ uint8x16_t _src1n = vld1q_u8(src1 + 16);
+ vst1q_u8(dst1, _src1);
+ vst1q_u8(dst1 + 16, _src1n);
+
+ src0 += 32;
+ src1 += 32;
+ dst0 += 32;
+ dst1 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "pld [%2, #256] \n"
+ "vld1.u8 {d4-d7}, [%2]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%3]! \n"
+ "vst1.u8 {d4-d7}, [%4]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1)
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ *dst1++ = *src1++;
+ }
+
+ src0 += srcwgap + srcstride;
+ src1 += srcwgap + srcstride;
+ dst0 -= wgap + stride;
+ dst1 -= wgap + stride;
+ }
+
+ for (; y < srch; y++)
+ {
+#if __ARM_NEON
+ int nn = size >> 5;
+ int remain = size - (nn << 5);
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x16_t _src = vld1q_u8(src0);
+ uint8x16_t _src2 = vld1q_u8(src0 + 16);
+ vst1q_u8(dst0, _src);
+ vst1q_u8(dst0 + 16, _src2);
+
+ src0 += 32;
+ dst0 += 32;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld1.u8 {d0-d3}, [%1]! \n"
+ "subs %0, #1 \n"
+ "vst1.u8 {d0-d3}, [%2]! \n"
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(dst0) // %2
+ : "0"(nn),
+ "1"(src0),
+ "2"(dst0)
+ : "cc", "memory", "q0", "q1");
+ }
+#endif // __aarch64__
+#else
+ int remain = size;
+#endif // __ARM_NEON
+
+ for (; remain > 0; remain--)
+ {
+ *dst0++ = *src0++;
+ }
+
+ src0 += srcwgap;
+ dst0 -= wgap;
+ }
+}
+
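+// kanna_rotate_5_c1: transpose (rotate type 5) for 1-channel images.
+// Eight source rows are handled per iteration: an 8x8 byte tile is
+// transposed entirely in registers by a vtrn ladder -- vtrn.u8 swaps
+// 1-byte lanes, vtrn.u16 2-byte pairs, vtrn.u32 4-byte quads -- and the
+// eight result vectors are stored to consecutive dst rows, each holding
+// one former source column. Leftover columns and trailing rows fall
+// through to scalar loops.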
+static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dst + y;
+ unsigned char* dst1 = dst + y + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src0 = vld1_u8(src0);
+ uint8x8_t _src1 = vld1_u8(src1);
+
+ uint8x8_t _src2 = vld1_u8(src0 + src_step);
+ uint8x8_t _src3 = vld1_u8(src1 + src_step);
+
+ uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
+ uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
+
+ uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
+ uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ vst1_u8(dst0, _dst0);
+ vst1_u8(dst1, _dst1);
+ vst1_u8(dst0 + dst_step, _dst2);
+ vst1_u8(dst1 + dst_step, _dst3);
+ vst1_u8(dst0 + 2 * dst_step, _dst4);
+ vst1_u8(dst1 + 2 * dst_step, _dst5);
+ vst1_u8(dst0 + 3 * dst_step, _dst6);
+ vst1_u8(dst1 + 3 * dst_step, _dst7);
+
+ src0 += 8;
+ src1 += 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #64] \n"
+ "vld1.u8 {d0}, [%1], %10 \n"
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d1}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d2}, [%1], %10 \n"
+
+ "vtrn.u8 d0, d1 \n" // _src01t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d3}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d4}, [%1], %10 \n"
+
+ "vtrn.u8 d2, d3 \n" // _src23t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d5}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d6}, [%1], %10 \n"
+
+ "vtrn.u8 d4, d5 \n" // _src45t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d7}, [%2], %10 \n"
+
+ "vtrn.u8 d6, d7 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q3 \n" // _src13tt_r _src46tt_r
+
+ "add %1, #8 \n" // src0 += 8
+
+ "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
+
+ "add %2, #8 \n" // src1 += 8
+
+ "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
+ "vst1.u8 {d0}, [%3], %11 \n"
+ "vst1.u8 {d1}, [%4], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst1.u8 {d2}, [%3], %11 \n"
+ "vst1.u8 {d3}, [%4], %11 \n"
+ "vst1.u8 {d4}, [%3], %11 \n"
+ "vst1.u8 {d5}, [%4], %11 \n"
+ "vst1.u8 {d6}, [%3], %11 \n"
+ "vst1.u8 {d7}, [%4], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src1[0];
+ dst0[2] = src0[0 + src_step];
+ dst0[3] = src1[0 + src_step];
+ dst0[4] = src0[0 + 2 * src_step];
+ dst0[5] = src1[0 + 2 * src_step];
+ dst0[6] = src0[0 + 3 * src_step];
+ dst0[7] = src1[0 + 3 * src_step];
+
+ src0 += 1;
+ src1 += 1;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dst + y;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
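+// kanna_rotate_5_c2: transpose for 2-channel images. Same 8x8 tile
+// transpose as the c1 variant, but vld2/vst2 deinterleave and reinterleave
+// the two channel planes so each pixel moves as a unit, with one vtrn
+// ladder per plane.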
+static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dst + y * 2;
+ unsigned char* dst1 = dst + y * 2 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src0 = vld2_u8(src0);
+ uint8x8x2_t _src1 = vld2_u8(src1);
+
+ uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
+ uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
+
+ uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
+ uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
+
+ uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
+ uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint8x8x2_t _dst0;
+ uint8x8x2_t _dst1;
+ uint8x8x2_t _dst2;
+ uint8x8x2_t _dst3;
+ uint8x8x2_t _dst4;
+ uint8x8x2_t _dst5;
+ uint8x8x2_t _dst6;
+ uint8x8x2_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ vst2_u8(dst0, _dst0);
+ vst2_u8(dst1, _dst1);
+ vst2_u8(dst0 + dst_step, _dst2);
+ vst2_u8(dst1 + dst_step, _dst3);
+ vst2_u8(dst0 + 2 * dst_step, _dst4);
+ vst2_u8(dst1 + 2 * dst_step, _dst5);
+ vst2_u8(dst0 + 3 * dst_step, _dst6);
+ vst2_u8(dst1 + 3 * dst_step, _dst7);
+
+ src0 += 2 * 8;
+ src1 += 2 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1], %10 \n"
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d2-d3}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d4-d5}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q1 \n" // _src01t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d6-d7}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d16-d17}, [%1], %10\n"
+
+ "vtrn.u8 q2, q3 \n" // _src23t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d18-d19}, [%2], %10\n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d20-d21}, [%1], %10\n"
+
+ "vtrn.u8 q8, q9 \n" // _src45t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d22-d23}, [%2], %10\n"
+
+ "vtrn.u8 q10, q11 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q2 \n" // _src02tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q1, q3 \n" // _src13tt_r
+
+ "add %1, #16 \n" // src0 += 16
+
+ "vtrn.u16 q8, q10 \n" // _src46tt_r
+
+ "add %2, #16 \n" // src1 += 16
+
+ "vtrn.u16 q9, q11 \n" // _src57tt_r
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+
+ "vtrn.u32 q1, q9 \n" // _src15ttt_r
+ "vst2.u8 {d0-d1}, [%3], %11 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src26ttt_r
+ "vst2.u8 {d2-d3}, [%4], %11 \n"
+
+ "vtrn.u32 q3, q11 \n" // _src37ttt_r
+ "vst2.u8 {d4-d5}, [%3], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst2.u8 {d6-d7}, [%4], %11 \n"
+ "vst2.u8 {d16-d17}, [%3], %11\n"
+ "vst2.u8 {d18-d19}, [%4], %11\n"
+ "vst2.u8 {d20-d21}, [%3], %11\n"
+ "vst2.u8 {d22-d23}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src1[0];
+ dst0[3] = src1[1];
+ dst0[4] = src0[0 + src_step];
+ dst0[5] = src0[1 + src_step];
+ dst0[6] = src1[0 + src_step];
+ dst0[7] = src1[1 + src_step];
+ dst0[8] = src0[0 + 2 * src_step];
+ dst0[9] = src0[1 + 2 * src_step];
+ dst0[10] = src1[0 + 2 * src_step];
+ dst0[11] = src1[1 + 2 * src_step];
+ dst0[12] = src0[0 + 3 * src_step];
+ dst0[13] = src0[1 + 3 * src_step];
+ dst0[14] = src1[0 + 3 * src_step];
+ dst0[15] = src1[1 + 3 * src_step];
+
+ src0 += 2;
+ src1 += 2;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dst + y * 2;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
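+// kanna_rotate_5_c3: transpose for 3-channel images, using vld3/vst3 and
+// one vtrn ladder per channel plane.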
+static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dst + y * 3;
+ unsigned char* dst1 = dst + y * 3 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src0 = vld3_u8(src0);
+ uint8x8x3_t _src1 = vld3_u8(src1);
+
+ uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
+ uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
+
+ uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
+ uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
+
+ uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
+ uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
+
+ uint8x8x3_t _dst0;
+ uint8x8x3_t _dst1;
+ uint8x8x3_t _dst2;
+ uint8x8x3_t _dst3;
+ uint8x8x3_t _dst4;
+ uint8x8x3_t _dst5;
+ uint8x8x3_t _dst6;
+ uint8x8x3_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+
+ vst3_u8(dst0, _dst0);
+ vst3_u8(dst1, _dst1);
+ vst3_u8(dst0 + dst_step, _dst2);
+ vst3_u8(dst1 + dst_step, _dst3);
+ vst3_u8(dst0 + 2 * dst_step, _dst4);
+ vst3_u8(dst1 + 2 * dst_step, _dst5);
+ vst3_u8(dst0 + 3 * dst_step, _dst6);
+ vst3_u8(dst1 + 3 * dst_step, _dst7);
+
+ src0 += 3 * 8;
+ src1 += 3 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1], %10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d4-d6}, [%2], %10 \n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d8-d10}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q2 \n" // _src01t_r
+ "vtrn.u8 d2, d6 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d12-d14}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d16-d18}, [%1], %10\n"
+
+ "vtrn.u8 q4, q6 \n" // _src23t_r
+ "vtrn.u8 d10, d14 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d20-d22}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d24-d26}, [%1], %10\n"
+
+ "vtrn.u8 q8, q10 \n" // _src45t_r
+ "vtrn.u8 d18, d22 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d28-d30}, [%2], %10\n"
+
+ "vtrn.u8 q12, q14 \n" // _src67t_r
+ "vtrn.u8 d26, d30 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q4 \n" // _src02tt_r
+ "vtrn.u16 d2, d10 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q6 \n" // _src13tt_r
+ "vtrn.u16 d6, d14 \n"
+
+ "add %1, #24 \n" // src0 += 24
+
+ "vtrn.u16 q8, q12 \n" // _src46tt_r
+ "vtrn.u16 d18, d26 \n"
+
+ "add %2, #24 \n" // src1 += 24
+
+ "vtrn.u16 q10, q14 \n" // _src57tt_r
+ "vtrn.u16 d22, d30 \n"
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+ "vtrn.u32 d2, d18 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src15ttt_r
+ "vst3.u8 {d0-d2}, [%3], %11 \n"
+ "vtrn.u32 d6, d22 \n"
+
+ "vtrn.u32 q4, q12 \n" // _src26ttt_r
+ "vst3.u8 {d4-d6}, [%4], %11 \n"
+ "vtrn.u32 d10, d26 \n"
+
+ "vtrn.u32 q6, q14 \n" // _src37ttt_r
+ "vst3.u8 {d8-d10}, [%3], %11 \n"
+ "vtrn.u32 d14, d30 \n"
+
+ "subs %0, #1 \n"
+
+ "vst3.u8 {d16-d18}, [%3], %11\n"
+ "vst3.u8 {d12-d14}, [%4], %11\n"
+ "vst3.u8 {d20-d22}, [%4], %11\n"
+ "vst3.u8 {d24-d26}, [%3], %11\n"
+ "vst3.u8 {d28-d30}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src1[0];
+ dst0[4] = src1[1];
+ dst0[5] = src1[2];
+ dst0[6] = src0[0 + src_step];
+ dst0[7] = src0[1 + src_step];
+ dst0[8] = src0[2 + src_step];
+ dst0[9] = src1[0 + src_step];
+ dst0[10] = src1[1 + src_step];
+ dst0[11] = src1[2 + src_step];
+ dst0[12] = src0[0 + 2 * src_step];
+ dst0[13] = src0[1 + 2 * src_step];
+ dst0[14] = src0[2 + 2 * src_step];
+ dst0[15] = src1[0 + 2 * src_step];
+ dst0[16] = src1[1 + 2 * src_step];
+ dst0[17] = src1[2 + 2 * src_step];
+ dst0[18] = src0[0 + 3 * src_step];
+ dst0[19] = src0[1 + 3 * src_step];
+ dst0[20] = src0[2 + 3 * src_step];
+ dst0[21] = src1[0 + 3 * src_step];
+ dst0[22] = src1[1 + 3 * src_step];
+ dst0[23] = src1[2 + 3 * src_step];
+
+ src0 += 3;
+ src1 += 3;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dst + y * 3;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
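+// kanna_rotate_5_c4: transpose for 4-channel images, using vld4/vst4 and
+// one vtrn ladder per channel plane.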
+static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dst + y * 4;
+ unsigned char* dst1 = dst + y * 4 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src0 = vld4_u8(src0);
+ uint8x8x4_t _src1 = vld4_u8(src1);
+
+ uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
+ uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
+
+ uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
+ uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
+
+ uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
+ uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
+
+ uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
+ uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
+ uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
+ uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
+
+ uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
+ uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
+ uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
+ uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
+
+ uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
+ uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
+ uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
+ uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
+
+ uint8x8x4_t _dst0;
+ uint8x8x4_t _dst1;
+ uint8x8x4_t _dst2;
+ uint8x8x4_t _dst3;
+ uint8x8x4_t _dst4;
+ uint8x8x4_t _dst5;
+ uint8x8x4_t _dst6;
+ uint8x8x4_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+
+ _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
+ _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
+ _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
+ _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
+ _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
+ _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
+ _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
+ _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
+
+ vst4_u8(dst0, _dst0);
+ vst4_u8(dst1, _dst1);
+ vst4_u8(dst0 + dst_step, _dst2);
+ vst4_u8(dst1 + dst_step, _dst3);
+ vst4_u8(dst0 + 2 * dst_step, _dst4);
+ vst4_u8(dst1 + 2 * dst_step, _dst5);
+ vst4_u8(dst0 + 3 * dst_step, _dst6);
+ vst4_u8(dst1 + 3 * dst_step, _dst7);
+
+ src0 += 4 * 8;
+ src1 += 4 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1], %10 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d4-d7}, [%2], %10 \n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d8-d11}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q2 \n" // _src01t_r
+ "vtrn.u8 q1, q3 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d12-d15}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d16-d19}, [%1], %10\n"
+
+ "vtrn.u8 q4, q6 \n" // _src23t_r
+ "vtrn.u8 q5, q7 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d20-d23}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d24-d27}, [%1], %10\n"
+
+ "vtrn.u8 q8, q10 \n" // _src45t_r
+ "vtrn.u8 q9, q11 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d28-d31}, [%2], %10\n"
+
+ "vtrn.u8 q12, q14 \n" // _src67t_r
+ "vtrn.u8 q13, q15 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q4 \n" // _src02tt_r
+ "vtrn.u16 q1, q5 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q6 \n" // _src13tt_r
+ "vtrn.u16 q3, q7 \n"
+
+ "add %1, #32 \n" // src0 += 32
+
+ "vtrn.u16 q8, q12 \n" // _src46tt_r
+ "vtrn.u16 q9, q13 \n"
+
+ "add %2, #32 \n" // src1 += 32
+
+ "vtrn.u16 q10, q14 \n" // _src57tt_r
+ "vtrn.u16 q11, q15 \n"
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+ "vtrn.u32 q1, q9 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src15ttt_r
+ "vst4.u8 {d0-d3}, [%3], %11 \n"
+ "vtrn.u32 q3, q11 \n"
+
+ "vtrn.u32 q4, q12 \n" // _src26ttt_r
+ "vst4.u8 {d4-d7}, [%4], %11 \n"
+ "vtrn.u32 q5, q13 \n"
+
+ "vtrn.u32 q6, q14 \n" // _src37ttt_r
+ "vst4.u8 {d8-d11}, [%3], %11 \n"
+ "vtrn.u32 q7, q15 \n"
+
+ "subs %0, #1 \n"
+
+ "vst4.u8 {d16-d19}, [%3], %11\n"
+ "vst4.u8 {d12-d15}, [%4], %11\n"
+ "vst4.u8 {d20-d23}, [%4], %11\n"
+ "vst4.u8 {d24-d27}, [%3], %11\n"
+ "vst4.u8 {d28-d31}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+ dst0[4] = src1[0];
+ dst0[5] = src1[1];
+ dst0[6] = src1[2];
+ dst0[7] = src1[3];
+ dst0[8] = src0[0 + src_step];
+ dst0[9] = src0[1 + src_step];
+ dst0[10] = src0[2 + src_step];
+ dst0[11] = src0[3 + src_step];
+ dst0[12] = src1[0 + src_step];
+ dst0[13] = src1[1 + src_step];
+ dst0[14] = src1[2 + src_step];
+ dst0[15] = src1[3 + src_step];
+ dst0[16] = src0[0 + 2 * src_step];
+ dst0[17] = src0[1 + 2 * src_step];
+ dst0[18] = src0[2 + 2 * src_step];
+ dst0[19] = src0[3 + 2 * src_step];
+ dst0[20] = src1[0 + 2 * src_step];
+ dst0[21] = src1[1 + 2 * src_step];
+ dst0[22] = src1[2 + 2 * src_step];
+ dst0[23] = src1[3 + 2 * src_step];
+ dst0[24] = src0[0 + 3 * src_step];
+ dst0[25] = src0[1 + 3 * src_step];
+ dst0[26] = src0[2 + 3 * src_step];
+ dst0[27] = src0[3 + 3 * src_step];
+ dst0[28] = src1[0 + 3 * src_step];
+ dst0[29] = src1[1 + 3 * src_step];
+ dst0[30] = src1[2 + 3 * src_step];
+ dst0[31] = src1[3 + 3 * src_step];
+
+ src0 += 4;
+ src1 += 4;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dst + y * 4;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
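+// kanna_rotate_6_c1: rotate 90 degrees clockwise (rotate type 6) for
+// 1-channel images: the pixel at source (y, x) lands at dst (x, w-1-y).
+// This reuses the 8x8 vtrn tile transpose of the type-5 path with the
+// operand and store order reversed, so the columns come out mirrored.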
+static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+
+    // dstend points one past the last dst pixel in a row
+ unsigned char* dstend = dst + w;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dstend - y - 8;
+ unsigned char* dst1 = dstend - y - 8 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src0 = vld1_u8(src0);
+ uint8x8_t _src1 = vld1_u8(src1);
+
+ uint8x8_t _src2 = vld1_u8(src0 + src_step);
+ uint8x8_t _src3 = vld1_u8(src1 + src_step);
+
+ uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
+ uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
+
+ uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
+ uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ vst1_u8(dst0, _dst7);
+ vst1_u8(dst1, _dst6);
+ vst1_u8(dst0 + dst_step, _dst5);
+ vst1_u8(dst1 + dst_step, _dst4);
+ vst1_u8(dst0 + 2 * dst_step, _dst3);
+ vst1_u8(dst1 + 2 * dst_step, _dst2);
+ vst1_u8(dst0 + 3 * dst_step, _dst1);
+ vst1_u8(dst1 + 3 * dst_step, _dst0);
+
+ src0 += 8;
+ src1 += 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #64] \n"
+ "vld1.u8 {d0}, [%1], %10 \n"
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d1}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d2}, [%1], %10 \n"
+
+ "vtrn.u8 d1, d0 \n" // _src01t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d3}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d4}, [%1], %10 \n"
+
+ "vtrn.u8 d3, d2 \n" // _src23t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d5}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d6}, [%1], %10 \n"
+
+ "vtrn.u8 d5, d4 \n" // _src45t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d7}, [%2], %10 \n"
+
+ "vtrn.u8 d7, d6 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
+
+ "add %1, #8 \n" // src0 += 8
+
+ "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
+
+ "add %2, #8 \n" // src1 += 8
+
+ "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
+ "vst1.u8 {d6}, [%4], %11 \n"
+ "vst1.u8 {d7}, [%3], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst1.u8 {d4}, [%4], %11 \n"
+ "vst1.u8 {d5}, [%3], %11 \n"
+ "vst1.u8 {d2}, [%4], %11 \n"
+ "vst1.u8 {d3}, [%3], %11 \n"
+ "vst1.u8 {d0}, [%4], %11 \n"
+ "vst1.u8 {d1}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src1[0 + 3 * src_step];
+ dst0[1] = src0[0 + 3 * src_step];
+ dst0[2] = src1[0 + 2 * src_step];
+ dst0[3] = src0[0 + 2 * src_step];
+ dst0[4] = src1[0 + src_step];
+ dst0[5] = src0[0 + src_step];
+ dst0[6] = src1[0];
+ dst0[7] = src0[0];
+
+ src0 += 1;
+ src1 += 1;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y - 1;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
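+// kanna_rotate_6_c2: 90-degree clockwise rotation for 2-channel images,
+// combining the vld2/vst2 plane handling with the reversed tile transpose.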
+static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+
+    // dstend points one past the last dst pixel in a row
+ unsigned char* dstend = dst + w * 2;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dstend - y * 2 - 8 * 2;
+ unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src0 = vld2_u8(src0);
+ uint8x8x2_t _src1 = vld2_u8(src1);
+
+ uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
+ uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
+
+ uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
+ uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
+
+ uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
+ uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint8x8x2_t _dst0;
+ uint8x8x2_t _dst1;
+ uint8x8x2_t _dst2;
+ uint8x8x2_t _dst3;
+ uint8x8x2_t _dst4;
+ uint8x8x2_t _dst5;
+ uint8x8x2_t _dst6;
+ uint8x8x2_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ vst2_u8(dst0, _dst7);
+ vst2_u8(dst1, _dst6);
+ vst2_u8(dst0 + dst_step, _dst5);
+ vst2_u8(dst1 + dst_step, _dst4);
+ vst2_u8(dst0 + 2 * dst_step, _dst3);
+ vst2_u8(dst1 + 2 * dst_step, _dst2);
+ vst2_u8(dst0 + 3 * dst_step, _dst1);
+ vst2_u8(dst1 + 3 * dst_step, _dst0);
+
+ src0 += 2 * 8;
+ src1 += 2 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1], %10 \n"
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d2-d3}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d4-d5}, [%1], %10 \n"
+
+ "vtrn.u8 q1, q0 \n" // _src01t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d6-d7}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d16-d17}, [%1], %10\n"
+
+ "vtrn.u8 q3, q2 \n" // _src23t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d18-d19}, [%2], %10\n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d20-d21}, [%1], %10\n"
+
+ "vtrn.u8 q9, q8 \n" // _src45t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d22-d23}, [%2], %10\n"
+
+ "vtrn.u8 q11, q10 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q2, q0 \n" // _src02tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q3, q1 \n" // _src13tt_r
+
+ "add %1, #16 \n" // src0 += 16
+
+ "vtrn.u16 q10, q8 \n" // _src46tt_r
+
+ "add %2, #16 \n" // src1 += 16
+
+ "vtrn.u16 q11, q9 \n" // _src57tt_r
+
+ "vtrn.u32 q10, q2 \n" // _src26ttt_r
+
+ "vtrn.u32 q11, q3 \n" // _src37ttt_r
+ "vst2.u8 {d20-d21}, [%4], %11\n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst2.u8 {d22-d23}, [%3], %11\n"
+
+ "vtrn.u32 q9, q1 \n" // _src15ttt_r
+ "vst2.u8 {d16-d17}, [%4], %11\n"
+
+ "subs %0, #1 \n"
+
+ "vst2.u8 {d18-d19}, [%3], %11\n"
+ "vst2.u8 {d4-d5}, [%4], %11 \n"
+ "vst2.u8 {d6-d7}, [%3], %11 \n"
+ "vst2.u8 {d0-d1}, [%4], %11 \n"
+ "vst2.u8 {d2-d3}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src1[0 + 3 * src_step];
+ dst0[1] = src1[1 + 3 * src_step];
+ dst0[2] = src0[0 + 3 * src_step];
+ dst0[3] = src0[1 + 3 * src_step];
+ dst0[4] = src1[0 + 2 * src_step];
+ dst0[5] = src1[1 + 2 * src_step];
+ dst0[6] = src0[0 + 2 * src_step];
+ dst0[7] = src0[1 + 2 * src_step];
+ dst0[8] = src1[0 + src_step];
+ dst0[9] = src1[1 + src_step];
+ dst0[10] = src0[0 + src_step];
+ dst0[11] = src0[1 + src_step];
+ dst0[12] = src1[0];
+ dst0[13] = src1[1];
+ dst0[14] = src0[0];
+ dst0[15] = src0[1];
+
+ src0 += 2;
+ src1 += 2;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 2 - 2;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
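+// kanna_rotate_6_c3: 90-degree clockwise rotation for 3-channel images,
+// vld3/vst3 with the reversed tile transpose per channel plane.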
+static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+
+    // dstend points one past the last dst pixel in a row
+ unsigned char* dstend = dst + w * 3;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dstend - y * 3 - 8 * 3;
+ unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src0 = vld3_u8(src0);
+ uint8x8x3_t _src1 = vld3_u8(src1);
+
+ uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
+ uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
+
+ uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
+ uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
+
+ uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
+ uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
+
+ uint8x8x3_t _dst0;
+ uint8x8x3_t _dst1;
+ uint8x8x3_t _dst2;
+ uint8x8x3_t _dst3;
+ uint8x8x3_t _dst4;
+ uint8x8x3_t _dst5;
+ uint8x8x3_t _dst6;
+ uint8x8x3_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+
+ vst3_u8(dst0, _dst7);
+ vst3_u8(dst1, _dst6);
+ vst3_u8(dst0 + dst_step, _dst5);
+ vst3_u8(dst1 + dst_step, _dst4);
+ vst3_u8(dst0 + 2 * dst_step, _dst3);
+ vst3_u8(dst1 + 2 * dst_step, _dst2);
+ vst3_u8(dst0 + 3 * dst_step, _dst1);
+ vst3_u8(dst1 + 3 * dst_step, _dst0);
+
+ src0 += 3 * 8;
+ src1 += 3 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1], %10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d4-d6}, [%2], %10 \n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d8-d10}, [%1], %10 \n"
+
+ "vtrn.u8 q2, q0 \n" // _src01t_r
+ "vtrn.u8 d6, d2 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d12-d14}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d16-d18}, [%1], %10\n"
+
+ "vtrn.u8 q6, q4 \n" // _src23t_r
+ "vtrn.u8 d14, d10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d20-d22}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d24-d26}, [%1], %10\n"
+
+ "vtrn.u8 q10, q8 \n" // _src45t_r
+ "vtrn.u8 d22, d18 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d28-d30}, [%2], %10\n"
+
+ "vtrn.u8 q14, q12 \n" // _src67t_r
+ "vtrn.u8 d30, d26 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q4, q0 \n" // _src02tt_r
+ "vtrn.u16 d10, d2 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q6, q2 \n" // _src13tt_r
+ "vtrn.u16 d14, d6 \n"
+
+ "add %1, #24 \n" // src0 += 24
+
+ "vtrn.u16 q12, q8 \n" // _src46tt_r
+ "vtrn.u16 d26, d18 \n"
+
+ "add %2, #24 \n" // src1 += 24
+
+ "vtrn.u16 q14, q10 \n" // _src57tt_r
+ "vtrn.u16 d30, d22 \n"
+
+ "vtrn.u32 q12, q4 \n" // _src26ttt_r
+ "vtrn.u32 d26, d10 \n"
+
+ "vtrn.u32 q14, q6 \n" // _src37ttt_r
+ "vst3.u8 {d24-d26}, [%4], %11\n"
+ "vtrn.u32 d30, d14 \n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst3.u8 {d28-d30}, [%3], %11\n"
+ "vtrn.u32 d18, d2 \n"
+
+ "vtrn.u32 q10, q2 \n" // _src15ttt_r
+ "vst3.u8 {d16-d18}, [%4], %11\n"
+ "vtrn.u32 d22, d6 \n"
+
+ "subs %0, #1 \n"
+
+ "vst3.u8 {d20-d22}, [%3], %11\n"
+ "vst3.u8 {d8-d10}, [%4], %11 \n"
+ "vst3.u8 {d12-d14}, [%3], %11\n"
+ "vst3.u8 {d0-d2}, [%4], %11 \n"
+ "vst3.u8 {d4-d6}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
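+ // scalar tail for the remaining srcw % 8 columns of this 8-row band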
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src1[0 + 3 * src_step];
+ dst0[1] = src1[1 + 3 * src_step];
+ dst0[2] = src1[2 + 3 * src_step];
+ dst0[3] = src0[0 + 3 * src_step];
+ dst0[4] = src0[1 + 3 * src_step];
+ dst0[5] = src0[2 + 3 * src_step];
+ dst0[6] = src1[0 + 2 * src_step];
+ dst0[7] = src1[1 + 2 * src_step];
+ dst0[8] = src1[2 + 2 * src_step];
+ dst0[9] = src0[0 + 2 * src_step];
+ dst0[10] = src0[1 + 2 * src_step];
+ dst0[11] = src0[2 + 2 * src_step];
+ dst0[12] = src1[0 + src_step];
+ dst0[13] = src1[1 + src_step];
+ dst0[14] = src1[2 + src_step];
+ dst0[15] = src0[0 + src_step];
+ dst0[16] = src0[1 + src_step];
+ dst0[17] = src0[2 + src_step];
+ dst0[18] = src1[0];
+ dst0[19] = src1[1];
+ dst0[20] = src1[2];
+ dst0[21] = src0[0];
+ dst0[22] = src0[1];
+ dst0[23] = src0[2];
+
+ src0 += 3;
+ src1 += 3;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 3 - 3;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
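+// rotate 90 degrees clockwise (type 6) for 4-channel pixels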
+static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+
+ // dstend points just past the last dst pixel of the first row
+ unsigned char* dstend = dst + w * 4;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst0 = dstend - y * 4 - 8 * 4;
+ unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = 2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src0 = vld4_u8(src0);
+ uint8x8x4_t _src1 = vld4_u8(src1);
+
+ uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
+ uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
+
+ uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
+ uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
+
+ uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
+ uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
+
+ uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
+ uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
+ uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
+ uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
+
+ uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
+ uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
+ uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
+ uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
+
+ uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
+ uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
+ uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
+ uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
+
+ uint8x8x4_t _dst0;
+ uint8x8x4_t _dst1;
+ uint8x8x4_t _dst2;
+ uint8x8x4_t _dst3;
+ uint8x8x4_t _dst4;
+ uint8x8x4_t _dst5;
+ uint8x8x4_t _dst6;
+ uint8x8x4_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+
+ _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
+ _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
+ _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
+ _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
+ _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
+ _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
+ _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
+ _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
+
+ vst4_u8(dst0, _dst7);
+ vst4_u8(dst1, _dst6);
+ vst4_u8(dst0 + dst_step, _dst5);
+ vst4_u8(dst1 + dst_step, _dst4);
+ vst4_u8(dst0 + 2 * dst_step, _dst3);
+ vst4_u8(dst1 + 2 * dst_step, _dst2);
+ vst4_u8(dst0 + 3 * dst_step, _dst1);
+ vst4_u8(dst1 + 3 * dst_step, _dst0);
+
+ src0 += 4 * 8;
+ src1 += 4 * 8;
+
+ dst0 += 4 * dst_step;
+ dst1 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1], %10 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d4-d7}, [%2], %10 \n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d8-d11}, [%1], %10 \n"
+
+ "vtrn.u8 q2, q0 \n" // _src01t_r
+ "vtrn.u8 q3, q1 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d12-d15}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d16-d19}, [%1], %10\n"
+
+ "vtrn.u8 q6, q4 \n" // _src23t_r
+ "vtrn.u8 q7, q5 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d20-d23}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d24-d27}, [%1], %10\n"
+
+ "vtrn.u8 q10, q8 \n" // _src45t_r
+ "vtrn.u8 q11, q9 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d28-d31}, [%2], %10\n"
+
+ "vtrn.u8 q14, q12 \n" // _src67t_r
+ "vtrn.u8 q15, q13 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q4, q0 \n" // _src02tt_r
+ "vtrn.u16 q5, q1 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q6, q2 \n" // _src13tt_r
+ "vtrn.u16 q7, q3 \n"
+
+ "add %1, #32 \n" // src0 += 32
+
+ "vtrn.u16 q12, q8 \n" // _src46tt_r
+ "vtrn.u16 q13, q9 \n"
+
+ "add %2, #32 \n" // src1 += 32
+
+ "vtrn.u16 q14, q10 \n" // _src57tt_r
+ "vtrn.u16 q15, q11 \n"
+
+ "vtrn.u32 q12, q4 \n" // _src26ttt_r
+ "vtrn.u32 q13, q5 \n"
+
+ "vtrn.u32 q14, q6 \n" // _src37ttt_r
+ "vst4.u8 {d24-d27}, [%4], %11\n"
+ "vtrn.u32 q15, q7 \n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst4.u8 {d28-d31}, [%3], %11\n"
+ "vtrn.u32 q9, q1 \n"
+
+ "vtrn.u32 q10, q2 \n" // _src15ttt_r
+ "vst4.u8 {d16-d19}, [%4], %11\n"
+ "vtrn.u32 q11, q3 \n"
+
+ "subs %0, #1 \n"
+
+ "vst4.u8 {d8-d11}, [%4], %11 \n"
+ "vst4.u8 {d20-d23}, [%3], %11\n"
+ "vst4.u8 {d12-d15}, [%3], %11\n"
+ "vst4.u8 {d0-d3}, [%4], %11 \n"
+ "vst4.u8 {d4-d7}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst0), // %3
+ "=r"(dst1) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst0),
+ "4"(dst1),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst0[0] = src1[0 + 3 * src_step];
+ dst0[1] = src1[1 + 3 * src_step];
+ dst0[2] = src1[2 + 3 * src_step];
+ dst0[3] = src1[3 + 3 * src_step];
+ dst0[4] = src0[0 + 3 * src_step];
+ dst0[5] = src0[1 + 3 * src_step];
+ dst0[6] = src0[2 + 3 * src_step];
+ dst0[7] = src0[3 + 3 * src_step];
+ dst0[8] = src1[0 + 2 * src_step];
+ dst0[9] = src1[1 + 2 * src_step];
+ dst0[10] = src1[2 + 2 * src_step];
+ dst0[11] = src1[3 + 2 * src_step];
+ dst0[12] = src0[0 + 2 * src_step];
+ dst0[13] = src0[1 + 2 * src_step];
+ dst0[14] = src0[2 + 2 * src_step];
+ dst0[15] = src0[3 + 2 * src_step];
+ dst0[16] = src1[0 + src_step];
+ dst0[17] = src1[1 + src_step];
+ dst0[18] = src1[2 + src_step];
+ dst0[19] = src1[3 + src_step];
+ dst0[20] = src0[0 + src_step];
+ dst0[21] = src0[1 + src_step];
+ dst0[22] = src0[2 + src_step];
+ dst0[23] = src0[3 + src_step];
+ dst0[24] = src1[0];
+ dst0[25] = src1[1];
+ dst0[26] = src1[2];
+ dst0[27] = src1[3];
+ dst0[28] = src0[0];
+ dst0[29] = src0[1];
+ dst0[30] = src0[2];
+ dst0[31] = src0[3];
+
+ src0 += 4;
+ src1 += 4;
+
+ dst0 += stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 4 - 4;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 += stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
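+// kanna_rotate type 7 (EXIF orientation 7): transverse, i.e. flip across the
+// anti-diagonal, dst(h-1-x, w-1-y) = src(y, x), for 1-channel pixels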
+static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+
+ // dstend points just past the last dst pixel (end of the bottom row)
+ unsigned char* dstend = dst + stride * (h - 1) + w;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst6 = dstend - y - 8 - stride;
+ unsigned char* dst7 = dstend - y - 8;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
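+ // dst_step is negative: output rows are filled bottom-up, which supplies the
+ // vertical reversal while the reversed-operand transpose supplies the
+ // horizontal one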
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src0 = vld1_u8(src0);
+ uint8x8_t _src1 = vld1_u8(src1);
+
+ uint8x8_t _src2 = vld1_u8(src0 + src_step);
+ uint8x8_t _src3 = vld1_u8(src1 + src_step);
+
+ uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
+ uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
+
+ uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
+ uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ vst1_u8(dst7, _dst7);
+ vst1_u8(dst6, _dst6);
+ vst1_u8(dst7 + dst_step, _dst5);
+ vst1_u8(dst6 + dst_step, _dst4);
+ vst1_u8(dst7 + 2 * dst_step, _dst3);
+ vst1_u8(dst6 + 2 * dst_step, _dst2);
+ vst1_u8(dst7 + 3 * dst_step, _dst1);
+ vst1_u8(dst6 + 3 * dst_step, _dst0);
+
+ src0 += 8;
+ src1 += 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #64] \n"
+ "vld1.u8 {d0}, [%1], %10 \n"
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d1}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d2}, [%1], %10 \n"
+
+ "vtrn.u8 d1, d0 \n" // _src01t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d3}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d4}, [%1], %10 \n"
+
+ "vtrn.u8 d3, d2 \n" // _src23t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d5}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d6}, [%1], %10 \n"
+
+ "vtrn.u8 d5, d4 \n" // _src45t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d7}, [%2], %10 \n"
+
+ "vtrn.u8 d7, d6 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
+
+ "add %1, #8 \n" // src0 += 8
+
+ "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
+
+ "add %2, #8 \n" // src1 += 8
+
+ "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
+ "vst1.u8 {d6}, [%4], %11 \n"
+ "vst1.u8 {d7}, [%3], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst1.u8 {d4}, [%4], %11 \n"
+ "vst1.u8 {d5}, [%3], %11 \n"
+ "vst1.u8 {d2}, [%4], %11 \n"
+ "vst1.u8 {d3}, [%3], %11 \n"
+ "vst1.u8 {d0}, [%4], %11 \n"
+ "vst1.u8 {d1}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src1[0 + 3 * src_step];
+ dst7[1] = src0[0 + 3 * src_step];
+ dst7[2] = src1[0 + 2 * src_step];
+ dst7[3] = src0[0 + 2 * src_step];
+ dst7[4] = src1[0 + src_step];
+ dst7[5] = src0[0 + src_step];
+ dst7[6] = src1[0];
+ dst7[7] = src0[0];
+
+ src0 += 1;
+ src1 += 1;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y - 1;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
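+// transverse (type 7) for 2-channel pixels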
+static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+
+ // dstend points just past the last dst pixel (end of the bottom row)
+ unsigned char* dstend = dst + stride * (h - 1) + w * 2;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride;
+ unsigned char* dst7 = dstend - y * 2 - 8 * 2;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src0 = vld2_u8(src0);
+ uint8x8x2_t _src1 = vld2_u8(src1);
+
+ uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
+ uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
+
+ uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
+ uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
+
+ uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
+ uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint8x8x2_t _dst0;
+ uint8x8x2_t _dst1;
+ uint8x8x2_t _dst2;
+ uint8x8x2_t _dst3;
+ uint8x8x2_t _dst4;
+ uint8x8x2_t _dst5;
+ uint8x8x2_t _dst6;
+ uint8x8x2_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ vst2_u8(dst7, _dst7);
+ vst2_u8(dst6, _dst6);
+ vst2_u8(dst7 + dst_step, _dst5);
+ vst2_u8(dst6 + dst_step, _dst4);
+ vst2_u8(dst7 + 2 * dst_step, _dst3);
+ vst2_u8(dst6 + 2 * dst_step, _dst2);
+ vst2_u8(dst7 + 3 * dst_step, _dst1);
+ vst2_u8(dst6 + 3 * dst_step, _dst0);
+
+ src0 += 2 * 8;
+ src1 += 2 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1], %10 \n"
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d2-d3}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d4-d5}, [%1], %10 \n"
+
+ "vtrn.u8 q1, q0 \n" // _src01t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d6-d7}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d16-d17}, [%1], %10\n"
+
+ "vtrn.u8 q3, q2 \n" // _src23t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d18-d19}, [%2], %10\n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d20-d21}, [%1], %10\n"
+
+ "vtrn.u8 q9, q8 \n" // _src45t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d22-d23}, [%2], %10\n"
+
+ "vtrn.u8 q11, q10 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q2, q0 \n" // _src02tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q3, q1 \n" // _src13tt_r
+
+ "add %1, #16 \n" // src0 += 16
+
+ "vtrn.u16 q10, q8 \n" // _src46tt_r
+
+ "add %2, #16 \n" // src1 += 16
+
+ "vtrn.u16 q11, q9 \n" // _src57tt_r
+
+ "vtrn.u32 q10, q2 \n" // _src26ttt_r
+
+ "vtrn.u32 q11, q3 \n" // _src37ttt_r
+ "vst2.u8 {d20-d21}, [%4], %11\n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst2.u8 {d22-d23}, [%3], %11\n"
+
+ "vtrn.u32 q9, q1 \n" // _src15ttt_r
+ "vst2.u8 {d16-d17}, [%4], %11\n"
+
+ "subs %0, #1 \n"
+
+ "vst2.u8 {d4-d5}, [%4], %11 \n"
+ "vst2.u8 {d18-d19}, [%3], %11\n"
+ "vst2.u8 {d6-d7}, [%3], %11 \n"
+ "vst2.u8 {d0-d1}, [%4], %11 \n"
+ "vst2.u8 {d2-d3}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src1[0 + 3 * src_step];
+ dst7[1] = src1[1 + 3 * src_step];
+ dst7[2] = src0[0 + 3 * src_step];
+ dst7[3] = src0[1 + 3 * src_step];
+ dst7[4] = src1[0 + 2 * src_step];
+ dst7[5] = src1[1 + 2 * src_step];
+ dst7[6] = src0[0 + 2 * src_step];
+ dst7[7] = src0[1 + 2 * src_step];
+ dst7[8] = src1[0 + src_step];
+ dst7[9] = src1[1 + src_step];
+ dst7[10] = src0[0 + src_step];
+ dst7[11] = src0[1 + src_step];
+ dst7[12] = src1[0];
+ dst7[13] = src1[1];
+ dst7[14] = src0[0];
+ dst7[15] = src0[1];
+
+ src0 += 2;
+ src1 += 2;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 2 - 2;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
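+// transverse (type 7) for 3-channel pixels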
+static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+
+ // dstend points just past the last dst pixel (end of the bottom row)
+ unsigned char* dstend = dst + stride * (h - 1) + w * 3;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride;
+ unsigned char* dst7 = dstend - y * 3 - 8 * 3;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src0 = vld3_u8(src0);
+ uint8x8x3_t _src1 = vld3_u8(src1);
+
+ uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
+ uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
+
+ uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
+ uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
+
+ uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
+ uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
+
+ uint8x8x3_t _dst0;
+ uint8x8x3_t _dst1;
+ uint8x8x3_t _dst2;
+ uint8x8x3_t _dst3;
+ uint8x8x3_t _dst4;
+ uint8x8x3_t _dst5;
+ uint8x8x3_t _dst6;
+ uint8x8x3_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+
+ vst3_u8(dst7, _dst7);
+ vst3_u8(dst6, _dst6);
+ vst3_u8(dst7 + dst_step, _dst5);
+ vst3_u8(dst6 + dst_step, _dst4);
+ vst3_u8(dst7 + 2 * dst_step, _dst3);
+ vst3_u8(dst6 + 2 * dst_step, _dst2);
+ vst3_u8(dst7 + 3 * dst_step, _dst1);
+ vst3_u8(dst6 + 3 * dst_step, _dst0);
+
+ src0 += 3 * 8;
+ src1 += 3 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1], %10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d4-d6}, [%2], %10 \n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d8-d10}, [%1], %10 \n"
+
+ "vtrn.u8 q2, q0 \n" // _src01t_r
+ "vtrn.u8 d6, d2 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d12-d14}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d16-d18}, [%1], %10\n"
+
+ "vtrn.u8 q6, q4 \n" // _src23t_r
+ "vtrn.u8 d14, d10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d20-d22}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d24-d26}, [%1], %10\n"
+
+ "vtrn.u8 q10, q8 \n" // _src45t_r
+ "vtrn.u8 d22, d18 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d28-d30}, [%2], %10\n"
+
+ "vtrn.u8 q14, q12 \n" // _src67t_r
+ "vtrn.u8 d30, d26 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q4, q0 \n" // _src02tt_r
+ "vtrn.u16 d10, d2 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q6, q2 \n" // _src13tt_r
+ "vtrn.u16 d14, d6 \n"
+
+ "add %1, #24 \n" // src0 += 24
+
+ "vtrn.u16 q12, q8 \n" // _src46tt_r
+ "vtrn.u16 d26, d18 \n"
+
+ "add %2, #24 \n" // src1 += 24
+
+ "vtrn.u16 q14, q10 \n" // _src57tt_r
+ "vtrn.u16 d30, d22 \n"
+
+ "vtrn.u32 q12, q4 \n" // _src26ttt_r
+ "vtrn.u32 d26, d10 \n"
+
+ "vtrn.u32 q14, q6 \n" // _src37ttt_r
+ "vst3.u8 {d24-d26}, [%4], %11\n"
+ "vtrn.u32 d30, d14 \n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst3.u8 {d28-d30}, [%3], %11\n"
+ "vtrn.u32 d18, d2 \n"
+
+ "vtrn.u32 q10, q2 \n" // _src15ttt_r
+ "vst3.u8 {d16-d18}, [%4], %11\n"
+ "vtrn.u32 d22, d6 \n"
+
+ "subs %0, #1 \n"
+
+ "vst3.u8 {d8-d10}, [%4], %11 \n"
+ "vst3.u8 {d20-d22}, [%3], %11\n"
+ "vst3.u8 {d12-d14}, [%3], %11\n"
+ "vst3.u8 {d0-d2}, [%4], %11 \n"
+ "vst3.u8 {d4-d6}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src1[0 + 3 * src_step];
+ dst7[1] = src1[1 + 3 * src_step];
+ dst7[2] = src1[2 + 3 * src_step];
+ dst7[3] = src0[0 + 3 * src_step];
+ dst7[4] = src0[1 + 3 * src_step];
+ dst7[5] = src0[2 + 3 * src_step];
+ dst7[6] = src1[0 + 2 * src_step];
+ dst7[7] = src1[1 + 2 * src_step];
+ dst7[8] = src1[2 + 2 * src_step];
+ dst7[9] = src0[0 + 2 * src_step];
+ dst7[10] = src0[1 + 2 * src_step];
+ dst7[11] = src0[2 + 2 * src_step];
+ dst7[12] = src1[0 + src_step];
+ dst7[13] = src1[1 + src_step];
+ dst7[14] = src1[2 + src_step];
+ dst7[15] = src0[0 + src_step];
+ dst7[16] = src0[1 + src_step];
+ dst7[17] = src0[2 + src_step];
+ dst7[18] = src1[0];
+ dst7[19] = src1[1];
+ dst7[20] = src1[2];
+ dst7[21] = src0[0];
+ dst7[22] = src0[1];
+ dst7[23] = src0[2];
+
+ src0 += 3;
+ src1 += 3;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 3 - 3;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
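+// transverse (type 7) for 4-channel pixels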
+static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+
+ // dstend points just past the last dst pixel (end of the bottom row)
+ unsigned char* dstend = dst + stride * (h - 1) + w * 4;
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride;
+ unsigned char* dst7 = dstend - y * 4 - 8 * 4;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src0 = vld4_u8(src0);
+ uint8x8x4_t _src1 = vld4_u8(src1);
+
+ uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
+ uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
+
+ uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
+ uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
+
+ uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
+ uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
+
+ uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
+ uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
+ uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
+ uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
+
+ uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
+ uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
+ uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
+ uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
+
+ uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
+ uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
+ uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
+ uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
+
+ uint8x8x4_t _dst0;
+ uint8x8x4_t _dst1;
+ uint8x8x4_t _dst2;
+ uint8x8x4_t _dst3;
+ uint8x8x4_t _dst4;
+ uint8x8x4_t _dst5;
+ uint8x8x4_t _dst6;
+ uint8x8x4_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+
+ _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
+ _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
+ _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
+ _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
+ _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
+ _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
+ _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
+ _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
+
+ vst4_u8(dst7, _dst7);
+ vst4_u8(dst6, _dst6);
+ vst4_u8(dst7 + dst_step, _dst5);
+ vst4_u8(dst6 + dst_step, _dst4);
+ vst4_u8(dst7 + 2 * dst_step, _dst3);
+ vst4_u8(dst6 + 2 * dst_step, _dst2);
+ vst4_u8(dst7 + 3 * dst_step, _dst1);
+ vst4_u8(dst6 + 3 * dst_step, _dst0);
+
+ src0 += 4 * 8;
+ src1 += 4 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1], %10 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d4-d7}, [%2], %10 \n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d8-d11}, [%1], %10 \n"
+
+ "vtrn.u8 q2, q0 \n" // _src01t_r
+ "vtrn.u8 q3, q1 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d12-d15}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d16-d19}, [%1], %10\n"
+
+ "vtrn.u8 q6, q4 \n" // _src23t_r
+ "vtrn.u8 q7, q5 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d20-d23}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d24-d27}, [%1], %10\n"
+
+ "vtrn.u8 q10, q8 \n" // _src45t_r
+ "vtrn.u8 q11, q9 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d28-d31}, [%2], %10\n"
+
+ "vtrn.u8 q14, q12 \n" // _src67t_r
+ "vtrn.u8 q15, q13 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q4, q0 \n" // _src02tt_r
+ "vtrn.u16 q5, q1 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q6, q2 \n" // _src13tt_r
+ "vtrn.u16 q7, q3 \n"
+
+ "add %1, #32 \n" // src0 += 32
+
+ "vtrn.u16 q12, q8 \n" // _src46tt_r
+ "vtrn.u16 q13, q9 \n"
+
+ "add %2, #32 \n" // src1 += 32
+
+ "vtrn.u16 q14, q10 \n" // _src57tt_r
+ "vtrn.u16 q15, q11 \n"
+
+ "vtrn.u32 q12, q4 \n" // _src26ttt_r
+ "vtrn.u32 q13, q5 \n"
+
+ "vtrn.u32 q14, q6 \n" // _src37ttt_r
+ "vst4.u8 {d24-d27}, [%4], %11\n"
+ "vtrn.u32 q15, q7 \n"
+
+ "vtrn.u32 q8, q0 \n" // _src04ttt_r
+ "vst4.u8 {d28-d31}, [%3], %11\n"
+ "vtrn.u32 q9, q1 \n"
+
+ "vtrn.u32 q10, q2 \n" // _src15ttt_r
+ "vst4.u8 {d16-d19}, [%4], %11\n"
+ "vtrn.u32 q11, q3 \n"
+
+ "subs %0, #1 \n"
+
+ "vst4.u8 {d8-d11}, [%4], %11 \n"
+ "vst4.u8 {d20-d23}, [%3], %11\n"
+ "vst4.u8 {d12-d15}, [%3], %11\n"
+ "vst4.u8 {d0-d3}, [%4], %11 \n"
+ "vst4.u8 {d4-d7}, [%3], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src1[0 + 3 * src_step];
+ dst7[1] = src1[1 + 3 * src_step];
+ dst7[2] = src1[2 + 3 * src_step];
+ dst7[3] = src1[3 + 3 * src_step];
+ dst7[4] = src0[0 + 3 * src_step];
+ dst7[5] = src0[1 + 3 * src_step];
+ dst7[6] = src0[2 + 3 * src_step];
+ dst7[7] = src0[3 + 3 * src_step];
+ dst7[8] = src1[0 + 2 * src_step];
+ dst7[9] = src1[1 + 2 * src_step];
+ dst7[10] = src1[2 + 2 * src_step];
+ dst7[11] = src1[3 + 2 * src_step];
+ dst7[12] = src0[0 + 2 * src_step];
+ dst7[13] = src0[1 + 2 * src_step];
+ dst7[14] = src0[2 + 2 * src_step];
+ dst7[15] = src0[3 + 2 * src_step];
+ dst7[16] = src1[0 + src_step];
+ dst7[17] = src1[1 + src_step];
+ dst7[18] = src1[2 + src_step];
+ dst7[19] = src1[3 + src_step];
+ dst7[20] = src0[0 + src_step];
+ dst7[21] = src0[1 + src_step];
+ dst7[22] = src0[2 + src_step];
+ dst7[23] = src0[3 + src_step];
+ dst7[24] = src1[0];
+ dst7[25] = src1[1];
+ dst7[26] = src1[2];
+ dst7[27] = src1[3];
+ dst7[28] = src0[0];
+ dst7[29] = src0[1];
+ dst7[30] = src0[2];
+ dst7[31] = src0[3];
+
+ src0 += 4;
+ src1 += 4;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend - y * 4 - 4;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
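+// kanna_rotate type 8 (EXIF orientation 8): rotate 90 degrees counter-clockwise,
+// dst(srcw-1-x, y) = src(y, x), for 1-channel pixels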
+static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw;
+
+ // dstend points to the first pixel of the bottom dst row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst7 = dstend + y;
+ unsigned char* dst6 = dstend + y - stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
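+ // unlike types 6 and 7 the vtrn operands keep their natural order here;
+ // a plain 8x8 transpose written bottom-up via the negative dst_step yields
+ // the counter-clockwise rotation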
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8_t _src0 = vld1_u8(src0);
+ uint8x8_t _src1 = vld1_u8(src1);
+
+ uint8x8_t _src2 = vld1_u8(src0 + src_step);
+ uint8x8_t _src3 = vld1_u8(src1 + src_step);
+
+ uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
+ uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
+
+ uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
+ uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ vst1_u8(dst7, _dst0);
+ vst1_u8(dst6, _dst1);
+ vst1_u8(dst7 + dst_step, _dst2);
+ vst1_u8(dst6 + dst_step, _dst3);
+ vst1_u8(dst7 + 2 * dst_step, _dst4);
+ vst1_u8(dst6 + 2 * dst_step, _dst5);
+ vst1_u8(dst7 + 3 * dst_step, _dst6);
+ vst1_u8(dst6 + 3 * dst_step, _dst7);
+
+ src0 += 8;
+ src1 += 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #64] \n"
+ "vld1.u8 {d0}, [%1], %10 \n"
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d1}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d2}, [%1], %10 \n"
+
+ "vtrn.u8 d0, d1 \n" // _src01t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d3}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d4}, [%1], %10 \n"
+
+ "vtrn.u8 d2, d3 \n" // _src23t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d5}, [%2], %10 \n"
+
+ "pld [%1, #64] \n"
+ "vld1.u8 {d6}, [%1], %10 \n"
+
+ "vtrn.u8 d4, d5 \n" // _src45t_r
+
+ "pld [%2, #64] \n"
+ "vld1.u8 {d7}, [%2], %10 \n"
+
+ "vtrn.u8 d6, d7 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
+
+ "add %1, #8 \n" // src0 += 8
+
+ "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
+
+ "add %2, #8 \n" // src1 += 8
+
+ "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
+ "vst1.u8 {d0}, [%3], %11 \n"
+ "vst1.u8 {d1}, [%4], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst1.u8 {d2}, [%3], %11 \n"
+ "vst1.u8 {d3}, [%4], %11 \n"
+ "vst1.u8 {d4}, [%3], %11 \n"
+ "vst1.u8 {d5}, [%4], %11 \n"
+ "vst1.u8 {d6}, [%3], %11 \n"
+ "vst1.u8 {d7}, [%4], %11 \n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src0[0];
+ dst7[1] = src1[0];
+ dst7[2] = src0[0 + src_step];
+ dst7[3] = src1[0 + src_step];
+ dst7[4] = src0[0 + 2 * src_step];
+ dst7[5] = src1[0 + 2 * src_step];
+ dst7[6] = src0[0 + 3 * src_step];
+ dst7[7] = src1[0 + 3 * src_step];
+
+ src0 += 1;
+ src1 += 1;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend + y;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ *dst0 = *src0;
+
+ src0 += 1;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
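+// 2-channel variant of the type 8 rotation (e.g. the interleaved UV plane of
+// NV12/NV21); vld2/vst2 de-interleave the channels so each plane goes through
+// the same vtrn transpose cascade as the 1-channel path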
+static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 2;
+
+ // point to the last dst pixel row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst7 = dstend + y * 2;
+ unsigned char* dst6 = dstend + y * 2 - stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x2_t _src0 = vld2_u8(src0);
+ uint8x8x2_t _src1 = vld2_u8(src1);
+
+ uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
+ uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
+
+ uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
+ uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
+
+ uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
+ uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint8x8x2_t _dst0;
+ uint8x8x2_t _dst1;
+ uint8x8x2_t _dst2;
+ uint8x8x2_t _dst3;
+ uint8x8x2_t _dst4;
+ uint8x8x2_t _dst5;
+ uint8x8x2_t _dst6;
+ uint8x8x2_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ vst2_u8(dst7, _dst0);
+ vst2_u8(dst6, _dst1);
+ vst2_u8(dst7 + dst_step, _dst2);
+ vst2_u8(dst6 + dst_step, _dst3);
+ vst2_u8(dst7 + 2 * dst_step, _dst4);
+ vst2_u8(dst6 + 2 * dst_step, _dst5);
+ vst2_u8(dst7 + 3 * dst_step, _dst6);
+ vst2_u8(dst6 + 3 * dst_step, _dst7);
+
+ src0 += 2 * 8;
+ src1 += 2 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #128] \n"
+ "vld2.u8 {d0-d1}, [%1], %10 \n"
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d2-d3}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d4-d5}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q1 \n" // _src01t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d6-d7}, [%2], %10 \n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d16-d17}, [%1], %10\n"
+
+ "vtrn.u8 q2, q3 \n" // _src23t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d18-d19}, [%2], %10\n"
+
+ "pld [%1, #128] \n"
+ "vld2.u8 {d20-d21}, [%1], %10\n"
+
+ "vtrn.u8 q8, q9 \n" // _src45t_r
+
+ "pld [%2, #128] \n"
+ "vld2.u8 {d22-d23}, [%2], %10\n"
+
+ "vtrn.u8 q10, q11 \n" // _src67t_r
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q2 \n" // _src02tt_r
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q1, q3 \n" // _src13tt_r
+
+ "add %1, #16 \n" // src0 += 16
+
+ "vtrn.u16 q8, q10 \n" // _src46tt_r
+
+ "add %2, #16 \n" // src1 += 16
+
+ "vtrn.u16 q9, q11 \n" // _src57tt_r
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+
+ "vtrn.u32 q1, q9 \n" // _src15ttt_r
+ "vst2.u8 {d0-d1}, [%3], %11 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src26ttt_r
+ "vst2.u8 {d2-d3}, [%4], %11 \n"
+
+ "vtrn.u32 q3, q11 \n" // _src37ttt_r
+ "vst2.u8 {d4-d5}, [%3], %11 \n"
+
+ "subs %0, #1 \n"
+
+ "vst2.u8 {d16-d17}, [%3], %11\n"
+ "vst2.u8 {d6-d7}, [%4], %11 \n"
+ "vst2.u8 {d18-d19}, [%4], %11\n"
+ "vst2.u8 {d20-d21}, [%3], %11\n"
+ "vst2.u8 {d22-d23}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src0[0];
+ dst7[1] = src0[1];
+ dst7[2] = src1[0];
+ dst7[3] = src1[1];
+ dst7[4] = src0[0 + src_step];
+ dst7[5] = src0[1 + src_step];
+ dst7[6] = src1[0 + src_step];
+ dst7[7] = src1[1 + src_step];
+ dst7[8] = src0[0 + 2 * src_step];
+ dst7[9] = src0[1 + 2 * src_step];
+ dst7[10] = src1[0 + 2 * src_step];
+ dst7[11] = src1[1 + 2 * src_step];
+ dst7[12] = src0[0 + 3 * src_step];
+ dst7[13] = src0[1 + 3 * src_step];
+ dst7[14] = src1[0 + 3 * src_step];
+ dst7[15] = src1[1 + 3 * src_step];
+
+ src0 += 2;
+ src1 += 2;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend + y * 2;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+
+ src0 += 2;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
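+// 3-channel variant of the type 8 rotation (e.g. RGB/BGR); vld3/vst3 split
+// each pixel into per-channel planes so the same 8x8 transpose is applied to
+// every channel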
+static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 3;
+
+ // point to the last dst pixel row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst7 = dstend + y * 3;
+ unsigned char* dst6 = dstend + y * 3 - stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x3_t _src0 = vld3_u8(src0);
+ uint8x8x3_t _src1 = vld3_u8(src1);
+
+ uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
+ uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
+
+ uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
+ uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
+
+ uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
+ uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
+
+ uint8x8x3_t _dst0;
+ uint8x8x3_t _dst1;
+ uint8x8x3_t _dst2;
+ uint8x8x3_t _dst3;
+ uint8x8x3_t _dst4;
+ uint8x8x3_t _dst5;
+ uint8x8x3_t _dst6;
+ uint8x8x3_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+
+ vst3_u8(dst7, _dst0);
+ vst3_u8(dst6, _dst1);
+ vst3_u8(dst7 + dst_step, _dst2);
+ vst3_u8(dst6 + dst_step, _dst3);
+ vst3_u8(dst7 + 2 * dst_step, _dst4);
+ vst3_u8(dst6 + 2 * dst_step, _dst5);
+ vst3_u8(dst7 + 3 * dst_step, _dst6);
+ vst3_u8(dst6 + 3 * dst_step, _dst7);
+
+ src0 += 3 * 8;
+ src1 += 3 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #192] \n"
+ "vld3.u8 {d0-d2}, [%1], %10 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d4-d6}, [%2], %10 \n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d8-d10}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q2 \n" // _src01t_r
+ "vtrn.u8 d2, d6 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d12-d14}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d16-d18}, [%1], %10\n"
+
+ "vtrn.u8 q4, q6 \n" // _src23t_r
+ "vtrn.u8 d10, d14 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d20-d22}, [%2], %10\n"
+
+ "pld [%1, #192] \n"
+ "vld3.u8 {d24-d26}, [%1], %10\n"
+
+ "vtrn.u8 q8, q10 \n" // _src45t_r
+ "vtrn.u8 d18, d22 \n"
+
+ "pld [%2, #192] \n"
+ "vld3.u8 {d28-d30}, [%2], %10\n"
+
+ "vtrn.u8 q12, q14 \n" // _src67t_r
+ "vtrn.u8 d26, d30 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q4 \n" // _src02tt_r
+ "vtrn.u16 d2, d10 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q6 \n" // _src13tt_r
+ "vtrn.u16 d6, d14 \n"
+
+ "add %1, #24 \n" // src0 += 24
+
+ "vtrn.u16 q8, q12 \n" // _src46tt_r
+ "vtrn.u16 d18, d26 \n"
+
+ "add %2, #24 \n" // src1 += 24
+
+ "vtrn.u16 q10, q14 \n" // _src57tt_r
+ "vtrn.u16 d22, d30 \n"
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+ "vtrn.u32 d2, d18 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src15ttt_r
+ "vst3.u8 {d0-d2}, [%3], %11 \n"
+ "vtrn.u32 d6, d22 \n"
+
+ "vtrn.u32 q4, q12 \n" // _src26ttt_r
+ "vst3.u8 {d4-d6}, [%4], %11 \n"
+ "vtrn.u32 d10, d26 \n"
+
+ "vtrn.u32 q6, q14 \n" // _src37ttt_r
+ "vst3.u8 {d8-d10}, [%3], %11 \n"
+ "vtrn.u32 d14, d30 \n"
+
+ "subs %0, #1 \n"
+
+ "vst3.u8 {d16-d18}, [%3], %11\n"
+ "vst3.u8 {d12-d14}, [%4], %11\n"
+ "vst3.u8 {d20-d22}, [%4], %11\n"
+ "vst3.u8 {d24-d26}, [%3], %11\n"
+ "vst3.u8 {d28-d30}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src0[0];
+ dst7[1] = src0[1];
+ dst7[2] = src0[2];
+ dst7[3] = src1[0];
+ dst7[4] = src1[1];
+ dst7[5] = src1[2];
+ dst7[6] = src0[0 + src_step];
+ dst7[7] = src0[1 + src_step];
+ dst7[8] = src0[2 + src_step];
+ dst7[9] = src1[0 + src_step];
+ dst7[10] = src1[1 + src_step];
+ dst7[11] = src1[2 + src_step];
+ dst7[12] = src0[0 + 2 * src_step];
+ dst7[13] = src0[1 + 2 * src_step];
+ dst7[14] = src0[2 + 2 * src_step];
+ dst7[15] = src1[0 + 2 * src_step];
+ dst7[16] = src1[1 + 2 * src_step];
+ dst7[17] = src1[2 + 2 * src_step];
+ dst7[18] = src0[0 + 3 * src_step];
+ dst7[19] = src0[1 + 3 * src_step];
+ dst7[20] = src0[2 + 3 * src_step];
+ dst7[21] = src1[0 + 3 * src_step];
+ dst7[22] = src1[1 + 3 * src_step];
+ dst7[23] = src1[2 + 3 * src_step];
+
+ src0 += 3;
+ src1 += 3;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend + y * 3;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+
+ src0 += 3;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
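+// 4-channel variant of the type 8 rotation (e.g. RGBA/BGRA); vld4/vst4 carry
+// the fourth (alpha) plane through the same transpose cascade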
+static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
+{
+ const int srcwgap = srcstride - srcw * 4;
+
+ // point to the last dst pixel row
+ unsigned char* dstend = dst + stride * (h - 1);
+
+ const unsigned char* src0 = src;
+
+ int y = 0;
+#if __ARM_NEON
+ for (; y + 7 < srch; y += 8)
+ {
+ const unsigned char* src1 = src0 + srcstride;
+
+ unsigned char* dst7 = dstend + y * 4;
+ unsigned char* dst6 = dstend + y * 4 - stride;
+
+ int src_step = 2 * srcstride;
+ int dst_step = -2 * stride;
+
+ int nn = srcw >> 3;
+ int remain = srcw - (nn << 3);
+
+#if __aarch64__
+ for (; nn > 0; nn--)
+ {
+ uint8x8x4_t _src0 = vld4_u8(src0);
+ uint8x8x4_t _src1 = vld4_u8(src1);
+
+ uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
+ uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
+
+ uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
+ uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
+
+ uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
+ uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
+
+ uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
+ uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
+ uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
+ uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
+
+ uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
+ uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
+ uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
+ uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
+
+ uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
+ uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
+ uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
+ uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
+
+ uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
+ uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
+ uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
+ uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
+
+ uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
+ uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
+ uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
+ uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
+
+ uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
+ uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
+ uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
+ uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
+
+ uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
+ uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
+ uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
+ uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
+
+ uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
+ uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
+ uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
+ uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
+
+ uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
+ uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
+ uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
+ uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
+
+ uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
+ uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
+ uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
+ uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
+
+ uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
+ uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
+ uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
+ uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
+
+ uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
+ uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
+ uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
+ uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
+
+ uint8x8x4_t _dst0;
+ uint8x8x4_t _dst1;
+ uint8x8x4_t _dst2;
+ uint8x8x4_t _dst3;
+ uint8x8x4_t _dst4;
+ uint8x8x4_t _dst5;
+ uint8x8x4_t _dst6;
+ uint8x8x4_t _dst7;
+
+ _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
+ _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
+ _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
+ _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
+ _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
+ _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
+ _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
+ _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
+
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
+ _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
+ _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
+ _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
+ _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
+ _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
+ _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
+ _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
+
+ _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
+ _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
+ _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
+ _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
+ _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
+ _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
+ _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
+ _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
+
+ _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
+ _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
+ _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
+ _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
+ _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
+ _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
+ _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
+ _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
+
+ vst4_u8(dst7, _dst0);
+ vst4_u8(dst6, _dst1);
+ vst4_u8(dst7 + dst_step, _dst2);
+ vst4_u8(dst6 + dst_step, _dst3);
+ vst4_u8(dst7 + 2 * dst_step, _dst4);
+ vst4_u8(dst6 + 2 * dst_step, _dst5);
+ vst4_u8(dst7 + 3 * dst_step, _dst6);
+ vst4_u8(dst6 + 3 * dst_step, _dst7);
+
+ src0 += 4 * 8;
+ src1 += 4 * 8;
+
+ dst7 += 4 * dst_step;
+ dst6 += 4 * dst_step;
+ }
+#else
+ if (nn > 0)
+ {
+ asm volatile(
+ "0: \n"
+ "pld [%1, #256] \n"
+ "vld4.u8 {d0-d3}, [%1], %10 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d4-d7}, [%2], %10 \n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d8-d11}, [%1], %10 \n"
+
+ "vtrn.u8 q0, q2 \n" // _src01t_r
+ "vtrn.u8 q1, q3 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d12-d15}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d16-d19}, [%1], %10\n"
+
+ "vtrn.u8 q4, q6 \n" // _src23t_r
+ "vtrn.u8 q5, q7 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d20-d23}, [%2], %10\n"
+
+ "pld [%1, #256] \n"
+ "vld4.u8 {d24-d27}, [%1], %10\n"
+
+ "vtrn.u8 q8, q10 \n" // _src45t_r
+ "vtrn.u8 q9, q11 \n"
+
+ "pld [%2, #256] \n"
+ "vld4.u8 {d28-d31}, [%2], %10\n"
+
+ "vtrn.u8 q12, q14 \n" // _src67t_r
+ "vtrn.u8 q13, q15 \n"
+
+ "sub %1, %1, %10, lsl #2 \n" // restore src0
+
+ "vtrn.u16 q0, q4 \n" // _src02tt_r
+ "vtrn.u16 q1, q5 \n"
+
+ "sub %2, %2, %10, lsl #2 \n" // restore src1
+
+ "vtrn.u16 q2, q6 \n" // _src13tt_r
+ "vtrn.u16 q3, q7 \n"
+
+ "add %1, #32 \n" // src0 += 32
+
+ "vtrn.u16 q8, q12 \n" // _src46tt_r
+ "vtrn.u16 q9, q13 \n"
+
+ "add %2, #32 \n" // src1 += 32
+
+ "vtrn.u16 q10, q14 \n" // _src57tt_r
+ "vtrn.u16 q11, q15 \n"
+
+ "vtrn.u32 q0, q8 \n" // _src04ttt_r
+ "vtrn.u32 q1, q9 \n"
+
+ "vtrn.u32 q2, q10 \n" // _src15ttt_r
+ "vst4.u8 {d0-d3}, [%3], %11 \n"
+ "vtrn.u32 q3, q11 \n"
+
+ "vtrn.u32 q4, q12 \n" // _src26ttt_r
+ "vst4.u8 {d4-d7}, [%4], %11 \n"
+ "vtrn.u32 q5, q13 \n"
+
+ "vtrn.u32 q6, q14 \n" // _src37ttt_r
+ "vst4.u8 {d8-d11}, [%3], %11 \n"
+ "vtrn.u32 q7, q15 \n"
+
+ "subs %0, #1 \n"
+
+ "vst4.u8 {d16-d19}, [%3], %11\n"
+ "vst4.u8 {d12-d15}, [%4], %11\n"
+ "vst4.u8 {d20-d23}, [%4], %11\n"
+ "vst4.u8 {d24-d27}, [%3], %11\n"
+ "vst4.u8 {d28-d31}, [%4], %11\n"
+
+ "bne 0b \n"
+ : "=r"(nn), // %0
+ "=r"(src0), // %1
+ "=r"(src1), // %2
+ "=r"(dst7), // %3
+ "=r"(dst6) // %4
+ : "0"(nn),
+ "1"(src0),
+ "2"(src1),
+ "3"(dst7),
+ "4"(dst6),
+ "r"(src_step), // %10
+ "r"(dst_step) // %11
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+ }
+#endif // __aarch64__
+ for (; remain > 0; remain--)
+ {
+ dst7[0] = src0[0];
+ dst7[1] = src0[1];
+ dst7[2] = src0[2];
+ dst7[3] = src0[3];
+ dst7[4] = src1[0];
+ dst7[5] = src1[1];
+ dst7[6] = src1[2];
+ dst7[7] = src1[3];
+ dst7[8] = src0[0 + src_step];
+ dst7[9] = src0[1 + src_step];
+ dst7[10] = src0[2 + src_step];
+ dst7[11] = src0[3 + src_step];
+ dst7[12] = src1[0 + src_step];
+ dst7[13] = src1[1 + src_step];
+ dst7[14] = src1[2 + src_step];
+ dst7[15] = src1[3 + src_step];
+ dst7[16] = src0[0 + 2 * src_step];
+ dst7[17] = src0[1 + 2 * src_step];
+ dst7[18] = src0[2 + 2 * src_step];
+ dst7[19] = src0[3 + 2 * src_step];
+ dst7[20] = src1[0 + 2 * src_step];
+ dst7[21] = src1[1 + 2 * src_step];
+ dst7[22] = src1[2 + 2 * src_step];
+ dst7[23] = src1[3 + 2 * src_step];
+ dst7[24] = src0[0 + 3 * src_step];
+ dst7[25] = src0[1 + 3 * src_step];
+ dst7[26] = src0[2 + 3 * src_step];
+ dst7[27] = src0[3 + 3 * src_step];
+ dst7[28] = src1[0 + 3 * src_step];
+ dst7[29] = src1[1 + 3 * src_step];
+ dst7[30] = src1[2 + 3 * src_step];
+ dst7[31] = src1[3 + 3 * src_step];
+
+ src0 += 4;
+ src1 += 4;
+
+ dst7 -= stride;
+ }
+
+ src0 += srcwgap + 7 * srcstride;
+ }
+#endif // __ARM_NEON
+ for (; y < srch; y++)
+ {
+ unsigned char* dst0 = dstend + y * 4;
+
+ int x = 0;
+ for (; x < srcw; x++)
+ {
+ dst0[0] = src0[0];
+ dst0[1] = src0[1];
+ dst0[2] = src0[2];
+ dst0[3] = src0[3];
+
+ src0 += 4;
+ dst0 -= stride;
+ }
+
+ src0 += srcwgap;
+ }
+}
+
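+// stride-less convenience overloads, assuming tightly packed rows:
+// srcstride = srcw * channels and stride = w * channels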
+void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
+{
+ return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type);
+}
+
+void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
+{
+ return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type);
+}
+
+void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
+{
+ return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type);
+}
+
+void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
+{
+ return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type);
+}
+
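+// rotate type follows the EXIF orientation convention:
+//   1 = none          2 = horizontal flip   3 = rotate 180   4 = vertical flip
+//   5 = transpose     6 = rotate 90 CW      7 = transverse   8 = rotate 90 CCW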
+void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
+{
+ // assert srcw == w && srch == h for types 1-4 (no transpose)
+ // assert srcw == h && srch == w for types 5-8 (transposing rotations)
+
+ switch (type)
+ {
+ case 1:
+ kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 2:
+ kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 3:
+ kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 4:
+ kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 5:
+ kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 6:
+ kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 7:
+ kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 8:
+ kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ default:
+ // unsupported rotate type
+ break;
+ }
+}
+
+void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
+{
+ // assert srcw == w && srch == h for types 1-4 (no transpose)
+ // assert srcw == h && srch == w for types 5-8 (transposing rotations)
+
+ switch (type)
+ {
+ case 1:
+ kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 2:
+ kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 3:
+ kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 4:
+ kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 5:
+ kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 6:
+ kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 7:
+ kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 8:
+ kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ default:
+ // unsupported rotate type
+ break;
+ }
+}
+
+void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
+{
+ // assert srcw == w && srch == h for types 1-4 (no transpose)
+ // assert srcw == h && srch == w for types 5-8 (transposing rotations)
+
+ switch (type)
+ {
+ case 1:
+ kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 2:
+ kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 3:
+ kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 4:
+ kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 5:
+ kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 6:
+ kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 7:
+ kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 8:
+ kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ default:
+ // unsupported rotate type
+ break;
+ }
+}
+
+void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
+{
+ // assert srcw == w && srch == h for types 1-4 (no transpose)
+ // assert srcw == h && srch == w for types 5-8 (transposing rotations)
+
+ switch (type)
+ {
+ case 1:
+ kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 2:
+ kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 3:
+ kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 4:
+ kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 5:
+ kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 6:
+ kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 7:
+ kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ case 8:
+ kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride);
+ break;
+ default:
+ // unsupported rotate type
+ break;
+ }
+}
+
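+// NV12/NV21 (YUV420 semi-planar) layout: a full-resolution Y plane followed by
+// a half-resolution interleaved UV plane; rotating the UV plane as a 2-channel
+// image keeps each U/V pair together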
+void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
+{
+ // assert srcw % 2 == 0
+ // assert srch % 2 == 0
+ // assert w % 2 == 0
+ // assert h % 2 == 0
+
+ const unsigned char* srcY = src;
+ unsigned char* dstY = dst;
+ kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type);
+
+ const unsigned char* srcUV = src + srcw * srch;
+ unsigned char* dstUV = dst + w * h;
+ kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type);
+}
diff --git a/highgui/src/kanna_rotate.h b/highgui/src/kanna_rotate.h
new file mode 100644
index 00000000..8414fa62
--- /dev/null
+++ b/highgui/src/kanna_rotate.h
@@ -0,0 +1,54 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef KANNA_ROTATE_H
+#define KANNA_ROTATE_H
+
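+// Rotate or flip an 8-bit image with 1, 2, 3 or 4 interleaved channels.
+// The type argument follows the EXIF orientation convention:
+//   1 = none, 2 = horizontal flip, 3 = rotate 180, 4 = vertical flip,
+//   5 = transpose, 6 = rotate 90 CW, 7 = transverse, 8 = rotate 90 CCW
+// Types 1-4 expect w == srcw and h == srch; types 5-8 expect w == srch and
+// h == srcw. The stride-less overloads assume tightly packed rows.
+//
+// A minimal usage sketch (with hypothetical buffers): rotate a tightly packed
+// 640x480 RGB image 90 degrees clockwise (type 6) into a 480x640 buffer:
+//
+//   std::vector<unsigned char> src(640 * 480 * 3); // filled elsewhere
+//   std::vector<unsigned char> dst(480 * 640 * 3);
+//   kanna_rotate_c3(src.data(), 640, 480, dst.data(), 480, 640, 6);
+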
+void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+
+void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+
+void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+
+void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+
+void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+
+void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+
+void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+
+void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+
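+// Rotate an NV12/NV21 (YUV420 semi-planar) image: a full-size Y plane followed
+// by a half-size interleaved UV plane; all dimensions must be even.
+void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+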
+#endif // KANNA_ROTATE_H