From 07c3bd09f7e883f319ae2a1c0e882e51a9124903 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 9 Feb 2024 22:56:57 +0800 Subject: [PATCH 01/10] aw jpg decoder --- highgui/CMakeLists.txt | 2 + highgui/src/highgui.cpp | 61 + highgui/src/jpeg_decoder_aw.cpp | 1241 +++++++ highgui/src/jpeg_decoder_aw.h | 39 + highgui/src/kanna_rotate.cpp | 6112 +++++++++++++++++++++++++++++++ highgui/src/kanna_rotate.h | 36 + 6 files changed, 7491 insertions(+) create mode 100644 highgui/src/jpeg_decoder_aw.cpp create mode 100644 highgui/src/jpeg_decoder_aw.h create mode 100644 highgui/src/kanna_rotate.cpp create mode 100644 highgui/src/kanna_rotate.h diff --git a/highgui/CMakeLists.txt b/highgui/CMakeLists.txt index 663e7749..d8ff69a3 100644 --- a/highgui/CMakeLists.txt +++ b/highgui/CMakeLists.txt @@ -7,8 +7,10 @@ set(highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/capture_v4l2_rk_aiq.cpp ${CMAKE_CURRENT_LIST_DIR}/src/exif.cpp ${CMAKE_CURRENT_LIST_DIR}/src/highgui.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_aw.cpp ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_cvi.cpp ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_encoder_rk_mpp.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/kanna_rotate.cpp ${CMAKE_CURRENT_LIST_DIR}/src/videocapture.cpp ) diff --git a/highgui/src/highgui.cpp b/highgui/src/highgui.cpp index 85884c5f..c4eb63e8 100644 --- a/highgui/src/highgui.cpp +++ b/highgui/src/highgui.cpp @@ -40,6 +40,7 @@ #include "stb_image_write.h" #if defined __linux__ +#include "jpeg_decoder_aw.h" #include "jpeg_decoder_cvi.h" #include "jpeg_encoder_rk_mpp.h" #endif @@ -154,6 +155,36 @@ Mat imread(const String& filename, int flags) if (buf_size > 4 && buf_data[0] == 0xFF && buf_data[1] == 0xD8) { // jpg magic + if (jpeg_decoder_aw::supported(buf_data, buf_size)) + { + int w = 0; + int h = 0; + int c = desired_channels; + + jpeg_decoder_aw d; + int ret = d.init(buf_data, buf_size, &w, &h, &c); + if (ret == 0 && (c == 1 || c == 3)) + { + Mat img; + if (c == 1) + { + img.create(h, w, CV_8UC1); + } + else // if (c == 3) + { + img.create(h, w, CV_8UC3); + } + + ret = d.decode(buf_data, buf_size, img.data); + if (ret == 0) + { + d.deinit(); + return img; + } + } + + // fallback to stbi_load_from_memory + } if (jpeg_decoder_cvi::supported(buf_data, buf_size)) { int w = 0; @@ -410,6 +441,36 @@ Mat imdecode(InputArray _buf, int flags) if (buf_size > 4 && buf_data[0] == 0xFF && buf_data[1] == 0xD8) { // jpg magic + if (jpeg_decoder_aw::supported(buf_data, buf_size)) + { + int w = 0; + int h = 0; + int c = desired_channels; + + jpeg_decoder_aw d; + int ret = d.init(buf_data, buf_size, &w, &h, &c); + if (ret == 0 && (c == 1 || c == 3)) + { + Mat img; + if (c == 1) + { + img.create(h, w, CV_8UC1); + } + else // if (c == 3) + { + img.create(h, w, CV_8UC3); + } + + ret = d.decode(buf_data, buf_size, img.data); + if (ret == 0) + { + d.deinit(); + return img; + } + } + + // fallback to stbi_load_from_memory + } if (jpeg_decoder_cvi::supported(buf_data, buf_size)) { int w = 0; diff --git a/highgui/src/jpeg_decoder_aw.cpp b/highgui/src/jpeg_decoder_aw.cpp new file mode 100644 index 00000000..c3671c2c --- /dev/null +++ b/highgui/src/jpeg_decoder_aw.cpp @@ -0,0 +1,1241 @@ +// +// Copyright (C) 2024 nihui +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "jpeg_decoder_aw.h"
+
+#if defined __linux__
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <stdint.h>
+#include <dlfcn.h>
+
+#include <algorithm>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#include "exif.hpp"
+#include "kanna_rotate.h"
+
+// 0 = unknown
+// 1 = tinyvision
+static int get_device_model()
+{
+    static int device_model = -1;
+
+    if (device_model >= 0)
+        return device_model;
+
+    device_model = 0;
+
+    FILE* fp = fopen("/proc/device-tree/model", "rb");
+    if (fp)
+    {
+        char buf[1024];
+        if (fgets(buf, 1024, fp))
+        {
+            if (strncmp(buf, "sun8iw21", 8) == 0)
+            {
+                // tinyvision
+                device_model = 1;
+            }
+        }
+        fclose(fp);
+    }
+
+    return device_model;
+}
+
+static bool is_device_whitelisted()
+{
+    const int device_model = get_device_model();
+
+    if (device_model == 1)
+    {
+        // tinyvision
+        return true;
+    }
+
+    return false;
+}
+
+extern "C" {
+
+typedef void (*PFN_AddVDPlugin)();
+typedef void (*PFN_AddVDPluginSingle)(const char* lib);
+
+}
+
+static void* libvideoengine = 0;
+
+static PFN_AddVDPlugin AddVDPlugin = 0;
+static PFN_AddVDPluginSingle AddVDPluginSingle = 0;
+
+static int load_videoengine_library()
+{
+    if (libvideoengine)
+        return 0;
+
+    // check device whitelist
+    bool whitelisted = is_device_whitelisted();
+    if (!whitelisted)
+    {
+        fprintf(stderr, "this device is not whitelisted for jpeg decoder aw cedarc\n");
+        return -1;
+    }
+
+    libvideoengine = dlopen("libvideoengine.so", RTLD_LOCAL | RTLD_NOW);
+    if (!libvideoengine)
+    {
+        libvideoengine = dlopen("/usr/lib/libvideoengine.so", RTLD_LOCAL | RTLD_NOW);
+    }
+    if (!libvideoengine)
+    {
+        return -1;
+    }
+
+    AddVDPlugin = (PFN_AddVDPlugin)dlsym(libvideoengine, "AddVDPlugin");
+    AddVDPluginSingle = (PFN_AddVDPluginSingle)dlsym(libvideoengine, "AddVDPluginSingle");
+
+    return 0;
+}
+
+static int unload_videoengine_library()
+{
+    if (!libvideoengine)
+        return 0;
+
+    dlclose(libvideoengine);
+    libvideoengine = 0;
+
+    AddVDPlugin = 0;
+    AddVDPluginSingle = 0;
+
+    return 0;
+}
+
+class videoengine_library_loader
+{
+public:
+    bool ready;
+
+    videoengine_library_loader()
+    {
+        ready = (load_videoengine_library() == 0);
+
+        if (libvideoengine && AddVDPluginSingle)
+        {
+            // AddVDPlugin();
+            AddVDPluginSingle("/usr/lib/libawmjpeg.so");
+        }
+    }
+
+    ~videoengine_library_loader()
+    {
+        unload_videoengine_library();
+    }
+};
+
+static videoengine_library_loader videoengine;
+
+
+extern "C" {
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+
+#if (INTPTR_MAX == INT64_MAX)
+    typedef unsigned long u64;
+#else
+    typedef unsigned long long u64;
+#endif
+
+typedef signed char s8;
+typedef signed short s16;
+typedef signed int s32;
+
+#if (INTPTR_MAX == INT64_MAX)
+    typedef signed long s64;
+#else
+    typedef signed long long s64;
+#endif
+
+typedef uintptr_t size_addr;
+
+struct VeOpsS;
+struct ScMemOpsS;
+
+enum EVIDEOCODECFORMAT
+{
+    VIDEO_CODEC_FORMAT_UNKNOWN = 0,
+    VIDEO_CODEC_FORMAT_MJPEG = 0x101,
+    VIDEO_CODEC_FORMAT_MPEG1 = 0x102,
+    VIDEO_CODEC_FORMAT_MPEG2 = 0x103,
+    VIDEO_CODEC_FORMAT_MPEG4 = 0x104,
+    VIDEO_CODEC_FORMAT_MSMPEG4V1 = 0x105,
+    VIDEO_CODEC_FORMAT_MSMPEG4V2 = 0x106,
+    VIDEO_CODEC_FORMAT_DIVX3 = 0x107,
//* not support + VIDEO_CODEC_FORMAT_DIVX4 = 0x108, //* not support + VIDEO_CODEC_FORMAT_DIVX5 = 0x109, //* not support + VIDEO_CODEC_FORMAT_XVID = 0x10a, + VIDEO_CODEC_FORMAT_H263 = 0x10b, + VIDEO_CODEC_FORMAT_SORENSSON_H263 = 0x10c, + VIDEO_CODEC_FORMAT_RXG2 = 0x10d, + VIDEO_CODEC_FORMAT_WMV1 = 0x10e, + VIDEO_CODEC_FORMAT_WMV2 = 0x10f, + VIDEO_CODEC_FORMAT_WMV3 = 0x110, + VIDEO_CODEC_FORMAT_VP6 = 0x111, + VIDEO_CODEC_FORMAT_VP8 = 0x112, + VIDEO_CODEC_FORMAT_VP9 = 0x113, + VIDEO_CODEC_FORMAT_RX = 0x114, + VIDEO_CODEC_FORMAT_H264 = 0x115, + VIDEO_CODEC_FORMAT_H265 = 0x116, + VIDEO_CODEC_FORMAT_AVS = 0x117, + VIDEO_CODEC_FORMAT_AVS2 = 0x118, + + VIDEO_CODEC_FORMAT_MAX = VIDEO_CODEC_FORMAT_AVS2, + VIDEO_CODEC_FORMAT_MIN = VIDEO_CODEC_FORMAT_MJPEG, +}; + +enum EPIXELFORMAT +{ + PIXEL_FORMAT_DEFAULT = 0, + + PIXEL_FORMAT_YUV_PLANER_420 = 1, + PIXEL_FORMAT_YUV_PLANER_422 = 2, + PIXEL_FORMAT_YUV_PLANER_444 = 3, + + PIXEL_FORMAT_YV12 = 4, + PIXEL_FORMAT_NV21 = 5, + PIXEL_FORMAT_NV12 = 6, + PIXEL_FORMAT_YUV_MB32_420 = 7, + PIXEL_FORMAT_YUV_MB32_422 = 8, + PIXEL_FORMAT_YUV_MB32_444 = 9, + + PIXEL_FORMAT_RGBA = 10, + PIXEL_FORMAT_ARGB = 11, + PIXEL_FORMAT_ABGR = 12, + PIXEL_FORMAT_BGRA = 13, + + PIXEL_FORMAT_YUYV = 14, + PIXEL_FORMAT_YVYU = 15, + PIXEL_FORMAT_UYVY = 16, + PIXEL_FORMAT_VYUY = 17, + + PIXEL_FORMAT_PLANARUV_422 = 18, + PIXEL_FORMAT_PLANARVU_422 = 19, + PIXEL_FORMAT_PLANARUV_444 = 20, + PIXEL_FORMAT_PLANARVU_444 = 21, + PIXEL_FORMAT_P010_UV = 22, + PIXEL_FORMAT_P010_VU = 23, + + PIXEL_FORMAT_MIN = PIXEL_FORMAT_DEFAULT, + PIXEL_FORMAT_MAX = PIXEL_FORMAT_PLANARVU_444, +}; + +typedef enum CONTROL_AFBC_MODE { + DISABLE_AFBC_ALL_SIZE = 0, + ENABLE_AFBC_JUST_BIG_SIZE = 1, //* >= 4k + ENABLE_AFBC_ALL_SIZE = 2, +}eControlAfbcMode; + +typedef enum CONTROL_IPTV_MODE { + DISABLE_IPTV_ALL_SIZE = 0, + ENABLE_IPTV_JUST_SMALL_SIZE = 1, //* < 4k + ENABLE_IPTV_ALL_SIZE = 2, +}eControlIptvMode; + +typedef enum COMMON_CONFIG_FLAG +{ + IS_MIRACAST_STREAM = 1, + +}eCommonConfigFlag; + +enum EVDECODERESULT +{ + VDECODE_RESULT_UNSUPPORTED = -1, + VDECODE_RESULT_OK = 0, + VDECODE_RESULT_FRAME_DECODED = 1, + VDECODE_RESULT_CONTINUE = 2, + VDECODE_RESULT_KEYFRAME_DECODED = 3, + VDECODE_RESULT_NO_FRAME_BUFFER = 4, + VDECODE_RESULT_NO_BITSTREAM = 5, + VDECODE_RESULT_RESOLUTION_CHANGE = 6, + + VDECODE_RESULT_MIN = VDECODE_RESULT_UNSUPPORTED, + VDECODE_RESULT_MAX = VDECODE_RESULT_RESOLUTION_CHANGE, +}; + +typedef struct VIDEOSTREAMINFO +{ + int eCodecFormat; + int nWidth; + int nHeight; + int nFrameRate; + int nFrameDuration; + int nAspectRatio; + int bIs3DStream; + int nCodecSpecificDataLen; + char* pCodecSpecificData; + int bSecureStreamFlag; + int bSecureStreamFlagLevel1; + int bIsFramePackage; /* 1: frame package; 0: stream package */ + int h265ReferencePictureNum; + int bReOpenEngine; + int bIsFrameCtsTestFlag; +}VideoStreamInfo; + +typedef struct VCONFIG +{ + int bScaleDownEn; + int bRotationEn; + int bSecOutputEn; + int nHorizonScaleDownRatio; + int nVerticalScaleDownRatio; + int nSDWidth; + int nSDHeight; + int bAnySizeSD; + int nSecHorizonScaleDownRatio; + int nSecVerticalScaleDownRatio; + int nRotateDegree; + int bThumbnailMode; + int eOutputPixelFormat; + int eSecOutputPixelFormat; + int bNoBFrames; + int bDisable3D; + int bSupportMaf; //not use + int bDispErrorFrame; + int nVbvBufferSize; + int nFrameBufferNum; + int bSecureosEn; + int bGpuBufValid; + int nAlignStride; + int bIsSoftDecoderFlag; + int bVirMallocSbm; + int bSupportPallocBufBeforeDecode; + //only used for xuqi, set this flag to 1 meaning 
palloc the fbm buffer before + // decode the sequence, to short the first frame decoing time + int nDeInterlaceHoldingFrameBufferNum; + int nDisplayHoldingFrameBufferNum; + int nRotateHoldingFrameBufferNum; + int nDecodeSmoothFrameBufferNum; + int bIsTvStream; + int nLbcLossyComMod; //1:1.5x; 2:2x; 3:2.5x; + unsigned int bIsLossy; //lossy compression or not + unsigned int bRcEn; //compact storage or not + + struct ScMemOpsS *memops; + eControlAfbcMode eCtlAfbcMode; + eControlIptvMode eCtlIptvMode; + + VeOpsS* veOpsS; + void* pVeOpsSelf; + int bConvertVp910bitTo8bit; + unsigned int nVeFreq; + + int bCalledByOmxFlag; + + int bSetProcInfoEnable; //* for check the decoder info by cat devices-note + int nSetProcInfoFreq; + int nChannelNum; + int nSupportMaxWidth; //the max width of mjpeg continue decode + int nSupportMaxHeight; //the max height of mjpeg continue decode + eCommonConfigFlag commonConfigFlag; + int bATMFlag; +}VConfig; + +typedef struct VIDEOSTREAMDATAINFO +{ + char* pData; + int nLength; + int64_t nPts; + int64_t nPcr; + int bIsFirstPart; + int bIsLastPart; + int nID; + int nStreamIndex; + int bValid; + unsigned int bVideoInfoFlag; + void* pVideoInfo; +}VideoStreamDataInfo; + +typedef enum VIDEO_TRANSFER +{ + VIDEO_TRANSFER_RESERVED_0 = 0, + VIDEO_TRANSFER_BT1361 = 1, + VIDEO_TRANSFER_UNSPECIFIED = 2, + VIDEO_TRANSFER_RESERVED_1 = 3, + VIDEO_TRANSFER_GAMMA2_2 = 4, + VIDEO_TRANSFER_GAMMA2_8 = 5, + VIDEO_TRANSFER_SMPTE_170M = 6, + VIDEO_TRANSFER_SMPTE_240M = 7, + VIDEO_TRANSFER_LINEAR = 8, + VIDEO_TRANSFER_LOGARITHMIC_0 = 9, + VIDEO_TRANSFER_LOGARITHMIC_1 = 10, + VIDEO_TRANSFER_IEC61966 = 11, + VIDEO_TRANSFER_BT1361_EXTENDED = 12, + VIDEO_TRANSFER_SRGB = 13, + VIDEO_TRANSFER_BT2020_0 = 14, + VIDEO_TRANSFER_BT2020_1 = 15, + VIDEO_TRANSFER_ST2084 = 16, + VIDEO_TRANSFER_ST428_1 = 17, + VIDEO_TRANSFER_HLG = 18, + VIDEO_TRANSFER_RESERVED = 19, //* 19~255 +}VIDEO_TRANSFER; + +typedef enum VIDEO_MATRIX_COEFFS +{ + VIDEO_MATRIX_COEFFS_IDENTITY = 0, + VIDEO_MATRIX_COEFFS_BT709 = 1, + VIDEO_MATRIX_COEFFS_UNSPECIFIED_0 = 2, + VIDEO_MATRIX_COEFFS_RESERVED_0 = 3, + VIDEO_MATRIX_COEFFS_BT470M = 4, + VIDEO_MATRIX_COEFFS_BT601_625_0 = 5, + VIDEO_MATRIX_COEFFS_BT601_625_1 = 6, + VIDEO_MATRIX_COEFFS_SMPTE_240M = 7, + VIDEO_MATRIX_COEFFS_YCGCO = 8, + VIDEO_MATRIX_COEFFS_BT2020 = 9, + VIDEO_MATRIX_COEFFS_BT2020_CONSTANT_LUMINANCE = 10, + VIDEO_MATRIX_COEFFS_SOMPATE = 11, + VIDEO_MATRIX_COEFFS_CD_NON_CONSTANT_LUMINANCE = 12, + VIDEO_MATRIX_COEFFS_CD_CONSTANT_LUMINANCE = 13, + VIDEO_MATRIX_COEFFS_BTICC = 14, + VIDEO_MATRIX_COEFFS_RESERVED = 15, //* 15~255 +}VIDEO_MATRIX_COEFFS; + +typedef enum VIDEO_FULL_RANGE_FLAG +{ + VIDEO_FULL_RANGE_LIMITED = 0, + VIDEO_FULL_RANGE_FULL = 1, +}VIDEO_FULL_RANGE_FLAG; + +typedef struct VIDEO_FRM_MV_INFO +{ + s16 nMaxMv_x; + s16 nMinMv_x; + s16 nAvgMv_x; + s16 nMaxMv_y; + s16 nMinMv_y; + s16 nAvgMv_y; + s16 nMaxMv; + s16 nMinMv; + s16 nAvgMv; + s16 SkipRatio; +}VIDEO_FRM_MV_INFO; + +typedef enum VID_FRAME_TYPE +{ + VIDEO_FORMAT_TYPE_UNKONWN = 0, + VIDEO_FORMAT_TYPE_I, + VIDEO_FORMAT_TYPE_P, + VIDEO_FORMAT_TYPE_B, + VIDEO_FORMAT_TYPE_IDR, + VIDEO_FORMAT_TYPE_BUTT, +}VID_FRAME_TYPE; + +typedef struct VIDEO_FRM_STATUS_INFO +{ + VID_FRAME_TYPE enVidFrmType; + int nVidFrmSize; + int nVidFrmDisW; + int nVidFrmDisH; + int nVidFrmQP; + double nAverBitRate; + double nFrameRate; + int64_t nVidFrmPTS; + VIDEO_FRM_MV_INFO nMvInfo; + int bDropPreFrame; +}VIDEO_FRM_STATUS_INFO; + +typedef struct VIDEOPICTURE +{ + int nID; + int nStreamIndex; + int ePixelFormat; + int nWidth; + int 
nHeight; + int nLineStride; + int nTopOffset; + int nLeftOffset; + int nBottomOffset; + int nRightOffset; + int nFrameRate; + int nAspectRatio; + int bIsProgressive; + int bTopFieldFirst; + int bRepeatTopField; + int64_t nPts; + int64_t nPcr; + char* pData0; + char* pData1; + char* pData2; + char* pData3; + int bMafValid; + char* pMafData; + int nMafFlagStride; + int bPreFrmValid; + int nBufId; + size_addr phyYBufAddr; + size_addr phyCBufAddr; + void* pPrivate; + int nBufFd; + int nBufStatus; + int bTopFieldError; + int bBottomFieldError; + int nColorPrimary; // default value is 0xffffffff, valid value id 0x0000xxyy + // xx: is video full range code + // yy: is matrix coefficient + int bFrameErrorFlag; + + //* to save hdr info and afbc header info + void* pMetaData; + + //*display related parameter + VIDEO_FULL_RANGE_FLAG video_full_range_flag; + VIDEO_TRANSFER transfer_characteristics; + VIDEO_MATRIX_COEFFS matrix_coeffs; + u8 colour_primaries; + //*end of display related parameter defined + //size_addr nLower2BitPhyAddr; + int nLower2BitBufSize; + int nLower2BitBufOffset; + int nLower2BitBufStride; + int b10BitPicFlag; + int bEnableAfbcFlag; + int nLbcLossyComMod;//1:1.5x; 2:2x; 3:2.5x; + unsigned int bIsLossy; //lossy compression or not + unsigned int bRcEn; //compact storage or not + + int nBufSize; + int nAfbcSize; + int nLbcSize; + int nDebugCount; + VIDEO_FRM_STATUS_INFO nCurFrameInfo; +}VideoPicture; + +typedef void* VideoDecoder; + +typedef VideoDecoder* (*PFN_CreateVideoDecoder)(); +typedef void (*PFN_DestroyVideoDecoder)(VideoDecoder* pDecoder); +typedef int (*PFN_InitializeVideoDecoder)(VideoDecoder* pDecoder, VideoStreamInfo* pVideoInfo, VConfig* pVconfig); +typedef int (*PFN_RequestVideoStreamBuffer)(VideoDecoder* pDecoder, int nRequireSize, char** ppBuf, int* pBufSize, char** ppRingBuf, int* pRingBufSize, int nStreamBufIndex); +typedef int (*PFN_SubmitVideoStreamData)(VideoDecoder* pDecoder, VideoStreamDataInfo* pDataInfo, int nStreamBufIndex); +typedef int (*PFN_DecodeVideoStream)(VideoDecoder* pDecoder, int bEndOfStream, int bDecodeKeyFrameOnly, int bDropBFrameIfDelay, int64_t nCurrentTimeUs); +typedef VideoPicture* (*PFN_RequestPicture)(VideoDecoder* pDecoder, int nStreamIndex); +typedef int (*PFN_ReturnPicture)(VideoDecoder* pDecoder, VideoPicture* pPicture); + +} + + +static void* libvdecoder = 0; + +static PFN_CreateVideoDecoder CreateVideoDecoder = 0; +static PFN_DestroyVideoDecoder DestroyVideoDecoder = 0; +static PFN_InitializeVideoDecoder InitializeVideoDecoder = 0; +static PFN_RequestVideoStreamBuffer RequestVideoStreamBuffer = 0; +static PFN_SubmitVideoStreamData SubmitVideoStreamData = 0; +static PFN_DecodeVideoStream DecodeVideoStream = 0; +static PFN_RequestPicture RequestPicture = 0; +static PFN_ReturnPicture ReturnPicture = 0; + +static int load_vdecoder_library() +{ + if (libvdecoder) + return 0; + + // check device whitelist + bool whitelisted = is_device_whitelisted(); + if (!whitelisted) + { + fprintf(stderr, "this device is not whitelisted for jpeg decoder aw cedarc\n"); + return -1; + } + + libvdecoder = dlopen("libvdecoder.so", RTLD_LOCAL | RTLD_NOW); + if (!libvdecoder) + { + libvdecoder = dlopen("/usr/lib/libvdecoder.so", RTLD_LOCAL | RTLD_NOW); + } + if (!libvdecoder) + { + return -1; + } + + CreateVideoDecoder = (PFN_CreateVideoDecoder)dlsym(libvdecoder, "CreateVideoDecoder"); + DestroyVideoDecoder = (PFN_DestroyVideoDecoder)dlsym(libvdecoder, "DestroyVideoDecoder"); + InitializeVideoDecoder = (PFN_InitializeVideoDecoder)dlsym(libvdecoder, 
"InitializeVideoDecoder"); + RequestVideoStreamBuffer = (PFN_RequestVideoStreamBuffer)dlsym(libvdecoder, "RequestVideoStreamBuffer"); + SubmitVideoStreamData = (PFN_SubmitVideoStreamData)dlsym(libvdecoder, "SubmitVideoStreamData"); + DecodeVideoStream = (PFN_DecodeVideoStream)dlsym(libvdecoder, "DecodeVideoStream"); + RequestPicture = (PFN_RequestPicture)dlsym(libvdecoder, "RequestPicture"); + ReturnPicture = (PFN_ReturnPicture)dlsym(libvdecoder, "ReturnPicture"); + + return 0; +} + +static int unload_vdecoder_library() +{ + if (!libvdecoder) + return 0; + + dlclose(libvdecoder); + libvdecoder = 0; + + CreateVideoDecoder = 0; + DestroyVideoDecoder = 0; + InitializeVideoDecoder = 0; + RequestVideoStreamBuffer = 0; + SubmitVideoStreamData = 0; + DecodeVideoStream = 0; + RequestPicture = 0; + ReturnPicture = 0; + + return 0; +} + +class vdecoder_library_loader +{ +public: + bool ready; + + vdecoder_library_loader() + { + ready = (load_vdecoder_library() == 0); + } + + ~vdecoder_library_loader() + { + unload_vdecoder_library(); + } +}; + +static vdecoder_library_loader vdecoder; + + +static void yuv420sp2bgr_neon(const unsigned char* yptr, const unsigned char* vuptr, int w, int h, int stride, unsigned char* bgr) +{ +#if __ARM_NEON + uint8x8_t _v128 = vdup_n_u8(128); + int8x8_t _v90 = vdup_n_s8(90); + int8x8_t _v46 = vdup_n_s8(46); + int8x8_t _v22 = vdup_n_s8(22); + int8x8_t _v113 = vdup_n_s8(113); +#endif // __ARM_NEON + + for (int y = 0; y < h; y += 2) + { + const unsigned char* yptr0 = yptr; + const unsigned char* yptr1 = yptr + stride; + unsigned char* bgr0 = bgr; + unsigned char* bgr1 = bgr + w * 3; + +#if __ARM_NEON + int nn = w >> 3; + int remain = w - (nn << 3); +#else + int remain = w; +#endif // __ARM_NEON + +#if __ARM_NEON + for (; nn > 0; nn--) + { + int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6)); + int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6)); + + int8x8_t _vvuu = vreinterpret_s8_u8(vsub_u8(vld1_u8(vuptr), _v128)); + int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu); + int8x8_t _vv = _vvvvuuuu.val[0]; + int8x8_t _uu = _vvvvuuuu.val[1]; + + int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90); + int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46); + _g0 = vmlsl_s8(_g0, _uu, _v22); + int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113); + + int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90); + int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46); + _g1 = vmlsl_s8(_g1, _uu, _v22); + int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113); + + uint8x8x3_t _bgr0; + _bgr0.val[0] = vqshrun_n_s16(_b0, 6); + _bgr0.val[1] = vqshrun_n_s16(_g0, 6); + _bgr0.val[2] = vqshrun_n_s16(_r0, 6); + + uint8x8x3_t _bgr1; + _bgr1.val[0] = vqshrun_n_s16(_b1, 6); + _bgr1.val[1] = vqshrun_n_s16(_g1, 6); + _bgr1.val[2] = vqshrun_n_s16(_r1, 6); + + vst3_u8(bgr0, _bgr0); + vst3_u8(bgr1, _bgr1); + + yptr0 += 8; + yptr1 += 8; + vuptr += 8; + bgr0 += 24; + bgr1 += 24; + } +#endif // __ARM_NEON + +#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255); + for (; remain > 0; remain -= 2) + { + // R = 1.164 * yy + 1.596 * vv + // G = 1.164 * yy - 0.813 * vv - 0.391 * uu + // B = 1.164 * yy + 2.018 * uu + + // R = Y + (1.370705 * (V-128)) + // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128)) + // B = Y + (1.732446 * (U-128)) + + // R = ((Y << 6) + 87.72512 * (V-128)) >> 6 + // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6 + // B = ((Y << 6) + 110.876544 * (U-128)) >> 6 + + // R = ((Y << 6) + 90 * (V-128)) >> 6 + // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6 + // B = ((Y << 6) + 
113 * (U-128)) >> 6 + + // R = (yy + 90 * vv) >> 6 + // G = (yy - 46 * vv - 22 * uu) >> 6 + // B = (yy + 113 * uu) >> 6 + + int v = vuptr[0] - 128; + int u = vuptr[1] - 128; + + int ruv = 90 * v; + int guv = -46 * v + -22 * u; + int buv = 113 * u; + + int y00 = yptr0[0] << 6; + bgr0[0] = SATURATE_CAST_UCHAR((y00 + buv) >> 6); + bgr0[1] = SATURATE_CAST_UCHAR((y00 + guv) >> 6); + bgr0[2] = SATURATE_CAST_UCHAR((y00 + ruv) >> 6); + + int y01 = yptr0[1] << 6; + bgr0[3] = SATURATE_CAST_UCHAR((y01 + buv) >> 6); + bgr0[4] = SATURATE_CAST_UCHAR((y01 + guv) >> 6); + bgr0[5] = SATURATE_CAST_UCHAR((y01 + ruv) >> 6); + + int y10 = yptr1[0] << 6; + bgr1[0] = SATURATE_CAST_UCHAR((y10 + buv) >> 6); + bgr1[1] = SATURATE_CAST_UCHAR((y10 + guv) >> 6); + bgr1[2] = SATURATE_CAST_UCHAR((y10 + ruv) >> 6); + + int y11 = yptr1[1] << 6; + bgr1[3] = SATURATE_CAST_UCHAR((y11 + buv) >> 6); + bgr1[4] = SATURATE_CAST_UCHAR((y11 + guv) >> 6); + bgr1[5] = SATURATE_CAST_UCHAR((y11 + ruv) >> 6); + + yptr0 += 2; + yptr1 += 2; + vuptr += 2; + bgr0 += 6; + bgr1 += 6; + } +#undef SATURATE_CAST_UCHAR + + yptr += 2 * stride; + vuptr += stride - w; + bgr += 2 * 3 * w; + } +} + +class jpeg_decoder_aw_impl +{ +public: + jpeg_decoder_aw_impl(); + ~jpeg_decoder_aw_impl(); + + int init(const unsigned char* jpgdata, int size, int* width, int* height, int* ch); + + int decode(const unsigned char* jpgdata, int size, unsigned char* outbgr) const; + + int deinit(); + +protected: + int corrupted; // 0=fine + int width; + int height; + int ch; + int components; // 1=gray 3=yuv + int sampling_factor; // 0=444 1=422h 2=422v 3=420 4=400 + int progressive; + int orientation; // exif +}; + +jpeg_decoder_aw_impl::jpeg_decoder_aw_impl() +{ + corrupted = 1; + width = 0; + height = 0; + ch = 0; + components = 0; + sampling_factor = -1; + progressive = 0; + orientation = -1; +} + +jpeg_decoder_aw_impl::~jpeg_decoder_aw_impl() +{ + deinit(); +} + +int jpeg_decoder_aw_impl::init(const unsigned char* jpgdata, int jpgsize, int* _width, int* _height, int* _ch) +{ + if (!jpgdata || jpgsize < 4) + return -1; + + // jpg magic + if (jpgdata[0] != 0xFF || jpgdata[1] != 0xD8) + return -1; + + // parse jpg for width height components sampling-factor progressive + const unsigned char* pbuf = jpgdata; + const unsigned char* pend = pbuf + jpgsize; + while (pbuf + 1 < pend) + { + unsigned char marker0 = pbuf[0]; + unsigned char marker1 = pbuf[1]; + pbuf += 2; + + if (marker0 != 0xFF) + break; + + // SOI EOI + if (marker1 == 0xD8 || marker1 == 0xD9) + continue; + + if (marker1 != 0xC0 && marker1 != 0xC2) + { + unsigned int skipsize = (pbuf[0] << 8) + pbuf[1]; + pbuf += skipsize; + continue; + } + + // SOF0 SOF2 + unsigned int skipsize = (pbuf[0] << 8) + pbuf[1]; + if (pbuf + skipsize > pend) + break; + + // only 8bit supported + if (pbuf[2] != 8) + break; + + height = (pbuf[3] << 8) + pbuf[4]; + width = (pbuf[5] << 8) + pbuf[6]; + if (height == 0 || width == 0) + break; + + components = pbuf[7]; + if (components != 1 && components != 3) + break; + + pbuf += 8; + + unsigned char phv[3][2]; + for (int c = 0; c < components; c++) + { + unsigned char q = pbuf[1]; + phv[c][0] = (q >> 4); // 2 1 1 2 1 1 1 1 1 1 1 1 + phv[c][1] = (q & 15); // 2 1 1 1 1 1 2 1 1 1 1 1 + pbuf += 3; + } + + if (components == 3 && phv[1][0] == 1 && phv[1][1] == 1 && phv[2][0] == 1 && phv[2][1] == 1) + { + if (phv[0][0] == 1 && phv[0][1] == 1) sampling_factor = 0; + if (phv[0][0] == 2 && phv[0][1] == 1) sampling_factor = 1; + if (phv[0][0] == 1 && phv[0][1] == 2) sampling_factor = 2; + if 
(phv[0][0] == 2 && phv[0][1] == 2) sampling_factor = 3;
+        }
+        if (components == 1 && phv[0][0] == 1 && phv[0][1] == 1)
+        {
+            sampling_factor = 4;
+        }
+
+        // unsupported sampling factor
+        if (sampling_factor == -1)
+            break;
+
+        // jpg is fine
+        corrupted = 0;
+
+        if (marker1 == 0xC2)
+            progressive = 1;
+
+        break;
+    }
+
+    // resolve exif orientation
+    {
+        std::string s((const char*)jpgdata, jpgsize);
+        std::istringstream iss(s);
+
+        cv::ExifReader exif_reader(iss);
+        if (exif_reader.parse())
+        {
+            cv::ExifEntry_t e = exif_reader.getTag(cv::ORIENTATION);
+            orientation = e.field_u16;
+            if (orientation < 1 || orientation > 8)
+                orientation = 1;
+        }
+    }
+    // orientation = 7;
+
+    if (corrupted)
+        return -1;
+
+    // progressive not supported
+    if (progressive)
+        return -1;
+
+    // grayscale not supported
+    if (sampling_factor == 4)
+        return -1;
+
+    if (width % 2 != 0 || height % 2 != 0)
+        return -1;
+
+    if (width < 8 && height < 8)
+        return -1;
+
+    ch = *_ch;
+    if (ch == 0)
+        ch = components;
+
+    if (orientation > 4)
+    {
+        // swap width height
+        int tmp = height;
+        height = width;
+        width = tmp;
+    }
+
+    *_width = width;
+    *_height = height;
+    *_ch = ch;
+
+    return 0;
+}
+
+int jpeg_decoder_aw_impl::decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const
+{
+    if (!outbgr)
+        return -1;
+
+    // corrupted file
+    if (corrupted)
+        return -1;
+
+    // progressive not supported
+    if (progressive)
+        return -1;
+
+    // grayscale not supported
+    if (sampling_factor == 4)
+        return -1;
+
+    if (width % 2 != 0 || height % 2 != 0)
+        return -1;
+
+    if (width < 8 && height < 8)
+        return -1;
+
+    const int src_width = orientation > 4 ? height : width;
+    const int src_height = orientation > 4 ? width : height;
+
+    // flag
+    int ret_val = 0;
+
+    VideoDecoder* vdec = 0;
+    VideoPicture* vpic = 0;
+
+    char* pBuf = 0;
+    int bufSize = 0;
+    char* pRingBuf = 0;
+    int ringBufSize = 0;
+
+    {
+        vdec = CreateVideoDecoder();
+        if (!vdec)
+        {
+            fprintf(stderr, "CreateVideoDecoder failed\n");
+            ret_val = -1;
+            goto OUT;
+        }
+    }
+
+    {
+        VideoStreamInfo videoInfo;
+        memset(&videoInfo, 0, sizeof(videoInfo));
+        videoInfo.eCodecFormat = VIDEO_CODEC_FORMAT_MJPEG;
+        videoInfo.nWidth = src_width;
+        videoInfo.nHeight = src_height;
+
+        VConfig vconfig;
+        memset(&vconfig, 0, sizeof(vconfig));
+        vconfig.eOutputPixelFormat = PIXEL_FORMAT_NV21;
+        vconfig.eSecOutputPixelFormat = PIXEL_FORMAT_NV21;
+        vconfig.bSupportPallocBufBeforeDecode = 1;
+        vconfig.nDeInterlaceHoldingFrameBufferNum = 1;
+        vconfig.nDisplayHoldingFrameBufferNum = 1;
+        vconfig.nRotateHoldingFrameBufferNum = 0;
+        vconfig.nDecodeSmoothFrameBufferNum = 1;
+        vconfig.nSupportMaxWidth = src_width;
+        vconfig.nSupportMaxHeight = src_height;
+
+        int ret = InitializeVideoDecoder(vdec, &videoInfo, &vconfig);
+        if (ret != 0)
+        {
+            fprintf(stderr, "InitializeVideoDecoder failed %d\n", ret);
+            ret_val = -1;
+            goto OUT;
+        }
+    }
+
+    {
+        int ret = RequestVideoStreamBuffer(vdec, jpgsize, &pBuf, &bufSize, &pRingBuf, &ringBufSize, 0);
+        if (ret != 0)
+        {
+            fprintf(stderr, "RequestVideoStreamBuffer failed %d\n", ret);
+            ret_val = -1;
+            goto OUT;
+        }
+
+        if (bufSize + ringBufSize < jpgsize)
+        {
+            fprintf(stderr, "RequestVideoStreamBuffer too small %d + %d < %d\n", bufSize, ringBufSize, jpgsize);
+            ret_val = -1;
+            goto OUT;
+        }
+
+        // copy to vdec sbm
+        if (bufSize >= jpgsize)
+        {
+            memcpy(pBuf, jpgdata, jpgsize);
+        }
+        else
+        {
+            memcpy(pBuf, jpgdata, bufSize);
+            memcpy(pRingBuf, jpgdata + bufSize, jpgsize - bufSize);
+        }
+    }
+
+    {
+        VideoStreamDataInfo dataInfo;
+        dataInfo.pData = pBuf;
+        dataInfo.nLength = jpgsize;
+        dataInfo.nPts = 0;
+        dataInfo.nPcr = 0;
+        dataInfo.bIsFirstPart = 1;
+        dataInfo.bIsLastPart = 1;
+        dataInfo.nID = 0;
+        dataInfo.nStreamIndex = 0;
+        dataInfo.bValid = 0;
+        dataInfo.bVideoInfoFlag = 0;
+        dataInfo.pVideoInfo = 0;
+
+        int ret = SubmitVideoStreamData(vdec, &dataInfo, 0);
+        if (ret != 0)
+        {
+            fprintf(stderr, "SubmitVideoStreamData failed %d\n", ret);
+            ret_val = -1;
+            goto OUT;
+        }
+    }
+
+    {
+        int endofstream = 1;
+        int ret = DecodeVideoStream(vdec, endofstream, 0, 0, 0);
+        if (ret != VDECODE_RESULT_KEYFRAME_DECODED)
+        {
+            fprintf(stderr, "DecodeVideoStream failed %d\n", ret);
+            ret_val = -1;
+            goto OUT;
+        }
+    }
+
+    {
+        vpic = RequestPicture(vdec, 0);
+        if (!vpic)
+        {
+            fprintf(stderr, "RequestPicture failed\n");
+            ret_val = -1;
+            goto OUT;
+        }
+
+        // fprintf(stderr, "nID = %d\n", vpic->nID);
+        // fprintf(stderr, "nStreamIndex = %d\n", vpic->nStreamIndex);
+        // fprintf(stderr, "ePixelFormat = %d\n", vpic->ePixelFormat);
+        // fprintf(stderr, "nWidth = %d\n", vpic->nWidth);
+        // fprintf(stderr, "nHeight = %d\n", vpic->nHeight);
+        // fprintf(stderr, "nLineStride = %d\n", vpic->nLineStride);
+        // fprintf(stderr, "nTopOffset = %d\n", vpic->nTopOffset);
+        // fprintf(stderr, "nLeftOffset = %d\n", vpic->nLeftOffset);
+        // fprintf(stderr, "nBottomOffset = %d\n", vpic->nBottomOffset);
+        // fprintf(stderr, "nRightOffset = %d\n", vpic->nRightOffset);
+        // fprintf(stderr, "nFrameRate = %d\n", vpic->nFrameRate);
+        // fprintf(stderr, "nAspectRatio = %d\n", vpic->nAspectRatio);
+        // fprintf(stderr, "bIsProgressive = %d\n", vpic->bIsProgressive);
+        // fprintf(stderr, "bTopFieldFirst = %d\n", vpic->bTopFieldFirst);
+        // fprintf(stderr, "bRepeatTopField = %d\n", vpic->bRepeatTopField);
+        // fprintf(stderr, "nPts = %d\n", vpic->nPts);
+        // fprintf(stderr, "nPcr = %d\n", vpic->nPcr);
+
+        if (vpic->ePixelFormat != PIXEL_FORMAT_NV21)
+        {
+            fprintf(stderr, "unsupported ePixelFormat %d\n", vpic->ePixelFormat);
+            ret_val = -1;
+            goto OUT;
+        }
+
+        {
+            const unsigned char* yptr = (const unsigned char*)vpic->pData0;
+            const unsigned char* vuptr = (const unsigned char*)vpic->pData1;
+
+            if (orientation == 0 || orientation == 1)
+            {
+                // no rotate
+                yuv420sp2bgr_neon(yptr, vuptr, width, height, vpic->nLineStride, outbgr);
+            }
+            else
+            {
+                // rotate
+                std::vector<unsigned char> yuv_rotated;
+                yuv_rotated.resize(width * height / 2 * 3);
+
+                unsigned char* dstY = (unsigned char*)yuv_rotated.data();
+                unsigned char* dstUV = (unsigned char*)yuv_rotated.data() + width * height;
+
+                kanna_rotate_c1(yptr, src_width, src_height, vpic->nLineStride, dstY, width, height, width, orientation);
+                kanna_rotate_c2(vuptr, src_width / 2, src_height / 2, vpic->nLineStride, dstUV, width / 2, height / 2, width, orientation);
+
+                yuv420sp2bgr_neon(dstY, dstUV, width, height, width, outbgr);
+            }
+        }
+
+    }
+
+OUT:
+
+    if (vpic)
+    {
+        ReturnPicture(vdec, vpic);
+    }
+
+    if (vdec)
+    {
+        DestroyVideoDecoder(vdec);
+    }
+
+    return ret_val;
+}
+
+int jpeg_decoder_aw_impl::deinit()
+{
+    corrupted = 1;
+    width = 0;
+    height = 0;
+    ch = 0;
+    components = 0;
+    sampling_factor = -1;
+    progressive = 0;
+    orientation = -1;
+
+    return 0;
+}
+
+bool jpeg_decoder_aw::supported(const unsigned char* jpgdata, int jpgsize)
+{
+    if (!jpgdata || jpgsize < 4)
+        return false;
+
+    // jpg magic
+    if (jpgdata[0] != 0xFF || jpgdata[1] != 0xD8)
+        return false;
+
+    if (!videoengine.ready)
+        return false;
+
+    if (!vdecoder.ready)
+        return false;
+
+    return true;
+}
+
+jpeg_decoder_aw::jpeg_decoder_aw() : d(new jpeg_decoder_aw_impl)
+{
+}
+
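+// A minimal usage sketch of this wrapper, mirroring the imread()/imdecode()
+// call sites in highgui.cpp. jpgdata/jpgsize stand for the raw JPEG buffer
+// and its size; the buffer sizing assumes the 1- or 3-channel paths that
+// init() reports.
+//
+//     int w = 0, h = 0, c = 3;
+//     jpeg_decoder_aw d;
+//     if (jpeg_decoder_aw::supported(jpgdata, jpgsize)
+//             && d.init(jpgdata, jpgsize, &w, &h, &c) == 0
+//             && (c == 1 || c == 3))
+//     {
+//         std::vector<unsigned char> bgr(w * h * c);
+//         if (d.decode(jpgdata, jpgsize, bgr.data()) == 0)
+//         {
+//             // bgr now holds h rows of w pixels (gray or BGR),
+//             // already rotated upright per the exif orientation
+//         }
+//         d.deinit();
+//     }
+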
+jpeg_decoder_aw::~jpeg_decoder_aw()
+{
+    delete d;
+}
+
+int jpeg_decoder_aw::init(const unsigned char* jpgdata, int jpgsize, int* width, int* height, int* ch)
+{
+    return d->init(jpgdata, jpgsize, width, height, ch);
+}
+
+int jpeg_decoder_aw::decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const
+{
+    return d->decode(jpgdata, jpgsize, outbgr);
+}
+
+int jpeg_decoder_aw::deinit()
+{
+    return d->deinit();
+}
+
+#else // defined __linux__
+
+bool jpeg_decoder_aw::supported(const unsigned char* /*jpgdata*/, int /*jpgsize*/)
+{
+    return false;
+}
+
+jpeg_decoder_aw::jpeg_decoder_aw() : d(0)
+{
+}
+
+jpeg_decoder_aw::~jpeg_decoder_aw()
+{
+}
+
+int jpeg_decoder_aw::init(const unsigned char* /*jpgdata*/, int /*jpgsize*/, int* /*width*/, int* /*height*/, int* /*ch*/)
+{
+    return -1;
+}
+
+int jpeg_decoder_aw::decode(const unsigned char* /*jpgdata*/, int /*jpgsize*/, unsigned char* /*outbgr*/) const
+{
+    return -1;
+}
+
+int jpeg_decoder_aw::deinit()
+{
+    return -1;
+}
+
+#endif // defined __linux__
diff --git a/highgui/src/jpeg_decoder_aw.h b/highgui/src/jpeg_decoder_aw.h
new file mode 100644
index 00000000..f3a6d2a0
--- /dev/null
+++ b/highgui/src/jpeg_decoder_aw.h
@@ -0,0 +1,39 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef JPEG_DECODER_AW_H
+#define JPEG_DECODER_AW_H
+
+class jpeg_decoder_aw_impl;
+class jpeg_decoder_aw
+{
+public:
+    static bool supported(const unsigned char* jpgdata, int jpgsize);
+
+    jpeg_decoder_aw();
+    ~jpeg_decoder_aw();
+
+    int init(const unsigned char* jpgdata, int jpgsize, int* width, int* height, int* ch);
+
+    int decode(const unsigned char* jpgdata, int jpgsize, unsigned char* outbgr) const;
+
+    int deinit();
+
+private:
+    jpeg_decoder_aw_impl* const d;
+};
+
+#endif // JPEG_DECODER_AW_H
diff --git a/highgui/src/kanna_rotate.cpp b/highgui/src/kanna_rotate.cpp
new file mode 100644
index 00000000..dda97f5e
--- /dev/null
+++ b/highgui/src/kanna_rotate.cpp
@@ -0,0 +1,6112 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "kanna_rotate.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+// should be a kanna ascii art here in my local branch
+// but we shall ask the original art author for permission first ...
+// https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/ + +static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw; + const int wgap = stride - w; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dst; + unsigned char* dst1 = dst + stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = srcw >> 5; + int remain = srcw - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 += wgap + stride; + dst1 += wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = srcw >> 5; + int remain = srcw - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + const int wgap = stride - w * 2; + + int size = srcw * 2; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dst; + unsigned char* dst1 = dst + stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! 
\n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 += wgap + stride; + dst1 += wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + const int wgap = stride - w * 3; + + int size = srcw * 3; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dst; + unsigned char* dst1 = dst + stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 += wgap + stride; + dst1 += wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + const int wgap = stride - w * 4; + + int size = srcw * 4; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dst; + unsigned char* dst1 = dst + stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 += wgap + stride; + dst1 += wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw; + const int wgap = stride + w; + + const unsigned char* src0 = src; + unsigned char* dst0 = dst + w - 1; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 15; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src = vld1_u8(src0); + uint8x8_t _src2 = vld1_u8(src0 + 8); + + _src = vrev64_u8(_src); + _src2 = vrev64_u8(_src2); + + vst1_u8(dst0, _src2); + vst1_u8(dst0 + 8, _src); + + src0 += 16; + dst0 -= 16; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-16 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.u8 {d0-d1}, [%1]! \n" + "vrev64.u8 d3, d0 \n" + "vrev64.u8 d2, d1 \n" + "subs %0, #1 \n" + "vst1.u8 {d2-d3}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "r4"); + } +#endif // __aarch64__ + + dst0 += 15; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0 = *src0; + + src0 += 1; + dst0 -= 1; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + const int wgap = stride + w * 2; + + const unsigned char* src0 = src; + unsigned char* dst0 = dst + w * 2 - 2; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 2; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src = vld2_u8(src0); + uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + + vst2_u8(dst0, _src); + vst2_u8(dst0 - 8 * 2, _src2); + + src0 += 16 * 2; + dst0 -= 16 * 2; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-16 \n" + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "pld [%1, #128] \n" + "vld2.u8 {d2-d3}, [%1]! 
\n" + "vrev64.u8 d1, d1 \n" + "vrev64.u8 d2, d2 \n" + "vst2.u8 {d0-d1}, [%2], r4 \n" + "vrev64.u8 d3, d3 \n" + "subs %0, #1 \n" + "vst2.u8 {d2-d3}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 2; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 -= 2; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + const int wgap = stride + w * 3; + + const unsigned char* src0 = src; + unsigned char* dst0 = dst + w * 3 - 3; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 3; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src = vld3_u8(src0); + uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + _src.val[2] = vrev64_u8(_src.val[2]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + _src2.val[2] = vrev64_u8(_src2.val[2]); + + vst3_u8(dst0, _src); + vst3_u8(dst0 - 8 * 3, _src2); + + src0 += 16 * 3; + dst0 -= 16 * 3; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-24 \n" + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "vrev64.u8 d1, d1 \n" + "pld [%1, #192] \n" + "vld3.u8 {d4-d6}, [%1]! \n" + "vrev64.u8 d2, d2 \n" + "vrev64.u8 d4, d4 \n" + "vst3.u8 {d0-d2}, [%2], r4 \n" + "vrev64.u8 d5, d5 \n" + "vrev64.u8 d6, d6 \n" + "subs %0, #1 \n" + "vst3.u8 {d4-d6}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 3; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 -= 3; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + const int wgap = stride + w * 4; + + const unsigned char* src0 = src; + unsigned char* dst0 = dst + w * 4 - 4; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 4; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src = vld4_u8(src0); + uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + _src.val[2] = vrev64_u8(_src.val[2]); + _src.val[3] = vrev64_u8(_src.val[3]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + _src2.val[2] = vrev64_u8(_src2.val[2]); + _src2.val[3] = vrev64_u8(_src2.val[3]); + + vst4_u8(dst0, _src); + vst4_u8(dst0 - 8 * 4, _src2); + + src0 += 16 * 4; + dst0 -= 16 * 4; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-32 \n" + "0: \n" + "pld [%1, #256] \n" + "vld4.u8 {d0-d3}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "vrev64.u8 d1, d1 \n" + "vrev64.u8 d2, d2 \n" + "pld [%1, #256] \n" + "vld4.u8 {d4-d7}, [%1]! 
\n" + "vrev64.u8 d3, d3 \n" + "vrev64.u8 d4, d4 \n" + "vrev64.u8 d5, d5 \n" + "vst4.u8 {d0-d3}, [%2], r4 \n" + "vrev64.u8 d6, d6 \n" + "vrev64.u8 d7, d7 \n" + "subs %0, #1 \n" + "vst4.u8 {d4-d7}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 4; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 -= 4; + } + + src0 += srcwgap; + dst0 += wgap; + } +} + +static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw; + const int wgap = stride - w; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * h - wgap; + + const unsigned char* src0 = src; + unsigned char* dst0 = dstend - 1; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 15; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src = vld1_u8(src0); + uint8x8_t _src2 = vld1_u8(src0 + 8); + + _src = vrev64_u8(_src); + _src2 = vrev64_u8(_src2); + + vst1_u8(dst0, _src2); + vst1_u8(dst0 + 8, _src); + + src0 += 16; + dst0 -= 16; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-16 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.u8 {d0-d1}, [%1]! \n" + "vrev64.u8 d3, d0 \n" + "vrev64.u8 d2, d1 \n" + "subs %0, #1 \n" + "vst1.u8 {d2-d3}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "r4"); + } +#endif // __aarch64__ + + dst0 += 15; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0 = *src0; + + src0 += 1; + dst0 -= 1; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + const int wgap = stride - w * 2; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * h - wgap; + + const unsigned char* src0 = src; + unsigned char* dst0 = dstend - 2; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 2; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src = vld2_u8(src0); + uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + + vst2_u8(dst0, _src); + vst2_u8(dst0 - 8 * 2, _src2); + + src0 += 16 * 2; + dst0 -= 16 * 2; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-16 \n" + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "pld [%1, #128] \n" + "vld2.u8 {d2-d3}, [%1]! 
\n" + "vrev64.u8 d1, d1 \n" + "vrev64.u8 d2, d2 \n" + "vst2.u8 {d0-d1}, [%2], r4 \n" + "vrev64.u8 d3, d3 \n" + "subs %0, #1 \n" + "vst2.u8 {d2-d3}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 2; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 -= 2; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + const int wgap = stride - w * 3; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * h - wgap; + + const unsigned char* src0 = src; + unsigned char* dst0 = dstend - 3; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 3; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src = vld3_u8(src0); + uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + _src.val[2] = vrev64_u8(_src.val[2]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + _src2.val[2] = vrev64_u8(_src2.val[2]); + + vst3_u8(dst0, _src); + vst3_u8(dst0 - 8 * 3, _src2); + + src0 += 16 * 3; + dst0 -= 16 * 3; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-24 \n" + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "vrev64.u8 d1, d1 \n" + "pld [%1, #192] \n" + "vld3.u8 {d4-d6}, [%1]! \n" + "vrev64.u8 d2, d2 \n" + "vrev64.u8 d4, d4 \n" + "vst3.u8 {d0-d2}, [%2], r4 \n" + "vrev64.u8 d5, d5 \n" + "vrev64.u8 d6, d6 \n" + "subs %0, #1 \n" + "vst3.u8 {d4-d6}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 3; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 -= 3; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + const int wgap = stride - w * 4; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * h - wgap; + + const unsigned char* src0 = src; + unsigned char* dst0 = dstend - 4; + + int y = 0; + for (; y < srch; y++) + { +#if __ARM_NEON + dst0 -= 7 * 4; + + int nn = srcw >> 4; + int remain = srcw - (nn << 4); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src = vld4_u8(src0); + uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4); + + _src.val[0] = vrev64_u8(_src.val[0]); + _src.val[1] = vrev64_u8(_src.val[1]); + _src.val[2] = vrev64_u8(_src.val[2]); + _src.val[3] = vrev64_u8(_src.val[3]); + + _src2.val[0] = vrev64_u8(_src2.val[0]); + _src2.val[1] = vrev64_u8(_src2.val[1]); + _src2.val[2] = vrev64_u8(_src2.val[2]); + _src2.val[3] = vrev64_u8(_src2.val[3]); + + vst4_u8(dst0, _src); + vst4_u8(dst0 - 8 * 4, _src2); + + src0 += 16 * 4; + dst0 -= 16 * 4; + } +#else + if (nn > 0) + { + asm volatile( + "mov r4, #-32 \n" + "0: \n" + "pld [%1, #256] \n" 
+ "vld4.u8 {d0-d3}, [%1]! \n" + "vrev64.u8 d0, d0 \n" + "vrev64.u8 d1, d1 \n" + "vrev64.u8 d2, d2 \n" + "pld [%1, #256] \n" + "vld4.u8 {d4-d7}, [%1]! \n" + "vrev64.u8 d3, d3 \n" + "vrev64.u8 d4, d4 \n" + "vrev64.u8 d5, d5 \n" + "vst4.u8 {d0-d3}, [%2], r4 \n" + "vrev64.u8 d6, d6 \n" + "vrev64.u8 d7, d7 \n" + "subs %0, #1 \n" + "vst4.u8 {d4-d7}, [%2], r4 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1", "q2", "q3", "r4"); + } +#endif // __aarch64__ + + dst0 += 7 * 4; +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 -= 4; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw; + const int wgap = stride + w; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dstend; + unsigned char* dst1 = dstend - stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = srcw >> 5; + int remain = srcw - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 -= wgap + stride; + dst1 -= wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = srcw >> 5; + int remain = srcw - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = srcw; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + const int wgap = stride + w * 2; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + int size = srcw * 2; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dstend; + unsigned char* dst1 = dstend - stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 -= wgap + stride; + dst1 -= wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + const int wgap = stride + w * 3; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + int size = srcw * 3; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dstend; + unsigned char* dst1 = dstend - stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 -= wgap + stride; + dst1 -= wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + const int wgap = stride + w * 4; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + int size = srcw * 4; + + const unsigned char* src0 = src; + const unsigned char* src1 = src + srcstride; + unsigned char* dst0 = dstend; + unsigned char* dst1 = dstend - stride; + + int y = 0; + for (; y + 1 < srch; y += 2) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src0 = vld1q_u8(src0); + uint8x16_t _src0n = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src0); + vst1q_u8(dst0 + 16, _src0n); + + uint8x16_t _src1 = vld1q_u8(src1); + uint8x16_t _src1n = vld1q_u8(src1 + 16); + vst1q_u8(dst1, _src1); + vst1q_u8(dst1 + 16, _src1n); + + src0 += 32; + src1 += 32; + dst0 += 32; + dst1 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "pld [%2, #256] \n" + "vld1.u8 {d4-d7}, [%2]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%3]! \n" + "vst1.u8 {d4-d7}, [%4]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1) + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + *dst1++ = *src1++; + } + + src0 += srcwgap + srcstride; + src1 += srcwgap + srcstride; + dst0 -= wgap + stride; + dst1 -= wgap + stride; + } + + for (; y < srch; y++) + { +#if __ARM_NEON + int nn = size >> 5; + int remain = size - (nn << 5); +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x16_t _src = vld1q_u8(src0); + uint8x16_t _src2 = vld1q_u8(src0 + 16); + vst1q_u8(dst0, _src); + vst1q_u8(dst0 + 16, _src2); + + src0 += 32; + dst0 += 32; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld1.u8 {d0-d3}, [%1]! \n" + "subs %0, #1 \n" + "vst1.u8 {d0-d3}, [%2]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(dst0) // %2 + : "0"(nn), + "1"(src0), + "2"(dst0) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *dst0++ = *src0++; + } + + src0 += srcwgap; + dst0 -= wgap; + } +} + +static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dst + y; + unsigned char* dst1 = dst + y + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src0 = vld1_u8(src0); + uint8x8_t _src1 = vld1_u8(src1); + + uint8x8_t _src2 = vld1_u8(src0 + src_step); + uint8x8_t _src3 = vld1_u8(src1 + src_step); + + uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); + uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); + + uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); + uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1); + uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3); + uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5); + uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]); + uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]); + uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]); + uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]); + uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]); + uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]); + uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]); + uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + vst1_u8(dst0, _dst0); + vst1_u8(dst1, _dst1); + vst1_u8(dst0 + dst_step, _dst2); + vst1_u8(dst1 + dst_step, _dst3); + vst1_u8(dst0 + 2 * dst_step, _dst4); + vst1_u8(dst1 + 2 * dst_step, _dst5); + vst1_u8(dst0 + 3 * dst_step, _dst6); + vst1_u8(dst1 + 3 * dst_step, _dst7); + + src0 += 8; + src1 += 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #64] \n" + "vld1.u8 {d0}, [%1], %10 \n" + + "pld [%2, #64] \n" + "vld1.u8 {d1}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d2}, [%1], %10 \n" + + "vtrn.u8 d0, d1 \n" // _src01t_r + + "pld [%2, #64] \n" + 
"vld1.u8 {d3}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d4}, [%1], %10 \n" + + "vtrn.u8 d2, d3 \n" // _src23t_r + + "pld [%2, #64] \n" + "vld1.u8 {d5}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d6}, [%1], %10 \n" + + "vtrn.u8 d4, d5 \n" // _src45t_r + + "pld [%2, #64] \n" + "vld1.u8 {d7}, [%2], %10 \n" + + "vtrn.u8 d6, d7 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q2, q3 \n" // _src13tt_r _src46tt_r + + "add %1, #8 \n" // src0 += 8 + + "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r + + "add %2, #8 \n" // src1 += 8 + + "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r + "vst1.u8 {d0}, [%3], %11 \n" + "vst1.u8 {d1}, [%4], %11 \n" + + "subs %0, #1 \n" + + "vst1.u8 {d2}, [%3], %11 \n" + "vst1.u8 {d3}, [%4], %11 \n" + "vst1.u8 {d4}, [%3], %11 \n" + "vst1.u8 {d5}, [%4], %11 \n" + "vst1.u8 {d6}, [%3], %11 \n" + "vst1.u8 {d7}, [%4], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src1[0]; + dst0[2] = src0[0 + src_step]; + dst0[3] = src1[0 + src_step]; + dst0[4] = src0[0 + 2 * src_step]; + dst0[5] = src1[0 + 2 * src_step]; + dst0[6] = src0[0 + 3 * src_step]; + dst0[7] = src1[0 + 3 * src_step]; + + src0 += 1; + src1 += 1; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dst + y; + + int x = 0; + for (; x < srcw; x++) + { + *dst0 = *src0; + + src0 += 1; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dst + y * 2; + unsigned char* dst1 = dst + y * 2 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src0 = vld2_u8(src0); + uint8x8x2_t _src1 = vld2_u8(src1); + + uint8x8x2_t _src2 = vld2_u8(src0 + src_step); + uint8x8x2_t _src3 = vld2_u8(src1 + src_step); + + uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); + uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); + + uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); + uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = 
vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint8x8x2_t _dst0; + uint8x8x2_t _dst1; + uint8x8x2_t _dst2; + uint8x8x2_t _dst3; + uint8x8x2_t _dst4; + uint8x8x2_t _dst5; + uint8x8x2_t _dst6; + uint8x8x2_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + vst2_u8(dst0, _dst0); + vst2_u8(dst1, _dst1); + vst2_u8(dst0 + dst_step, _dst2); + vst2_u8(dst1 + dst_step, _dst3); + vst2_u8(dst0 + 2 * dst_step, _dst4); + vst2_u8(dst1 + 2 * dst_step, _dst5); + vst2_u8(dst0 + 3 * dst_step, _dst6); + vst2_u8(dst1 + 3 * dst_step, _dst7); + + src0 += 2 * 8; + src1 += 2 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1], %10 \n" + + "pld [%2, #128] \n" + "vld2.u8 {d2-d3}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d4-d5}, [%1], %10 \n" + + "vtrn.u8 q0, q1 \n" // _src01t_r + + "pld [%2, #128] \n" + "vld2.u8 {d6-d7}, [%2], %10 \n" + + "pld 
[%1, #128] \n" + "vld2.u8 {d16-d17}, [%1], %10\n" + + "vtrn.u8 q2, q3 \n" // _src23t_r + + "pld [%2, #128] \n" + "vld2.u8 {d18-d19}, [%2], %10\n" + + "pld [%1, #128] \n" + "vld2.u8 {d20-d21}, [%1], %10\n" + + "vtrn.u8 q8, q9 \n" // _src45t_r + + "pld [%2, #128] \n" + "vld2.u8 {d22-d23}, [%2], %10\n" + + "vtrn.u8 q10, q11 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q2 \n" // _src02tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q1, q3 \n" // _src13tt_r + + "add %1, #16 \n" // src0 += 16 + + "vtrn.u16 q8, q10 \n" // _src46tt_r + + "add %2, #16 \n" // src1 += 16 + + "vtrn.u16 q9, q11 \n" // _src57tt_r + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + + "vtrn.u32 q1, q9 \n" // _src15ttt_r + "vst2.u8 {d0-d1}, [%3], %11 \n" + + "vtrn.u32 q2, q10 \n" // _src26ttt_r + "vst2.u8 {d2-d3}, [%4], %11 \n" + + "vtrn.u32 q3, q11 \n" // _src37ttt_r + "vst2.u8 {d4-d5}, [%3], %11 \n" + + "subs %0, #1 \n" + + "vst2.u8 {d6-d7}, [%4], %11 \n" + "vst2.u8 {d16-d17}, [%3], %11\n" + "vst2.u8 {d18-d19}, [%4], %11\n" + "vst2.u8 {d20-d21}, [%3], %11\n" + "vst2.u8 {d22-d23}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src1[0]; + dst0[3] = src1[1]; + dst0[4] = src0[0 + src_step]; + dst0[5] = src0[1 + src_step]; + dst0[6] = src1[0 + src_step]; + dst0[7] = src1[1 + src_step]; + dst0[8] = src0[0 + 2 * src_step]; + dst0[9] = src0[1 + 2 * src_step]; + dst0[10] = src1[0 + 2 * src_step]; + dst0[11] = src1[1 + 2 * src_step]; + dst0[12] = src0[0 + 3 * src_step]; + dst0[13] = src0[1 + 3 * src_step]; + dst0[14] = src1[0 + 3 * src_step]; + dst0[15] = src1[1 + 3 * src_step]; + + src0 += 2; + src1 += 2; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dst + y * 2; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dst + y * 3; + unsigned char* dst1 = dst + y * 3 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src0 = vld3_u8(src0); + uint8x8x3_t _src1 = vld3_u8(src1); + + uint8x8x3_t _src2 = vld3_u8(src0 + src_step); + uint8x8x3_t _src3 = vld3_u8(src1 + src_step); + + uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); + uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); + + uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); + uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], 
_src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); + + uint8x8x3_t _dst0; + uint8x8x3_t _dst1; + uint8x8x3_t _dst2; + uint8x8x3_t _dst3; + uint8x8x3_t _dst4; + uint8x8x3_t _dst5; + uint8x8x3_t _dst6; + uint8x8x3_t _dst7; + + 
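// regroup the transposed r/g/b planes into uint8x8x3_t lanes so each
+            // vst3_u8 below writes one fully interleaved 8-pixel output row
+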
_dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + + vst3_u8(dst0, _dst0); + vst3_u8(dst1, _dst1); + vst3_u8(dst0 + dst_step, _dst2); + vst3_u8(dst1 + dst_step, _dst3); + vst3_u8(dst0 + 2 * dst_step, _dst4); + vst3_u8(dst1 + 2 * dst_step, _dst5); + vst3_u8(dst0 + 3 * dst_step, _dst6); + vst3_u8(dst1 + 3 * dst_step, _dst7); + + src0 += 3 * 8; + src1 += 3 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1], %10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d4-d6}, [%2], %10 \n" + + "pld [%1, #192] \n" + "vld3.u8 {d8-d10}, [%1], %10 \n" + + "vtrn.u8 q0, q2 \n" // _src01t_r + "vtrn.u8 d2, d6 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d12-d14}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d16-d18}, [%1], %10\n" + + "vtrn.u8 q4, q6 \n" // _src23t_r + "vtrn.u8 d10, d14 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d20-d22}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d24-d26}, [%1], %10\n" + + "vtrn.u8 q8, q10 \n" // _src45t_r + "vtrn.u8 d18, d22 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d28-d30}, [%2], %10\n" + + "vtrn.u8 q12, q14 \n" // _src67t_r + "vtrn.u8 d26, d30 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q4 \n" // _src02tt_r + "vtrn.u16 d2, d10 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q2, q6 \n" // _src13tt_r + "vtrn.u16 d6, d14 \n" + + "add %1, #24 \n" // src0 += 24 + + "vtrn.u16 q8, q12 \n" // _src46tt_r + "vtrn.u16 d18, d26 \n" + + "add %2, #24 \n" // src1 += 24 + + "vtrn.u16 q10, q14 \n" // _src57tt_r + "vtrn.u16 d22, d30 \n" + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + "vtrn.u32 d2, d18 \n" + + "vtrn.u32 q2, q10 \n" // _src15ttt_r + "vst3.u8 {d0-d2}, [%3], %11 \n" + "vtrn.u32 d6, d22 \n" + + "vtrn.u32 q4, q12 \n" // _src26ttt_r + "vst3.u8 {d4-d6}, [%4], %11 \n" + "vtrn.u32 d10, d26 \n" + + "vtrn.u32 q6, q14 \n" // _src37ttt_r + "vst3.u8 {d8-d10}, [%3], %11 \n" + "vtrn.u32 d14, d30 \n" + + "subs %0, #1 \n" + + "vst3.u8 {d16-d18}, [%3], %11\n" + "vst3.u8 {d12-d14}, [%4], %11\n" + "vst3.u8 {d20-d22}, [%4], %11\n" + "vst3.u8 {d24-d26}, [%3], %11\n" + "vst3.u8 {d28-d30}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + 
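// the "0"-"4" input constraints below tie each input to the matching
+            // output register; %10 and %11 pass src_step and dst_step
+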
"=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src1[0]; + dst0[4] = src1[1]; + dst0[5] = src1[2]; + dst0[6] = src0[0 + src_step]; + dst0[7] = src0[1 + src_step]; + dst0[8] = src0[2 + src_step]; + dst0[9] = src1[0 + src_step]; + dst0[10] = src1[1 + src_step]; + dst0[11] = src1[2 + src_step]; + dst0[12] = src0[0 + 2 * src_step]; + dst0[13] = src0[1 + 2 * src_step]; + dst0[14] = src0[2 + 2 * src_step]; + dst0[15] = src1[0 + 2 * src_step]; + dst0[16] = src1[1 + 2 * src_step]; + dst0[17] = src1[2 + 2 * src_step]; + dst0[18] = src0[0 + 3 * src_step]; + dst0[19] = src0[1 + 3 * src_step]; + dst0[20] = src0[2 + 3 * src_step]; + dst0[21] = src1[0 + 3 * src_step]; + dst0[22] = src1[1 + 3 * src_step]; + dst0[23] = src1[2 + 3 * src_step]; + + src0 += 3; + src1 += 3; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dst + y * 3; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dst + y * 4; + unsigned char* dst1 = dst + y * 4 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src0 = vld4_u8(src0); + uint8x8x4_t _src1 = vld4_u8(src1); + + uint8x8x4_t _src2 = vld4_u8(src0 + src_step); + uint8x8x4_t _src3 = vld4_u8(src1 + src_step); + + uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); + uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); + + uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); + uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); + + uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); + uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); + uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); + uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); + + uint16x4x2_t _src02tt_r = 
vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); + + uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); + uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); + uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); + uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); + + uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); + uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); + uint32x2x2_t 
_src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); + uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); + + uint8x8x4_t _dst0; + uint8x8x4_t _dst1; + uint8x8x4_t _dst2; + uint8x8x4_t _dst3; + uint8x8x4_t _dst4; + uint8x8x4_t _dst5; + uint8x8x4_t _dst6; + uint8x8x4_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + + _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); + _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); + _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); + _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); + _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); + _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); + _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); + _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); + + vst4_u8(dst0, _dst0); + vst4_u8(dst1, _dst1); + vst4_u8(dst0 + dst_step, _dst2); + vst4_u8(dst1 + dst_step, _dst3); + vst4_u8(dst0 + 2 * dst_step, _dst4); + vst4_u8(dst1 + 2 * dst_step, _dst5); + vst4_u8(dst0 + 3 * dst_step, _dst6); + vst4_u8(dst1 + 3 * dst_step, _dst7); + + src0 += 4 * 8; + src1 += 4 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld4.u8 {d0-d3}, [%1], %10 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d4-d7}, [%2], %10 \n" + + "pld [%1, #256] \n" + "vld4.u8 {d8-d11}, [%1], %10 \n" + + "vtrn.u8 q0, q2 \n" // _src01t_r + "vtrn.u8 q1, q3 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d12-d15}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d16-d19}, [%1], %10\n" + + "vtrn.u8 q4, q6 \n" // _src23t_r + "vtrn.u8 q5, q7 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d20-d23}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d24-d27}, [%1], %10\n" + + "vtrn.u8 q8, q10 \n" // _src45t_r + "vtrn.u8 q9, q11 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d28-d31}, [%2], %10\n" + + "vtrn.u8 q12, q14 \n" // _src67t_r + "vtrn.u8 q13, q15 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q4 \n" // _src02tt_r + "vtrn.u16 q1, q5 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + 
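// the two "sub ..., %10, lsl #2" rewinds undo the four post-indexed
+                // loads per source pointer; "add %1/%2, #32" below then steps
+                // to the next block of 8 four-channel pixels
+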
"vtrn.u16 q2, q6 \n" // _src13tt_r + "vtrn.u16 q3, q7 \n" + + "add %1, #32 \n" // src0 += 32 + + "vtrn.u16 q8, q12 \n" // _src46tt_r + "vtrn.u16 q9, q13 \n" + + "add %2, #32 \n" // src1 += 32 + + "vtrn.u16 q10, q14 \n" // _src57tt_r + "vtrn.u16 q11, q15 \n" + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + "vtrn.u32 q1, q9 \n" + + "vtrn.u32 q2, q10 \n" // _src15ttt_r + "vst4.u8 {d0-d3}, [%3], %11 \n" + "vtrn.u32 q3, q11 \n" + + "vtrn.u32 q4, q12 \n" // _src26ttt_r + "vst4.u8 {d4-d7}, [%4], %11 \n" + "vtrn.u32 q5, q13 \n" + + "vtrn.u32 q6, q14 \n" // _src37ttt_r + "vst4.u8 {d8-d11}, [%3], %11 \n" + "vtrn.u32 q7, q15 \n" + + "subs %0, #1 \n" + + "vst4.u8 {d16-d19}, [%3], %11\n" + "vst4.u8 {d12-d15}, [%4], %11\n" + "vst4.u8 {d20-d23}, [%4], %11\n" + "vst4.u8 {d24-d27}, [%3], %11\n" + "vst4.u8 {d28-d31}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + dst0[4] = src1[0]; + dst0[5] = src1[1]; + dst0[6] = src1[2]; + dst0[7] = src1[3]; + dst0[8] = src0[0 + src_step]; + dst0[9] = src0[1 + src_step]; + dst0[10] = src0[2 + src_step]; + dst0[11] = src0[3 + src_step]; + dst0[12] = src1[0 + src_step]; + dst0[13] = src1[1 + src_step]; + dst0[14] = src1[2 + src_step]; + dst0[15] = src1[3 + src_step]; + dst0[16] = src0[0 + 2 * src_step]; + dst0[17] = src0[1 + 2 * src_step]; + dst0[18] = src0[2 + 2 * src_step]; + dst0[19] = src0[3 + 2 * src_step]; + dst0[20] = src1[0 + 2 * src_step]; + dst0[21] = src1[1 + 2 * src_step]; + dst0[22] = src1[2 + 2 * src_step]; + dst0[23] = src1[3 + 2 * src_step]; + dst0[24] = src0[0 + 3 * src_step]; + dst0[25] = src0[1 + 3 * src_step]; + dst0[26] = src0[2 + 3 * src_step]; + dst0[27] = src0[3 + 3 * src_step]; + dst0[28] = src1[0 + 3 * src_step]; + dst0[29] = src1[1 + 3 * src_step]; + dst0[30] = src1[2 + 3 * src_step]; + dst0[31] = src1[3 + 3 * src_step]; + + src0 += 4; + src1 += 4; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dst + y * 4; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw; + + // point to the last dst pixel in row + unsigned char* dstend = dst + w; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dstend - y - 8; + unsigned char* dst1 = dstend - y - 8 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src0 = vld1_u8(src0); + uint8x8_t _src1 = vld1_u8(src1); + + uint8x8_t _src2 = vld1_u8(src0 + src_step); + uint8x8_t _src3 = vld1_u8(src1 + src_step); + + uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); + uint8x8_t _src5 = vld1_u8(src1 + 2 * 
src_step); + + uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); + uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); + uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); + uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); + uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); + uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); + uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); + uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); + uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); + uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); + uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); + uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + vst1_u8(dst0, _dst7); + vst1_u8(dst1, _dst6); + vst1_u8(dst0 + dst_step, _dst5); + vst1_u8(dst1 + dst_step, _dst4); + vst1_u8(dst0 + 2 * dst_step, _dst3); + vst1_u8(dst1 + 2 * dst_step, _dst2); + vst1_u8(dst0 + 3 * dst_step, _dst1); + vst1_u8(dst1 + 3 * dst_step, _dst0); + + src0 += 8; + src1 += 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #64] \n" + "vld1.u8 {d0}, [%1], %10 \n" + + "pld [%2, #64] \n" + "vld1.u8 {d1}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d2}, [%1], %10 \n" + + "vtrn.u8 d1, d0 \n" // _src01t_r + + "pld [%2, #64] \n" + "vld1.u8 {d3}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d4}, [%1], %10 \n" + + "vtrn.u8 d3, d2 \n" // _src23t_r + + "pld [%2, #64] \n" + "vld1.u8 {d5}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d6}, [%1], %10 \n" + + "vtrn.u8 d5, d4 \n" // _src45t_r + + "pld [%2, #64] \n" + "vld1.u8 {d7}, [%2], %10 \n" + + "vtrn.u8 d7, d6 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r + + "add %1, #8 \n" // src0 += 8 + + "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r + + "add %2, #8 \n" // src1 += 8 + + "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r + "vst1.u8 {d6}, [%4], %11 \n" + "vst1.u8 {d7}, [%3], %11 \n" + + "subs %0, #1 \n" + + "vst1.u8 {d4}, [%4], %11 \n" + "vst1.u8 {d5}, [%3], %11 \n" + "vst1.u8 {d2}, [%4], %11 \n" + "vst1.u8 {d3}, [%3], %11 \n" + "vst1.u8 {d0}, [%4], %11 \n" + "vst1.u8 {d1}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + 
"4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src1[0 + 3 * src_step]; + dst0[1] = src0[0 + 3 * src_step]; + dst0[2] = src1[0 + 2 * src_step]; + dst0[3] = src0[0 + 2 * src_step]; + dst0[4] = src1[0 + src_step]; + dst0[5] = src0[0 + src_step]; + dst0[6] = src1[0]; + dst0[7] = src0[0]; + + src0 += 1; + src1 += 1; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y - 1; + + int x = 0; + for (; x < srcw; x++) + { + *dst0 = *src0; + + src0 += 1; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + + // point to the last dst pixel in row + unsigned char* dstend = dst + w * 2; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dstend - y * 2 - 8 * 2; + unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src0 = vld2_u8(src0); + uint8x8x2_t _src1 = vld2_u8(src1); + + uint8x8x2_t _src2 = vld2_u8(src0 + src_step); + uint8x8x2_t _src3 = vld2_u8(src1 + src_step); + + uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); + uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); + + uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); + uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = 
vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint8x8x2_t _dst0; + uint8x8x2_t _dst1; + uint8x8x2_t _dst2; + uint8x8x2_t _dst3; + uint8x8x2_t _dst4; + uint8x8x2_t _dst5; + uint8x8x2_t _dst6; + uint8x8x2_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + vst2_u8(dst0, _dst7); + vst2_u8(dst1, _dst6); + vst2_u8(dst0 + dst_step, _dst5); + vst2_u8(dst1 + dst_step, _dst4); + vst2_u8(dst0 + 2 * dst_step, _dst3); + vst2_u8(dst1 + 2 * dst_step, _dst2); + vst2_u8(dst0 + 3 * dst_step, _dst1); + vst2_u8(dst1 + 3 * dst_step, _dst0); + + src0 += 2 * 8; + src1 += 2 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1], %10 \n" + + "pld [%2, #128] \n" + "vld2.u8 {d2-d3}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d4-d5}, [%1], %10 \n" + + "vtrn.u8 q1, q0 \n" // _src01t_r + + "pld [%2, #128] \n" + "vld2.u8 {d6-d7}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d16-d17}, [%1], %10\n" + + "vtrn.u8 q3, q2 \n" // _src23t_r + + "pld [%2, #128] \n" + "vld2.u8 {d18-d19}, [%2], %10\n" + + "pld [%1, #128] \n" + "vld2.u8 {d20-d21}, [%1], %10\n" + + "vtrn.u8 q9, q8 \n" // _src45t_r + + "pld [%2, #128] \n" + "vld2.u8 {d22-d23}, [%2], %10\n" + + "vtrn.u8 q11, q10 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q2, q0 \n" // _src02tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q3, q1 \n" // _src13tt_r + + "add %1, #16 \n" // src0 += 16 + + "vtrn.u16 q10, q8 \n" // _src46tt_r + + "add %2, #16 \n" // src1 += 16 + + "vtrn.u16 q11, q9 \n" // _src57tt_r + + "vtrn.u32 q10, q2 \n" // _src26ttt_r + + "vtrn.u32 q11, q3 \n" // _src37ttt_r + "vst2.u8 {d20-d21}, [%4], %11\n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst2.u8 {d22-d23}, [%3], %11\n" + + "vtrn.u32 q9, q1 \n" // _src15ttt_r + "vst2.u8 {d16-d17}, [%4], %11\n" + + "subs %0, #1 \n" + + "vst2.u8 {d18-d19}, [%3], %11\n" + "vst2.u8 {d4-d5}, [%4], %11 \n" + "vst2.u8 {d6-d7}, [%3], %11 \n" + 
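// 90-degree path: the vtrn operands are swapped (q1,q0 ... q11,q10) and
+                // the tiles are stored last-to-first, reversing the row order that
+                // the plain transpose in kanna_rotate_5 produces
+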
"vst2.u8 {d0-d1}, [%4], %11 \n" + "vst2.u8 {d2-d3}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src1[0 + 3 * src_step]; + dst0[1] = src1[1 + 3 * src_step]; + dst0[2] = src0[0 + 3 * src_step]; + dst0[3] = src0[1 + 3 * src_step]; + dst0[4] = src1[0 + 2 * src_step]; + dst0[5] = src1[1 + 2 * src_step]; + dst0[6] = src0[0 + 2 * src_step]; + dst0[7] = src0[1 + 2 * src_step]; + dst0[8] = src1[0 + src_step]; + dst0[9] = src1[1 + src_step]; + dst0[10] = src0[0 + src_step]; + dst0[11] = src0[1 + src_step]; + dst0[12] = src1[0]; + dst0[13] = src1[1]; + dst0[14] = src0[0]; + dst0[15] = src0[1]; + + src0 += 2; + src1 += 2; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 2 - 2; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + + // point to the last dst pixel in row + unsigned char* dstend = dst + w * 3; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dstend - y * 3 - 8 * 3; + unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src0 = vld3_u8(src0); + uint8x8x3_t _src1 = vld3_u8(src1); + + uint8x8x3_t _src2 = vld3_u8(src0 + src_step); + uint8x8x3_t _src3 = vld3_u8(src1 + src_step); + + uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); + uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); + + uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); + uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), 
vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); + + uint8x8x3_t _dst0; + uint8x8x3_t _dst1; + uint8x8x3_t _dst2; + uint8x8x3_t _dst3; + uint8x8x3_t _dst4; + uint8x8x3_t _dst5; + uint8x8x3_t _dst6; + uint8x8x3_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + _dst0.val[2] = 
vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + + vst3_u8(dst0, _dst7); + vst3_u8(dst1, _dst6); + vst3_u8(dst0 + dst_step, _dst5); + vst3_u8(dst1 + dst_step, _dst4); + vst3_u8(dst0 + 2 * dst_step, _dst3); + vst3_u8(dst1 + 2 * dst_step, _dst2); + vst3_u8(dst0 + 3 * dst_step, _dst1); + vst3_u8(dst1 + 3 * dst_step, _dst0); + + src0 += 3 * 8; + src1 += 3 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1], %10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d4-d6}, [%2], %10 \n" + + "pld [%1, #192] \n" + "vld3.u8 {d8-d10}, [%1], %10 \n" + + "vtrn.u8 q2, q0 \n" // _src01t_r + "vtrn.u8 d6, d2 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d12-d14}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d16-d18}, [%1], %10\n" + + "vtrn.u8 q6, q4 \n" // _src23t_r + "vtrn.u8 d14, d10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d20-d22}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d24-d26}, [%1], %10\n" + + "vtrn.u8 q10, q8 \n" // _src45t_r + "vtrn.u8 d22, d18 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d28-d30}, [%2], %10\n" + + "vtrn.u8 q14, q12 \n" // _src67t_r + "vtrn.u8 d30, d26 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q4, q0 \n" // _src02tt_r + "vtrn.u16 d10, d2 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q6, q2 \n" // _src13tt_r + "vtrn.u16 d14, d6 \n" + + "add %1, #24 \n" // src0 += 24 + + "vtrn.u16 q12, q8 \n" // _src46tt_r + "vtrn.u16 d26, d18 \n" + + "add %2, #24 \n" // src1 += 24 + + "vtrn.u16 q14, q10 \n" // _src57tt_r + "vtrn.u16 d30, d22 \n" + + "vtrn.u32 q12, q4 \n" // _src26ttt_r + "vtrn.u32 d26, d10 \n" + + "vtrn.u32 q14, q6 \n" // _src37ttt_r + "vst3.u8 {d24-d26}, [%4], %11\n" + "vtrn.u32 d30, d14 \n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst3.u8 {d28-d30}, [%3], %11\n" + "vtrn.u32 d18, d2 \n" + + "vtrn.u32 q10, q2 \n" // _src15ttt_r + "vst3.u8 {d16-d18}, [%4], %11\n" + "vtrn.u32 d22, d6 \n" + + "subs %0, #1 \n" + + "vst3.u8 {d20-d22}, [%3], %11\n" + "vst3.u8 {d8-d10}, [%4], %11 \n" + "vst3.u8 {d12-d14}, [%3], %11\n" + "vst3.u8 {d0-d2}, [%4], %11 \n" + "vst3.u8 {d4-d6}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src1[0 + 3 * src_step]; + dst0[1] = src1[1 + 3 * src_step]; + dst0[2] = src1[2 + 3 * src_step]; + dst0[3] = src0[0 + 3 * src_step]; + dst0[4] = src0[1 + 3 * src_step]; + dst0[5] = src0[2 + 3 * src_step]; + dst0[6] = src1[0 + 2 * src_step]; + dst0[7] = src1[1 + 2 * src_step]; + dst0[8] = src1[2 + 2 * src_step]; + dst0[9] = src0[0 + 2 * src_step]; + dst0[10] = src0[1 + 2 * src_step]; + dst0[11] = src0[2 + 2 * src_step]; + dst0[12] = src1[0 + src_step]; + dst0[13] = src1[1 + src_step]; + dst0[14] = src1[2 + src_step]; + dst0[15] = src0[0 + src_step]; + dst0[16] = 
src0[1 + src_step]; + dst0[17] = src0[2 + src_step]; + dst0[18] = src1[0]; + dst0[19] = src1[1]; + dst0[20] = src1[2]; + dst0[21] = src0[0]; + dst0[22] = src0[1]; + dst0[23] = src0[2]; + + src0 += 3; + src1 += 3; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 3 - 3; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + + // point to the last dst pixel in row + unsigned char* dstend = dst + w * 4; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst0 = dstend - y * 4 - 8 * 4; + unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride; + + int src_step = 2 * srcstride; + int dst_step = 2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src0 = vld4_u8(src0); + uint8x8x4_t _src1 = vld4_u8(src1); + + uint8x8x4_t _src2 = vld4_u8(src0 + src_step); + uint8x8x4_t _src3 = vld4_u8(src1 + src_step); + + uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); + uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); + + uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); + uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); + + uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); + uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); + uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); + uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), 
vreinterpret_u16_u8(_src45t_g.val[0])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); + + uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); + uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); + uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1])); + uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); + + uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); + uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); + uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); + uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); + + uint8x8x4_t _dst0; + uint8x8x4_t _dst1; + uint8x8x4_t _dst2; + uint8x8x4_t _dst3; + uint8x8x4_t _dst4; + uint8x8x4_t _dst5; + uint8x8x4_t _dst6; + uint8x8x4_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + 
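+            // descriptive note: the g, b and a channels below are gathered with the same val[]/lane pattern as r above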
+ _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + + _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); + _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); + _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); + _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); + _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); + _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); + _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); + _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); + + vst4_u8(dst0, _dst7); + vst4_u8(dst1, _dst6); + vst4_u8(dst0 + dst_step, _dst5); + vst4_u8(dst1 + dst_step, _dst4); + vst4_u8(dst0 + 2 * dst_step, _dst3); + vst4_u8(dst1 + 2 * dst_step, _dst2); + vst4_u8(dst0 + 3 * dst_step, _dst1); + vst4_u8(dst1 + 3 * dst_step, _dst0); + + src0 += 4 * 8; + src1 += 4 * 8; + + dst0 += 4 * dst_step; + dst1 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld4.u8 {d0-d3}, [%1], %10 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d4-d7}, [%2], %10 \n" + + "pld [%1, #256] \n" + "vld4.u8 {d8-d11}, [%1], %10 \n" + + "vtrn.u8 q2, q0 \n" // _src01t_r + "vtrn.u8 q3, q1 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d12-d15}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d16-d19}, [%1], %10\n" + + "vtrn.u8 q6, q4 \n" // _src23t_r + "vtrn.u8 q7, q5 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d20-d23}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d24-d27}, [%1], %10\n" + + "vtrn.u8 q10, q8 \n" // _src45t_r + "vtrn.u8 q11, q9 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d28-d31}, [%2], %10\n" + + "vtrn.u8 q14, q12 \n" // _src67t_r + "vtrn.u8 q15, q13 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q4, q0 \n" // _src02tt_r + "vtrn.u16 q5, q1 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q6, q2 \n" // _src13tt_r + "vtrn.u16 q7, q3 \n" + + "add %1, #32 \n" // src0 += 32 + + "vtrn.u16 q12, q8 \n" // _src46tt_r + "vtrn.u16 q13, q9 \n" + + "add %2, #32 \n" // src1 += 32 + + "vtrn.u16 q14, q10 \n" // _src57tt_r + "vtrn.u16 q15, q11 \n" + + "vtrn.u32 q12, q4 \n" // _src26ttt_r + "vtrn.u32 q13, q5 \n" + + "vtrn.u32 q14, q6 \n" // _src37ttt_r + "vst4.u8 {d24-d27}, [%4], %11\n" + "vtrn.u32 q15, q7 \n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst4.u8 {d28-d31}, [%3], %11\n" + "vtrn.u32 q9, q1 \n" + + "vtrn.u32 q10, q2 \n" // _src15ttt_r + "vst4.u8 {d16-d19}, [%4], %11\n" + "vtrn.u32 q11, q3 \n" + + "subs %0, #1 \n" + + "vst4.u8 {d8-d11}, [%4], %11 \n" + "vst4.u8 {d20-d23}, [%3], %11\n" + "vst4.u8 {d12-d15}, [%3], %11\n" + "vst4.u8 {d0-d3}, [%4], %11 \n" + "vst4.u8 {d4-d7}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), 
// %1 + "=r"(src1), // %2 + "=r"(dst0), // %3 + "=r"(dst1) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst0), + "4"(dst1), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst0[0] = src1[0 + 3 * src_step]; + dst0[1] = src1[1 + 3 * src_step]; + dst0[2] = src1[2 + 3 * src_step]; + dst0[3] = src1[3 + 3 * src_step]; + dst0[4] = src0[0 + 3 * src_step]; + dst0[5] = src0[1 + 3 * src_step]; + dst0[6] = src0[2 + 3 * src_step]; + dst0[7] = src0[3 + 3 * src_step]; + dst0[8] = src1[0 + 2 * src_step]; + dst0[9] = src1[1 + 2 * src_step]; + dst0[10] = src1[2 + 2 * src_step]; + dst0[11] = src1[3 + 2 * src_step]; + dst0[12] = src0[0 + 2 * src_step]; + dst0[13] = src0[1 + 2 * src_step]; + dst0[14] = src0[2 + 2 * src_step]; + dst0[15] = src0[3 + 2 * src_step]; + dst0[16] = src1[0 + src_step]; + dst0[17] = src1[1 + src_step]; + dst0[18] = src1[2 + src_step]; + dst0[19] = src1[3 + src_step]; + dst0[20] = src0[0 + src_step]; + dst0[21] = src0[1 + src_step]; + dst0[22] = src0[2 + src_step]; + dst0[23] = src0[3 + src_step]; + dst0[24] = src1[0]; + dst0[25] = src1[1]; + dst0[26] = src1[2]; + dst0[27] = src1[3]; + dst0[28] = src0[0]; + dst0[29] = src0[1]; + dst0[30] = src0[2]; + dst0[31] = src0[3]; + + src0 += 4; + src1 += 4; + + dst0 += stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 4 - 4; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 += stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * (h - 1) + w; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst6 = dstend - y - 8 - stride; + unsigned char* dst7 = dstend - y - 8; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src0 = vld1_u8(src0); + uint8x8_t _src1 = vld1_u8(src1); + + uint8x8_t _src2 = vld1_u8(src0 + src_step); + uint8x8_t _src3 = vld1_u8(src1 + src_step); + + uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); + uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); + + uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); + uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); + uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); + uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); + uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint32x2x2_t _src04ttt_r = 
vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); + uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); + uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); + uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); + uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); + uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); + uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); + uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + vst1_u8(dst7, _dst7); + vst1_u8(dst6, _dst6); + vst1_u8(dst7 + dst_step, _dst5); + vst1_u8(dst6 + dst_step, _dst4); + vst1_u8(dst7 + 2 * dst_step, _dst3); + vst1_u8(dst6 + 2 * dst_step, _dst2); + vst1_u8(dst7 + 3 * dst_step, _dst1); + vst1_u8(dst6 + 3 * dst_step, _dst0); + + src0 += 8; + src1 += 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #64] \n" + "vld1.u8 {d0}, [%1], %10 \n" + + "pld [%2, #64] \n" + "vld1.u8 {d1}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d2}, [%1], %10 \n" + + "vtrn.u8 d1, d0 \n" // _src01t_r + + "pld [%2, #64] \n" + "vld1.u8 {d3}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d4}, [%1], %10 \n" + + "vtrn.u8 d3, d2 \n" // _src23t_r + + "pld [%2, #64] \n" + "vld1.u8 {d5}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d6}, [%1], %10 \n" + + "vtrn.u8 d5, d4 \n" // _src45t_r + + "pld [%2, #64] \n" + "vld1.u8 {d7}, [%2], %10 \n" + + "vtrn.u8 d7, d6 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r + + "add %1, #8 \n" // src0 += 8 + + "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r + + "add %2, #8 \n" // src1 += 8 + + "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r + "vst1.u8 {d6}, [%4], %11 \n" + "vst1.u8 {d7}, [%3], %11 \n" + + "subs %0, #1 \n" + + "vst1.u8 {d4}, [%4], %11 \n" + "vst1.u8 {d5}, [%3], %11 \n" + "vst1.u8 {d2}, [%4], %11 \n" + "vst1.u8 {d3}, [%3], %11 \n" + "vst1.u8 {d0}, [%4], %11 \n" + "vst1.u8 {d1}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src1[0 + 3 * src_step]; + dst7[1] = src0[0 + 3 * src_step]; + dst7[2] = src1[0 + 2 * src_step]; + dst7[3] = src0[0 + 2 * src_step]; + dst7[4] = src1[0 + src_step]; + dst7[5] = src0[0 + src_step]; + dst7[6] = src1[0]; + dst7[7] = src0[0]; + + src0 += 1; + src1 += 1; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y - 1; + + int x = 0; + for (; x < srcw; x++) + { + *dst0 = *src0; + + src0 += 1; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int 
srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * (h - 1) + w * 2; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride; + unsigned char* dst7 = dstend - y * 2 - 8 * 2; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src0 = vld2_u8(src0); + uint8x8x2_t _src1 = vld2_u8(src1); + + uint8x8x2_t _src2 = vld2_u8(src0 + src_step); + uint8x8x2_t _src3 = vld2_u8(src1 + src_step); + + uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); + uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); + + uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); + uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint8x8x2_t _dst0; + uint8x8x2_t _dst1; + uint8x8x2_t _dst2; + uint8x8x2_t _dst3; + uint8x8x2_t _dst4; + uint8x8x2_t _dst5; + 
uint8x8x2_t _dst6; + uint8x8x2_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + vst2_u8(dst7, _dst7); + vst2_u8(dst6, _dst6); + vst2_u8(dst7 + dst_step, _dst5); + vst2_u8(dst6 + dst_step, _dst4); + vst2_u8(dst7 + 2 * dst_step, _dst3); + vst2_u8(dst6 + 2 * dst_step, _dst2); + vst2_u8(dst7 + 3 * dst_step, _dst1); + vst2_u8(dst6 + 3 * dst_step, _dst0); + + src0 += 2 * 8; + src1 += 2 * 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1], %10 \n" + + "pld [%2, #128] \n" + "vld2.u8 {d2-d3}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d4-d5}, [%1], %10 \n" + + "vtrn.u8 q1, q0 \n" // _src01t_r + + "pld [%2, #128] \n" + "vld2.u8 {d6-d7}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d16-d17}, [%1], %10\n" + + "vtrn.u8 q3, q2 \n" // _src23t_r + + "pld [%2, #128] \n" + "vld2.u8 {d18-d19}, [%2], %10\n" + + "pld [%1, #128] \n" + "vld2.u8 {d20-d21}, [%1], %10\n" + + "vtrn.u8 q9, q8 \n" // _src45t_r + + "pld [%2, #128] \n" + "vld2.u8 {d22-d23}, [%2], %10\n" + + "vtrn.u8 q11, q10 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q2, q0 \n" // _src02tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q3, q1 \n" // _src13tt_r + + "add %1, #16 \n" // src0 += 16 + + "vtrn.u16 q10, q8 \n" // _src46tt_r + + "add %2, #16 \n" // src1 += 16 + + "vtrn.u16 q11, q9 \n" // _src57tt_r + + "vtrn.u32 q10, q2 \n" // _src26ttt_r + + "vtrn.u32 q11, q3 \n" // _src37ttt_r + "vst2.u8 {d20-d21}, [%4], %11\n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst2.u8 {d22-d23}, [%3], %11\n" + + "vtrn.u32 q9, q1 \n" // _src15ttt_r + "vst2.u8 {d16-d17}, [%4], %11\n" + + "subs %0, #1 \n" + + "vst2.u8 {d4-d5}, [%4], %11 \n" + "vst2.u8 {d18-d19}, [%3], %11\n" + "vst2.u8 {d6-d7}, [%3], %11 \n" + "vst2.u8 {d0-d1}, [%4], %11 \n" + "vst2.u8 {d2-d3}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src1[0 + 3 * src_step]; + dst7[1] = src1[1 + 3 * src_step]; + dst7[2] = src0[0 + 3 * src_step]; + dst7[3] = src0[1 + 3 * src_step]; + dst7[4] = src1[0 + 2 * src_step]; + dst7[5] = src1[1 + 2 * src_step]; + dst7[6] = src0[0 + 2 * src_step]; + dst7[7] = src0[1 + 2 * src_step]; + dst7[8] = src1[0 + src_step]; + dst7[9] = src1[1 + src_step]; + dst7[10] = src0[0 + src_step]; + dst7[11] = 
src0[1 + src_step]; + dst7[12] = src1[0]; + dst7[13] = src1[1]; + dst7[14] = src0[0]; + dst7[15] = src0[1]; + + src0 += 2; + src1 += 2; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 2 - 2; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * (h - 1) + w * 3; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride; + unsigned char* dst7 = dstend - y * 3 - 8 * 3; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src0 = vld3_u8(src0); + uint8x8x3_t _src1 = vld3_u8(src1); + + uint8x8x3_t _src2 = vld3_u8(src0 + src_step); + uint8x8x3_t _src3 = vld3_u8(src1 + src_step); + + uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); + uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); + + uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); + uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), 
vreinterpret_u16_u8(_src45t_b.val[1])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); + + uint8x8x3_t _dst0; + uint8x8x3_t _dst1; + uint8x8x3_t _dst2; + uint8x8x3_t _dst3; + uint8x8x3_t _dst4; + uint8x8x3_t _dst5; + uint8x8x3_t _dst6; + uint8x8x3_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + + vst3_u8(dst7, _dst7); + vst3_u8(dst6, _dst6); + vst3_u8(dst7 + dst_step, _dst5); + vst3_u8(dst6 + dst_step, _dst4); + vst3_u8(dst7 + 2 * dst_step, _dst3); + vst3_u8(dst6 + 2 * dst_step, _dst2); + vst3_u8(dst7 + 3 * dst_step, _dst1); + vst3_u8(dst6 + 3 * dst_step, _dst0); + + src0 += 3 * 8; + src1 += 3 * 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else 
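+        // armv7 path: the same 8x8 three-channel transpose as the intrinsics above, expressed as vld3/vtrn/vst3 inline assembly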
+ if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1], %10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d4-d6}, [%2], %10 \n" + + "pld [%1, #192] \n" + "vld3.u8 {d8-d10}, [%1], %10 \n" + + "vtrn.u8 q2, q0 \n" // _src01t_r + "vtrn.u8 d6, d2 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d12-d14}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d16-d18}, [%1], %10\n" + + "vtrn.u8 q6, q4 \n" // _src23t_r + "vtrn.u8 d14, d10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d20-d22}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d24-d26}, [%1], %10\n" + + "vtrn.u8 q10, q8 \n" // _src45t_r + "vtrn.u8 d22, d18 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d28-d30}, [%2], %10\n" + + "vtrn.u8 q14, q12 \n" // _src67t_r + "vtrn.u8 d30, d26 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q4, q0 \n" // _src02tt_r + "vtrn.u16 d10, d2 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q6, q2 \n" // _src13tt_r + "vtrn.u16 d14, d6 \n" + + "add %1, #24 \n" // src0 += 24 + + "vtrn.u16 q12, q8 \n" // _src46tt_r + "vtrn.u16 d26, d18 \n" + + "add %2, #24 \n" // src1 += 24 + + "vtrn.u16 q14, q10 \n" // _src57tt_r + "vtrn.u16 d30, d22 \n" + + "vtrn.u32 q12, q4 \n" // _src26ttt_r + "vtrn.u32 d26, d10 \n" + + "vtrn.u32 q14, q6 \n" // _src37ttt_r + "vst3.u8 {d24-d26}, [%4], %11\n" + "vtrn.u32 d30, d14 \n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst3.u8 {d28-d30}, [%3], %11\n" + "vtrn.u32 d18, d2 \n" + + "vtrn.u32 q10, q2 \n" // _src15ttt_r + "vst3.u8 {d16-d18}, [%4], %11\n" + "vtrn.u32 d22, d6 \n" + + "subs %0, #1 \n" + + "vst3.u8 {d8-d10}, [%4], %11 \n" + "vst3.u8 {d20-d22}, [%3], %11\n" + "vst3.u8 {d12-d14}, [%3], %11\n" + "vst3.u8 {d0-d2}, [%4], %11 \n" + "vst3.u8 {d4-d6}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src1[0 + 3 * src_step]; + dst7[1] = src1[1 + 3 * src_step]; + dst7[2] = src1[2 + 3 * src_step]; + dst7[3] = src0[0 + 3 * src_step]; + dst7[4] = src0[1 + 3 * src_step]; + dst7[5] = src0[2 + 3 * src_step]; + dst7[6] = src1[0 + 2 * src_step]; + dst7[7] = src1[1 + 2 * src_step]; + dst7[8] = src1[2 + 2 * src_step]; + dst7[9] = src0[0 + 2 * src_step]; + dst7[10] = src0[1 + 2 * src_step]; + dst7[11] = src0[2 + 2 * src_step]; + dst7[12] = src1[0 + src_step]; + dst7[13] = src1[1 + src_step]; + dst7[14] = src1[2 + src_step]; + dst7[15] = src0[0 + src_step]; + dst7[16] = src0[1 + src_step]; + dst7[17] = src0[2 + src_step]; + dst7[18] = src1[0]; + dst7[19] = src1[1]; + dst7[20] = src1[2]; + dst7[21] = src0[0]; + dst7[22] = src0[1]; + dst7[23] = src0[2]; + + src0 += 3; + src1 += 3; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 3 - 3; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + + // point to the last dst pixel + unsigned char* dstend = dst + stride * (h - 1) + w * 
4; + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride; + unsigned char* dst7 = dstend - y * 4 - 8 * 4; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src0 = vld4_u8(src0); + uint8x8x4_t _src1 = vld4_u8(src1); + + uint8x8x4_t _src2 = vld4_u8(src0 + src_step); + uint8x8x4_t _src3 = vld4_u8(src1 + src_step); + + uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); + uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); + + uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); + uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); + + uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); + uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); + uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); + uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); + + uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); + uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); + uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), 
vreinterpret_u16_u8(_src45t_a.val[1])); + uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); + + uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); + uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); + uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); + uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); + + uint8x8x4_t _dst0; + uint8x8x4_t _dst1; + uint8x8x4_t _dst2; + uint8x8x4_t _dst3; + uint8x8x4_t _dst4; + uint8x8x4_t _dst5; + uint8x8x4_t _dst6; + uint8x8x4_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + 
_dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + + _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); + _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); + _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); + _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); + _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); + _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); + _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); + _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); + + vst4_u8(dst7, _dst7); + vst4_u8(dst6, _dst6); + vst4_u8(dst7 + dst_step, _dst5); + vst4_u8(dst6 + dst_step, _dst4); + vst4_u8(dst7 + 2 * dst_step, _dst3); + vst4_u8(dst6 + 2 * dst_step, _dst2); + vst4_u8(dst7 + 3 * dst_step, _dst1); + vst4_u8(dst6 + 3 * dst_step, _dst0); + + src0 += 4 * 8; + src1 += 4 * 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld4.u8 {d0-d3}, [%1], %10 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d4-d7}, [%2], %10 \n" + + "pld [%1, #256] \n" + "vld4.u8 {d8-d11}, [%1], %10 \n" + + "vtrn.u8 q2, q0 \n" // _src01t_r + "vtrn.u8 q3, q1 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d12-d15}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d16-d19}, [%1], %10\n" + + "vtrn.u8 q6, q4 \n" // _src23t_r + "vtrn.u8 q7, q5 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d20-d23}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d24-d27}, [%1], %10\n" + + "vtrn.u8 q10, q8 \n" // _src45t_r + "vtrn.u8 q11, q9 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d28-d31}, [%2], %10\n" + + "vtrn.u8 q14, q12 \n" // _src67t_r + "vtrn.u8 q15, q13 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q4, q0 \n" // _src02tt_r + "vtrn.u16 q5, q1 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q6, q2 \n" // _src13tt_r + "vtrn.u16 q7, q3 \n" + + "add %1, #32 \n" // src0 += 32 + + "vtrn.u16 q12, q8 \n" // _src46tt_r + "vtrn.u16 q13, q9 \n" + + "add %2, #32 \n" // src1 += 32 + + "vtrn.u16 q14, q10 \n" // _src57tt_r + "vtrn.u16 q15, q11 \n" + + "vtrn.u32 q12, q4 \n" // _src26ttt_r + "vtrn.u32 q13, q5 \n" + + "vtrn.u32 q14, q6 \n" // _src37ttt_r + "vst4.u8 {d24-d27}, [%4], %11\n" + "vtrn.u32 q15, q7 \n" + + "vtrn.u32 q8, q0 \n" // _src04ttt_r + "vst4.u8 {d28-d31}, [%3], %11\n" + "vtrn.u32 q9, q1 \n" + + "vtrn.u32 q10, q2 \n" // _src15ttt_r + "vst4.u8 {d16-d19}, [%4], %11\n" + "vtrn.u32 q11, q3 \n" + + "subs %0, #1 \n" + + "vst4.u8 {d8-d11}, [%4], %11 \n" + "vst4.u8 {d20-d23}, [%3], %11\n" + "vst4.u8 {d12-d15}, [%3], %11\n" + "vst4.u8 {d0-d3}, [%4], %11 \n" + "vst4.u8 {d4-d7}, [%3], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src1[0 + 3 * src_step]; + dst7[1] = src1[1 + 3 * src_step]; + dst7[2] = src1[2 + 3 * src_step]; + dst7[3] = src1[3 + 3 * src_step]; + dst7[4] = src0[0 + 3 * src_step]; + dst7[5] = src0[1 + 3 * src_step]; + dst7[6] = src0[2 + 3 * src_step]; + dst7[7] = src0[3 + 3 * src_step]; + dst7[8] = src1[0 + 2 * src_step]; + dst7[9] = src1[1 + 2 * src_step]; + dst7[10] = src1[2 + 2 * src_step]; + dst7[11] = src1[3 + 2 * src_step]; + dst7[12] = src0[0 
+ 2 * src_step]; + dst7[13] = src0[1 + 2 * src_step]; + dst7[14] = src0[2 + 2 * src_step]; + dst7[15] = src0[3 + 2 * src_step]; + dst7[16] = src1[0 + src_step]; + dst7[17] = src1[1 + src_step]; + dst7[18] = src1[2 + src_step]; + dst7[19] = src1[3 + src_step]; + dst7[20] = src0[0 + src_step]; + dst7[21] = src0[1 + src_step]; + dst7[22] = src0[2 + src_step]; + dst7[23] = src0[3 + src_step]; + dst7[24] = src1[0]; + dst7[25] = src1[1]; + dst7[26] = src1[2]; + dst7[27] = src1[3]; + dst7[28] = src0[0]; + dst7[29] = src0[1]; + dst7[30] = src0[2]; + dst7[31] = src0[3]; + + src0 += 4; + src1 += 4; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend - y * 4 - 4; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) +{ + const int srcwgap = srcstride - srcw; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst7 = dstend + y; + unsigned char* dst6 = dstend + y - stride; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8_t _src0 = vld1_u8(src0); + uint8x8_t _src1 = vld1_u8(src1); + + uint8x8_t _src2 = vld1_u8(src0 + src_step); + uint8x8_t _src3 = vld1_u8(src1 + src_step); + + uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); + uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); + + uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); + uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1); + uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3); + uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5); + uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]); + uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]); + uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]); + uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]); + uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]); + uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]); + uint8x8_t _dst6 = 
vreinterpret_u8_u32(_src26ttt_r.val[1]); + uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + vst1_u8(dst7, _dst0); + vst1_u8(dst6, _dst1); + vst1_u8(dst7 + dst_step, _dst2); + vst1_u8(dst6 + dst_step, _dst3); + vst1_u8(dst7 + 2 * dst_step, _dst4); + vst1_u8(dst6 + 2 * dst_step, _dst5); + vst1_u8(dst7 + 3 * dst_step, _dst6); + vst1_u8(dst6 + 3 * dst_step, _dst7); + + src0 += 8; + src1 += 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #64] \n" + "vld1.u8 {d0}, [%1], %10 \n" + + "pld [%2, #64] \n" + "vld1.u8 {d1}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d2}, [%1], %10 \n" + + "vtrn.u8 d0, d1 \n" // _src01t_r + + "pld [%2, #64] \n" + "vld1.u8 {d3}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d4}, [%1], %10 \n" + + "vtrn.u8 d2, d3 \n" // _src23t_r + + "pld [%2, #64] \n" + "vld1.u8 {d5}, [%2], %10 \n" + + "pld [%1, #64] \n" + "vld1.u8 {d6}, [%1], %10 \n" + + "vtrn.u8 d4, d5 \n" // _src45t_r + + "pld [%2, #64] \n" + "vld1.u8 {d7}, [%2], %10 \n" + + "vtrn.u8 d6, d7 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r + + "add %1, #8 \n" // src0 += 8 + + "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r + + "add %2, #8 \n" // src1 += 8 + + "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r + "vst1.u8 {d0}, [%3], %11 \n" + "vst1.u8 {d1}, [%4], %11 \n" + + "subs %0, #1 \n" + + "vst1.u8 {d2}, [%3], %11 \n" + "vst1.u8 {d3}, [%4], %11 \n" + "vst1.u8 {d4}, [%3], %11 \n" + "vst1.u8 {d5}, [%4], %11 \n" + "vst1.u8 {d6}, [%3], %11 \n" + "vst1.u8 {d7}, [%4], %11 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src0[0]; + dst7[1] = src1[0]; + dst7[2] = src0[0 + src_step]; + dst7[3] = src1[0 + src_step]; + dst7[4] = src0[0 + 2 * src_step]; + dst7[5] = src1[0 + 2 * src_step]; + dst7[6] = src0[0 + 3 * src_step]; + dst7[7] = src1[0 + 3 * src_step]; + + src0 += 1; + src1 += 1; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend + y; + + int x = 0; + for (; x < srcw; x++) + { + *dst0 = *src0; + + src0 += 1; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 2; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst7 = dstend + y * 2; + unsigned char* dst6 = dstend + y * 2 - stride; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x2_t _src0 = vld2_u8(src0); + uint8x8x2_t _src1 = vld2_u8(src1); + + uint8x8x2_t _src2 = vld2_u8(src0 + src_step); + uint8x8x2_t _src3 = vld2_u8(src1 + src_step); + + uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); + uint8x8x2_t _src5 = 
vld2_u8(src1 + 2 * src_step); + + uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); + uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint8x8x2_t _dst0; + uint8x8x2_t _dst1; + uint8x8x2_t _dst2; + uint8x8x2_t _dst3; + uint8x8x2_t _dst4; + uint8x8x2_t _dst5; + uint8x8x2_t _dst6; + uint8x8x2_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = 
vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + vst2_u8(dst7, _dst0); + vst2_u8(dst6, _dst1); + vst2_u8(dst7 + dst_step, _dst2); + vst2_u8(dst6 + dst_step, _dst3); + vst2_u8(dst7 + 2 * dst_step, _dst4); + vst2_u8(dst6 + 2 * dst_step, _dst5); + vst2_u8(dst7 + 3 * dst_step, _dst6); + vst2_u8(dst6 + 3 * dst_step, _dst7); + + src0 += 2 * 8; + src1 += 2 * 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #128] \n" + "vld2.u8 {d0-d1}, [%1], %10 \n" + + "pld [%2, #128] \n" + "vld2.u8 {d2-d3}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d4-d5}, [%1], %10 \n" + + "vtrn.u8 q0, q1 \n" // _src01t_r + + "pld [%2, #128] \n" + "vld2.u8 {d6-d7}, [%2], %10 \n" + + "pld [%1, #128] \n" + "vld2.u8 {d16-d17}, [%1], %10\n" + + "vtrn.u8 q2, q3 \n" // _src23t_r + + "pld [%2, #128] \n" + "vld2.u8 {d18-d19}, [%2], %10\n" + + "pld [%1, #128] \n" + "vld2.u8 {d20-d21}, [%1], %10\n" + + "vtrn.u8 q8, q9 \n" // _src45t_r + + "pld [%2, #128] \n" + "vld2.u8 {d22-d23}, [%2], %10\n" + + "vtrn.u8 q10, q11 \n" // _src67t_r + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q2 \n" // _src02tt_r + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q1, q3 \n" // _src13tt_r + + "add %1, #16 \n" // src0 += 16 + + "vtrn.u16 q8, q10 \n" // _src46tt_r + + "add %2, #16 \n" // src1 += 16 + + "vtrn.u16 q9, q11 \n" // _src57tt_r + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + + "vtrn.u32 q1, q9 \n" // _src15ttt_r + "vst2.u8 {d0-d1}, [%3], %11 \n" + + "vtrn.u32 q2, q10 \n" // _src26ttt_r + "vst2.u8 {d2-d3}, [%4], %11 \n" + + "vtrn.u32 q3, q11 \n" // _src37ttt_r + "vst2.u8 {d4-d5}, [%3], %11 \n" + + "subs %0, #1 \n" + + "vst2.u8 {d16-d17}, [%3], %11\n" + "vst2.u8 {d6-d7}, [%4], %11 \n" + "vst2.u8 {d18-d19}, [%4], %11\n" + "vst2.u8 {d20-d21}, [%3], %11\n" + "vst2.u8 {d22-d23}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src0[0]; + dst7[1] = src0[1]; + dst7[2] = src1[0]; + dst7[3] = src1[1]; + dst7[4] = src0[0 + src_step]; + dst7[5] = src0[1 + src_step]; + dst7[6] = src1[0 + src_step]; + dst7[7] = src1[1 + src_step]; + dst7[8] = src0[0 + 2 * src_step]; + dst7[9] = src0[1 + 2 * src_step]; + dst7[10] = src1[0 + 2 * src_step]; + dst7[11] = src1[1 + 2 * src_step]; + dst7[12] = src0[0 + 3 * src_step]; + dst7[13] = src0[1 + 3 * src_step]; + dst7[14] = src1[0 + 3 * src_step]; + dst7[15] = src1[1 + 3 * src_step]; + + src0 += 2; + src1 += 2; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend + y * 2; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + + src0 += 2; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 3; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + 
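+        // note: each NEON iteration below gathers an 8x8 pixel tile from eight
+        // consecutive source rows (src0/src1 advanced by src_step), transposes
+        // the tile per channel with vtrn at u8/u16/u32 granularity, and stores
+        // the resulting rows upward through dst via the negative dst_step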
unsigned char* dst7 = dstend + y * 3; + unsigned char* dst6 = dstend + y * 3 - stride; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x3_t _src0 = vld3_u8(src0); + uint8x8x3_t _src1 = vld3_u8(src1); + + uint8x8x3_t _src2 = vld3_u8(src0 + src_step); + uint8x8x3_t _src3 = vld3_u8(src1 + src_step); + + uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); + uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); + + uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); + uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g 
= vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); + + uint8x8x3_t _dst0; + uint8x8x3_t _dst1; + uint8x8x3_t _dst2; + uint8x8x3_t _dst3; + uint8x8x3_t _dst4; + uint8x8x3_t _dst5; + uint8x8x3_t _dst6; + uint8x8x3_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + + vst3_u8(dst7, _dst0); + vst3_u8(dst6, _dst1); + vst3_u8(dst7 + dst_step, _dst2); + vst3_u8(dst6 + dst_step, _dst3); + vst3_u8(dst7 + 2 * dst_step, _dst4); + vst3_u8(dst6 + 2 * dst_step, _dst5); + vst3_u8(dst7 + 3 * dst_step, _dst6); + vst3_u8(dst6 + 3 * dst_step, _dst7); + + src0 += 3 * 8; + src1 += 3 * 8; + + dst7 += 4 * dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #192] \n" + "vld3.u8 {d0-d2}, [%1], %10 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d4-d6}, [%2], %10 \n" + + "pld [%1, #192] \n" + "vld3.u8 {d8-d10}, [%1], %10 \n" + + "vtrn.u8 q0, q2 \n" // _src01t_r + "vtrn.u8 d2, d6 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d12-d14}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d16-d18}, [%1], %10\n" + + "vtrn.u8 q4, q6 \n" // _src23t_r + "vtrn.u8 d10, d14 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d20-d22}, [%2], %10\n" + + "pld [%1, #192] \n" + "vld3.u8 {d24-d26}, [%1], %10\n" + + "vtrn.u8 q8, q10 \n" // _src45t_r + "vtrn.u8 d18, d22 \n" + + "pld [%2, #192] \n" + "vld3.u8 {d28-d30}, [%2], %10\n" + + "vtrn.u8 q12, q14 \n" // _src67t_r + "vtrn.u8 d26, d30 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q4 \n" // _src02tt_r + "vtrn.u16 d2, d10 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore 
src1 + + "vtrn.u16 q2, q6 \n" // _src13tt_r + "vtrn.u16 d6, d14 \n" + + "add %1, #24 \n" // src0 += 24 + + "vtrn.u16 q8, q12 \n" // _src46tt_r + "vtrn.u16 d18, d26 \n" + + "add %2, #24 \n" // src1 += 24 + + "vtrn.u16 q10, q14 \n" // _src57tt_r + "vtrn.u16 d22, d30 \n" + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + "vtrn.u32 d2, d18 \n" + + "vtrn.u32 q2, q10 \n" // _src15ttt_r + "vst3.u8 {d0-d2}, [%3], %11 \n" + "vtrn.u32 d6, d22 \n" + + "vtrn.u32 q4, q12 \n" // _src26ttt_r + "vst3.u8 {d4-d6}, [%4], %11 \n" + "vtrn.u32 d10, d26 \n" + + "vtrn.u32 q6, q14 \n" // _src37ttt_r + "vst3.u8 {d8-d10}, [%3], %11 \n" + "vtrn.u32 d14, d30 \n" + + "subs %0, #1 \n" + + "vst3.u8 {d16-d18}, [%3], %11\n" + "vst3.u8 {d12-d14}, [%4], %11\n" + "vst3.u8 {d20-d22}, [%4], %11\n" + "vst3.u8 {d24-d26}, [%3], %11\n" + "vst3.u8 {d28-d30}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src0[0]; + dst7[1] = src0[1]; + dst7[2] = src0[2]; + dst7[3] = src1[0]; + dst7[4] = src1[1]; + dst7[5] = src1[2]; + dst7[6] = src0[0 + src_step]; + dst7[7] = src0[1 + src_step]; + dst7[8] = src0[2 + src_step]; + dst7[9] = src1[0 + src_step]; + dst7[10] = src1[1 + src_step]; + dst7[11] = src1[2 + src_step]; + dst7[12] = src0[0 + 2 * src_step]; + dst7[13] = src0[1 + 2 * src_step]; + dst7[14] = src0[2 + 2 * src_step]; + dst7[15] = src1[0 + 2 * src_step]; + dst7[16] = src1[1 + 2 * src_step]; + dst7[17] = src1[2 + 2 * src_step]; + dst7[18] = src0[0 + 3 * src_step]; + dst7[19] = src0[1 + 3 * src_step]; + dst7[20] = src0[2 + 3 * src_step]; + dst7[21] = src1[0 + 3 * src_step]; + dst7[22] = src1[1 + 3 * src_step]; + dst7[23] = src1[2 + 3 * src_step]; + + src0 += 3; + src1 += 3; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend + y * 3; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + + src0 += 3; + dst0 -= stride; + } + + src0 += srcwgap; + } +} + +static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) +{ + const int srcwgap = srcstride - srcw * 4; + + // point to the last dst pixel row + unsigned char* dstend = dst + stride * (h - 1); + + const unsigned char* src0 = src; + + int y = 0; +#if __ARM_NEON + for (; y + 7 < srch; y += 8) + { + const unsigned char* src1 = src0 + srcstride; + + unsigned char* dst7 = dstend + y * 4; + unsigned char* dst6 = dstend + y * 4 - stride; + + int src_step = 2 * srcstride; + int dst_step = -2 * stride; + + int nn = srcw >> 3; + int remain = srcw - (nn << 3); + +#if __aarch64__ + for (; nn > 0; nn--) + { + uint8x8x4_t _src0 = vld4_u8(src0); + uint8x8x4_t _src1 = vld4_u8(src1); + + uint8x8x4_t _src2 = vld4_u8(src0 + src_step); + uint8x8x4_t _src3 = vld4_u8(src1 + src_step); + + uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); + uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); + + uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); + uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); + + uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); + uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], 
_src3.val[0]); + uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); + uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); + + uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); + uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); + uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); + uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); + + uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); + uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); + uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); + uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); + + uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); + uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); + uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); + uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); + + uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); + uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); + uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); + uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); + + uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); + uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); + uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); + uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); + + uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); + uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); + uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); + uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); + + uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); + uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); + uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); + uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); + + uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); + uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); + uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); + uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); + + uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); + uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); + uint32x2x2_t _src26ttt_g = 
vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); + uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); + + uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); + uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); + uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); + uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); + + uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); + uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); + uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); + uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); + + uint8x8x4_t _dst0; + uint8x8x4_t _dst1; + uint8x8x4_t _dst2; + uint8x8x4_t _dst3; + uint8x8x4_t _dst4; + uint8x8x4_t _dst5; + uint8x8x4_t _dst6; + uint8x8x4_t _dst7; + + _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); + _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); + _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); + _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); + _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); + _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); + _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); + _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); + + _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); + _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); + _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); + _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); + _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); + _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); + _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); + _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); + + _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); + _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); + _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); + _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); + _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); + _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); + _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); + _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); + + _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); + _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); + _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); + _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); + _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); + _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); + _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); + _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); + + vst4_u8(dst7, _dst0); + vst4_u8(dst6, _dst1); + vst4_u8(dst7 + dst_step, _dst2); + vst4_u8(dst6 + dst_step, _dst3); + vst4_u8(dst7 + 2 * dst_step, _dst4); + vst4_u8(dst6 + 2 * dst_step, _dst5); + vst4_u8(dst7 + 3 * dst_step, _dst6); + vst4_u8(dst6 + 3 * dst_step, _dst7); + + src0 += 4 * 8; + src1 += 4 * 8; + + dst7 += 4 * 
dst_step; + dst6 += 4 * dst_step; + } +#else + if (nn > 0) + { + asm volatile( + "0: \n" + "pld [%1, #256] \n" + "vld4.u8 {d0-d3}, [%1], %10 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d4-d7}, [%2], %10 \n" + + "pld [%1, #256] \n" + "vld4.u8 {d8-d11}, [%1], %10 \n" + + "vtrn.u8 q0, q2 \n" // _src01t_r + "vtrn.u8 q1, q3 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d12-d15}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d16-d19}, [%1], %10\n" + + "vtrn.u8 q4, q6 \n" // _src23t_r + "vtrn.u8 q5, q7 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d20-d23}, [%2], %10\n" + + "pld [%1, #256] \n" + "vld4.u8 {d24-d27}, [%1], %10\n" + + "vtrn.u8 q8, q10 \n" // _src45t_r + "vtrn.u8 q9, q11 \n" + + "pld [%2, #256] \n" + "vld4.u8 {d28-d31}, [%2], %10\n" + + "vtrn.u8 q12, q14 \n" // _src67t_r + "vtrn.u8 q13, q15 \n" + + "sub %1, %1, %10, lsl #2 \n" // restore src0 + + "vtrn.u16 q0, q4 \n" // _src02tt_r + "vtrn.u16 q1, q5 \n" + + "sub %2, %2, %10, lsl #2 \n" // restore src1 + + "vtrn.u16 q2, q6 \n" // _src13tt_r + "vtrn.u16 q3, q7 \n" + + "add %1, #32 \n" // src0 += 32 + + "vtrn.u16 q8, q12 \n" // _src46tt_r + "vtrn.u16 q9, q13 \n" + + "add %2, #32 \n" // src1 += 32 + + "vtrn.u16 q10, q14 \n" // _src57tt_r + "vtrn.u16 q11, q15 \n" + + "vtrn.u32 q0, q8 \n" // _src04ttt_r + "vtrn.u32 q1, q9 \n" + + "vtrn.u32 q2, q10 \n" // _src15ttt_r + "vst4.u8 {d0-d3}, [%3], %11 \n" + "vtrn.u32 q3, q11 \n" + + "vtrn.u32 q4, q12 \n" // _src26ttt_r + "vst4.u8 {d4-d7}, [%4], %11 \n" + "vtrn.u32 q5, q13 \n" + + "vtrn.u32 q6, q14 \n" // _src37ttt_r + "vst4.u8 {d8-d11}, [%3], %11 \n" + "vtrn.u32 q7, q15 \n" + + "subs %0, #1 \n" + + "vst4.u8 {d16-d19}, [%3], %11\n" + "vst4.u8 {d12-d15}, [%4], %11\n" + "vst4.u8 {d20-d23}, [%4], %11\n" + "vst4.u8 {d24-d27}, [%3], %11\n" + "vst4.u8 {d28-d31}, [%4], %11\n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(src0), // %1 + "=r"(src1), // %2 + "=r"(dst7), // %3 + "=r"(dst6) // %4 + : "0"(nn), + "1"(src0), + "2"(src1), + "3"(dst7), + "4"(dst6), + "r"(src_step), // %10 + "r"(dst_step) // %11 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif // __aarch64__ + for (; remain > 0; remain--) + { + dst7[0] = src0[0]; + dst7[1] = src0[1]; + dst7[2] = src0[2]; + dst7[3] = src0[3]; + dst7[4] = src1[0]; + dst7[5] = src1[1]; + dst7[6] = src1[2]; + dst7[7] = src1[3]; + dst7[8] = src0[0 + src_step]; + dst7[9] = src0[1 + src_step]; + dst7[10] = src0[2 + src_step]; + dst7[11] = src0[3 + src_step]; + dst7[12] = src1[0 + src_step]; + dst7[13] = src1[1 + src_step]; + dst7[14] = src1[2 + src_step]; + dst7[15] = src1[3 + src_step]; + dst7[16] = src0[0 + 2 * src_step]; + dst7[17] = src0[1 + 2 * src_step]; + dst7[18] = src0[2 + 2 * src_step]; + dst7[19] = src0[3 + 2 * src_step]; + dst7[20] = src1[0 + 2 * src_step]; + dst7[21] = src1[1 + 2 * src_step]; + dst7[22] = src1[2 + 2 * src_step]; + dst7[23] = src1[3 + 2 * src_step]; + dst7[24] = src0[0 + 3 * src_step]; + dst7[25] = src0[1 + 3 * src_step]; + dst7[26] = src0[2 + 3 * src_step]; + dst7[27] = src0[3 + 3 * src_step]; + dst7[28] = src1[0 + 3 * src_step]; + dst7[29] = src1[1 + 3 * src_step]; + dst7[30] = src1[2 + 3 * src_step]; + dst7[31] = src1[3 + 3 * src_step]; + + src0 += 4; + src1 += 4; + + dst7 -= stride; + } + + src0 += srcwgap + 7 * srcstride; + } +#endif // __ARM_NEON + for (; y < srch; y++) + { + unsigned char* dst0 = dstend + y * 4; + + int x = 0; + for (; x < srcw; x++) + { + dst0[0] = src0[0]; + dst0[1] = src0[1]; + dst0[2] = src0[2]; + dst0[3] = src0[3]; + + src0 += 4; + dst0 -= 
stride; + } + + src0 += srcwgap; + } +} + +void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) +{ + return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type); +} + +void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) +{ + return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type); +} + +void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) +{ + return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type); +} + +void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) +{ + return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type); +} + +void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) +{ + // assert srcw == w && srch == h for type 1234 + // assert srcw == h && srch == w for type 5678 + + switch (type) + { + case 1: + kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 2: + kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 3: + kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 4: + kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 5: + kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 6: + kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 7: + kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 8: + kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride); + break; + default: + // unsupported rotate type + break; + } +} + +void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) +{ + // assert srcw == w && srch == h for type 1234 + // assert srcw == h && srch == w for type 5678 + + switch (type) + { + case 1: + kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 2: + kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 3: + kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 4: + kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 5: + kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 6: + kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 7: + kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 8: + kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride); + break; + default: + // unsupported rotate type + break; + } +} + +void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) +{ + // assert srcw == w && srch == h for type 1234 + // assert srcw == h && srch == w for type 5678 + + switch (type) + { + case 1: + kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 2: + kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 3: + kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 4: + kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 5: + kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + 
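+        // the remaining types follow EXIF orientation semantics:
+        // 6 = rotate 90 CW, 7 = transverse, 8 = rotate 270 CW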
case 6: + kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 7: + kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 8: + kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride); + break; + default: + // unsupported rotate type + break; + } +} + +void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) +{ + // assert srcw == w && srch == h for type 1234 + // assert srcw == h && srch == w for type 5678 + + switch (type) + { + case 1: + kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 2: + kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 3: + kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 4: + kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 5: + kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 6: + kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 7: + kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + case 8: + kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride); + break; + default: + // unsupported rotate type + break; + } +} + +void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) +{ + // assert srcw % 2 == 0 + // assert srch % 2 == 0 + // assert w % 2 == 0 + // assert h % 2 == 0 + + const unsigned char* srcY = src; + unsigned char* dstY = dst; + kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type); + + const unsigned char* srcUV = src + srcw * srch; + unsigned char* dstUV = dst + w * h; + kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type); +} diff --git a/highgui/src/kanna_rotate.h b/highgui/src/kanna_rotate.h new file mode 100644 index 00000000..8414fa62 --- /dev/null +++ b/highgui/src/kanna_rotate.h @@ -0,0 +1,36 @@ +// +// Copyright (C) 2024 nihui +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef KANNA_ROTATE_H +#define KANNA_ROTATE_H + +void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); + +void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); + +void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); + +void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type); + +void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); + +void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); + +void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); + +void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type); + +#endif // KANNA_ROTATE_H From 58e0f56065074ad234d4cf24e336a7d11da84437 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 9 Feb 2024 22:59:56 +0800 Subject: [PATCH 02/10] wip --- .github/workflows/release.yml | 46 +++++++++++++++++------------------ README.md | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0a40e104..97c017ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,9 +1,9 @@ name: release -on: pull_request -# on: -# push: -# tags: -# - '*' +# on: pull_request +on: + push: + tags: + - '*' env: DEVELOPER_DIR: /Applications/Xcode_13.4.1.app/Contents/Developer @@ -1247,21 +1247,21 @@ jobs: name: ${{ env.PACKAGE_NAME }} path: ${{ env.PACKAGE_NAME }}.zip - # release: - # permissions: - # contents: write # for softprops/action-gh-release to create a release - # needs: [setup, android, ios, ios-simulator, armlinux, macos, mac-catalyst, windows, ubuntu, webassembly, apple, devboard] - # runs-on: ubuntu-latest - # steps: - # - name: download - # uses: actions/download-artifact@v4 - # with: - # path: artifacts - # - # - name: create-release - # uses: softprops/action-gh-release@v1 - # with: - # token: ${{ secrets.GITHUB_TOKEN }} - # tag_name: ${{ needs.setup.outputs.VERSION }} - # name: Release ${{ needs.setup.outputs.VERSION }} - # files: artifacts/*/*.zip + release: + permissions: + contents: write # for softprops/action-gh-release to create a release + needs: [setup, android, ios, ios-simulator, armlinux, macos, mac-catalyst, windows, ubuntu, webassembly, apple, devboard] + runs-on: ubuntu-latest + steps: + - name: download + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: create-release + uses: softprops/action-gh-release@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + tag_name: ${{ needs.setup.outputs.VERSION }} + name: Release ${{ needs.setup.outputs.VERSION }} + files: artifacts/*/*.zip diff --git a/README.md b/README.md index 7e60a936..c6b27242 100644 --- a/README.md +++ b/README.md @@ -331,7 +331,7 @@ https://github.com/nihui/opencv-mobile/releases/latest
tinyvision
arm-linux-uclibcgnueabihf
- ❌ HW JPG decoder (WIP)
+ ✅ HW JPG decoder
❌ HW JPG encoder (WIP)
✅ MIPI CSI camera
From 515dca9099877c4435b83d7888e2295c5c24463c Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 12 Feb 2024 15:46:08 +0800 Subject: [PATCH 03/10] aw jpg encoder --- highgui/CMakeLists.txt | 1 + highgui/src/highgui.cpp | 65 ++ highgui/src/jpeg_encoder_aw.cpp | 1350 +++++++++++++++++++++++++++++++ highgui/src/jpeg_encoder_aw.h | 43 + 4 files changed, 1459 insertions(+) create mode 100644 highgui/src/jpeg_encoder_aw.cpp create mode 100644 highgui/src/jpeg_encoder_aw.h diff --git a/highgui/CMakeLists.txt b/highgui/CMakeLists.txt index d8ff69a3..694c3212 100644 --- a/highgui/CMakeLists.txt +++ b/highgui/CMakeLists.txt @@ -9,6 +9,7 @@ set(highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/highgui.cpp ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_aw.cpp ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_decoder_cvi.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_encoder_aw.cpp ${CMAKE_CURRENT_LIST_DIR}/src/jpeg_encoder_rk_mpp.cpp ${CMAKE_CURRENT_LIST_DIR}/src/kanna_rotate.cpp ${CMAKE_CURRENT_LIST_DIR}/src/videocapture.cpp diff --git a/highgui/src/highgui.cpp b/highgui/src/highgui.cpp index c4eb63e8..34b6478c 100644 --- a/highgui/src/highgui.cpp +++ b/highgui/src/highgui.cpp @@ -42,6 +42,7 @@ #if defined __linux__ #include "jpeg_decoder_aw.h" #include "jpeg_decoder_cvi.h" +#include "jpeg_encoder_aw.h" #include "jpeg_encoder_rk_mpp.h" #endif @@ -320,6 +321,38 @@ bool imwrite(const String& filename, InputArray _img, const std::vector& pa #if defined __linux__ if (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG") { + if (jpeg_encoder_aw::supported(img.cols, img.rows, c)) + { + // anything to bgr + if (!img.isContinuous()) + { + img = img.clone(); + } + + int quality = 95; + for (size_t i = 0; i < params.size(); i += 2) + { + if (params[i] == IMWRITE_JPEG_QUALITY) + { + quality = params[i + 1]; + break; + } + } + + jpeg_encoder_aw e; + int ret = e.init(img.cols, img.rows, c, quality); + if (ret == 0) + { + ret = e.encode(img.data, filename.c_str()); + if (ret == 0) + { + e.deinit(); + return true; + } + } + + // fallback to stb_image_write + } if (jpeg_encoder_rk_mpp::supported(img.cols, img.rows, c)) { // anything to bgr @@ -604,6 +637,38 @@ bool imencode(const String& ext, InputArray _img, std::vector& buf, const #if defined __linux__ if (ext == ".jpg" || ext == ".jpeg" || ext == ".JPG" || ext == ".JPEG") { + if (jpeg_encoder_aw::supported(img.cols, img.rows, c)) + { + // anything to bgr + if (!img.isContinuous()) + { + img = img.clone(); + } + + int quality = 95; + for (size_t i = 0; i < params.size(); i += 2) + { + if (params[i] == IMWRITE_JPEG_QUALITY) + { + quality = params[i + 1]; + break; + } + } + + jpeg_encoder_aw e; + int ret = e.init(img.cols, img.rows, c, quality); + if (ret == 0) + { + ret = e.encode(img.data, buf); + if (ret == 0) + { + e.deinit(); + return true; + } + } + + // fallback to stb_image_write + } if (jpeg_encoder_rk_mpp::supported(img.cols, img.rows, c)) { // anything to bgr diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp new file mode 100644 index 00000000..840ede29 --- /dev/null +++ b/highgui/src/jpeg_encoder_aw.cpp @@ -0,0 +1,1350 @@ +// +// Copyright (C) 2024 nihui +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "jpeg_encoder_aw.h" + +#if defined __linux__ +#include +#include +#include + +#include +#include + +#if __ARM_NEON +#include +#endif + +#include +#include +#include +#include +#include +#include + +// 0 = unknown +// 1 = t113-i +// 2 = tinyvision +static int get_device_model() +{ + static int device_model = -1; + + if (device_model >= 0) + return device_model; + + device_model = 0; + + FILE* fp = fopen("/proc/device-tree/model", "rb"); + if (fp) + { + char buf[1024]; + fgets(buf, 1024, fp); + fclose(fp); + + if (strncmp(buf, "sun8iw20", 8) == 0) + { + // t113-i + device_model = 1; + } + if (strncmp(buf, "sun8iw21", 8) == 0) + { + // tinyvision + device_model = 2; + } + } + + return device_model; +} + +static bool is_device_whitelisted() +{ + const int device_model = get_device_model(); + + if (device_model == 1) + { + // t113-i + return true; + } + if (device_model == 2) + { + // tinyvision + return true; + } + + return false; +} + +extern "C" { + +typedef enum VENC_CODEC_TYPE { + VENC_CODEC_H264, + VENC_CODEC_JPEG, + VENC_CODEC_H264_VER2, + VENC_CODEC_H265, + VENC_CODEC_VP8, +} VENC_CODEC_TYPE; + +typedef enum VENC_PIXEL_FMT { + VENC_PIXEL_YUV420SP, + VENC_PIXEL_YVU420SP, + VENC_PIXEL_YUV420P, + VENC_PIXEL_YVU420P, + VENC_PIXEL_YUV422SP, + VENC_PIXEL_YVU422SP, + VENC_PIXEL_YUV422P, + VENC_PIXEL_YVU422P, + VENC_PIXEL_YUYV422, + VENC_PIXEL_UYVY422, + VENC_PIXEL_YVYU422, + VENC_PIXEL_VYUY422, + VENC_PIXEL_ARGB, + VENC_PIXEL_RGBA, + VENC_PIXEL_ABGR, + VENC_PIXEL_BGRA, + VENC_PIXEL_TILE_32X32, + VENC_PIXEL_TILE_128X32, + VENC_PIXEL_AFBC_AW, + VENC_PIXEL_LBC_AW, //* for v5v200 and newer ic +} VENC_PIXEL_FMT; + +typedef enum VENC_INDEXTYPE { + VENC_IndexParamBitrate = 0x0, + /**< reference type: int */ + VENC_IndexParamFramerate, + /**< reference type: int */ + VENC_IndexParamMaxKeyInterval, + /**< reference type: int */ + VENC_IndexParamIfilter, + /**< reference type: int */ + VENC_IndexParamRotation, + /**< reference type: int */ + VENC_IndexParamSliceHeight, + /**< reference type: int */ + VENC_IndexParamForceKeyFrame, + /**< reference type: int (write only)*/ + VENC_IndexParamMotionDetectEnable, + /**< reference type: MotionParam(write only) */ + VENC_IndexParamMotionDetectStatus, + /**< reference type: int(read only) */ + VENC_IndexParamRgb2Yuv, + /**< reference type: VENC_COLOR_SPACE */ + VENC_IndexParamYuv2Yuv, + /**< reference type: VENC_YUV2YUV */ + VENC_IndexParamROIConfig, + /**< reference type: VencROIConfig */ + VENC_IndexParamStride, + /**< reference type: int */ + VENC_IndexParamColorFormat, + /**< reference type: VENC_PIXEL_FMT */ + VENC_IndexParamSize, + /**< reference type: VencSize(read only) */ + VENC_IndexParamSetVbvSize, + /**< reference type: setVbvSize(write only) */ + VENC_IndexParamVbvInfo, + /**< reference type: getVbvInfo(read only) */ + VENC_IndexParamSuperFrameConfig, + /**< reference type: VencSuperFrameConfig */ + VENC_IndexParamSetPSkip, + /**< reference type: unsigned int */ + VENC_IndexParamResetEnc, + /**< reference type: */ + VENC_IndexParamSaveBSFile, + /**< reference type: VencSaveBSFile */ + VENC_IndexParamHorizonFlip, + 
/**< reference type: unsigned int */ + + /* check capabiliy */ + VENC_IndexParamMAXSupportSize, + /**< reference type: VencSize(read only) */ + VENC_IndexParamCheckColorFormat, + /**< reference type: VencCheckFormat(read only) */ + + /* H264 param */ + VENC_IndexParamH264Param = 0x100, + /**< reference type: VencH264Param */ + VENC_IndexParamH264SPSPPS, + /**< reference type: VencHeaderData (read only)*/ + VENC_IndexParamH264QPRange, + /**< reference type: VencQPRange */ + VENC_IndexParamH264ProfileLevel, + /**< reference type: VencProfileLevel */ + VENC_IndexParamH264EntropyCodingCABAC, + /**< reference type: int(0:CAVLC 1:CABAC) */ + VENC_IndexParamH264CyclicIntraRefresh, + /**< reference type: VencCyclicIntraRefresh */ + VENC_IndexParamH264FixQP, + /**< reference type: VencFixQP */ + VENC_IndexParamH264SVCSkip, + /**< reference type: VencH264SVCSkip */ + VENC_IndexParamH264AspectRatio, + /**< reference type: VencH264AspectRatio */ + VENC_IndexParamFastEnc, + /**< reference type: int */ + VENC_IndexParamH264VideoSignal, + /**< reference type: VencH264VideoSignal */ + VENC_IndexParamH264VideoTiming, + /**< reference type: VencH264VideoTiming */ + VENC_IndexParamChmoraGray, + /**< reference type: unsigned char */ + VENC_IndexParamIQpOffset, + /**< reference type: constant QP */ + VENC_IndexParamH264ConstantQP, + /**< reference type: int */ + /* jpeg param */ + VENC_IndexParamJpegQuality = 0x200, + /**< reference type: int (1~100) */ + VENC_IndexParamJpegExifInfo, + /**< reference type: EXIFInfo */ + VENC_IndexParamJpegEncMode, + /**< reference type: 0:jpeg; 1:motion_jepg */ + VENC_IndexParamJpegVideoSignal, + /**< reference type: VencJpegVideoSignal */ + + /* VP8 param */ + VENC_IndexParamVP8Param, + /* max one frame length */ + VENC_IndexParamSetFrameLenThreshold, + /**< reference type: int */ + /* decrease the a20 dram bands */ + VENC_IndexParamSetA20LowBands, + /**< reference type: 0:disable; 1:enable */ + VENC_IndexParamSetBitRateRange, + /**< reference type: VencBitRateRange */ + VENC_IndexParamLongTermReference, + /**< reference type: 0:disable; 1:enable, default:enable */ + + /* h265 param */ + VENC_IndexParamH265Param = 0x300, + VENC_IndexParamH265Gop, + VENC_IndexParamH265ToalFramesNum, + VENC_IndexParamH26xUpdateLTRef, + VENC_IndexParamH265Header, + VENC_IndexParamH265TendRatioCoef, + VENC_IndexParamH265Trans, + /**< reference type: VencH265TranS */ + VENC_IndexParamH265Sao, + /**< reference type: VencH265SaoS */ + VENC_IndexParamH265Dblk, + /**< reference type: VencH265DblkS */ + VENC_IndexParamH265Timing, + /**< reference type: VencH265TimingS */ + VENC_IndexParamIntraPeriod, + VENC_IndexParamMBModeCtrl, + VENC_IndexParamMBSumInfoOutput, + VENC_IndexParamMBInfoOutput, + VENC_IndexParamVUIAspectRatio, + VENC_IndexParamVUIVideoSignal, + VENC_IndexParamVUIChromaLoc, + VENC_IndexParamVUIDisplayWindow, + VENC_IndexParamVUIBitstreamRestriction, + + VENC_IndexParamAlterFrame = 0x400, + /**< reference type: unsigned int */ + VENC_IndexParamVirtualIFrame, + VENC_IndexParamChannelNum, + VENC_IndexParamProcSet, + /**< reference type: VencOverlayInfoS */ + VENC_IndexParamSetOverlay, + /**< reference type: unsigned char */ + VENC_IndexParamAllParams, + /**< reference type:VencBrightnessS */ + VENC_IndexParamBright, + /**< reference type:VencSmartFun */ + VENC_IndexParamSmartFuntion, + /**< reference type: VencHVS */ + VENC_IndexParamHVS, + /**< reference type: unsigned char */ + VENC_IndexParamSkipTend, + /**< reference type: unsigned char */ + VENC_IndexParamHighPassFilter, + /**< 
reference type: unsigned char */ + VENC_IndexParamPFrameIntraEn, + /**< reference type: unsigned char */ + VENC_IndexParamEncodeTimeEn, + /**< reference type: VencEncodeTimeS */ + VENC_IndexParamGetEncodeTime, + /**< reference type: unsigned char */ + VENC_IndexParam3DFilter, + /**< reference type: unsigned char */ + VENC_IndexParamIntra4x4En, + + /**< reference type: unsigned int */ + VENC_IndexParamSetNullFrame = 0x500, + /**< reference type: VencThumbInfo */ + VENC_IndexParamGetThumbYUV, + /**< reference type: E_ISP_SCALER_RATIO */ + VENC_IndexParamSetThumbScaler, + /**< reference type: unsigned char */ + VENC_IndexParamAdaptiveIntraInP, + /**< reference type: VencBaseConfig */ + VENC_IndexParamUpdateBaseInfo, + + /**< reference type: unsigned char */ + VENC_IndexParamFillingCbr, + + /**< reference type: unsigned char */ + VENC_IndexParamRoi, + + /**< reference type: unsigned int */ + /* drop the frame that bitstreamLen exceed vbv-valid-size */ + VENC_IndexParamDropOverflowFrame, + + /**< reference type: unsigned int; 0: day, 1: night*/ + VENC_IndexParamIsNightCaseFlag, + + /**< reference type: unsigned int; 0: normal case, 1: ipc case*/ + VENC_IndexParamProductCase, + + /**< reference type: VencWatermarkInfoS */ + VENC_IndexParamSetOverlayByWatermark, +} VENC_INDEXTYPE; + +struct ScMemOpsS; +struct VeOpsS; + +typedef struct VencBaseConfig { + unsigned char bEncH264Nalu; + unsigned int nInputWidth; + unsigned int nInputHeight; + unsigned int nDstWidth; + unsigned int nDstHeight; + unsigned int nStride; + VENC_PIXEL_FMT eInputFormat; + struct ScMemOpsS *memops; + VeOpsS* veOpsS; + void* pVeOpsSelf; + + unsigned char bOnlyWbFlag; + + //* for v5v200 and newer ic + unsigned char bLbcLossyComEnFlag2x; + unsigned char bLbcLossyComEnFlag2_5x; + unsigned char bIsVbvNoCache; + //* end +} VencBaseConfig; + +typedef struct VencAllocateBufferParam { + unsigned int nBufferNum; + unsigned int nSizeY; + unsigned int nSizeC; +} VencAllocateBufferParam; + +typedef struct VencRect { + int nLeft; + int nTop; + int nWidth; + int nHeight; +} VencRect; + +/* support 4 ROI region */ +typedef struct VencROIConfig { + int bEnable; + int index; /* (0~3) */ + int nQPoffset; + unsigned char roi_abs_flag; + VencRect sRect; +} VencROIConfig; + +typedef struct VencInputBuffer { + unsigned long nID; + long long nPts; + unsigned int nFlag; + unsigned char* pAddrPhyY; + unsigned char* pAddrPhyC; + unsigned char* pAddrVirY; + unsigned char* pAddrVirC; + int nWidth; + int nHeight; + int nAlign; + int bEnableCorp; + VencRect sCropInfo; + + int ispPicVar; + int ispPicVarChroma; //chroma filter coef[0-63], from isp + int bUseInputBufferRoi; + VencROIConfig roi_param[8]; + int bAllocMemSelf; + int nShareBufFd; + unsigned char bUseCsiColorFormat; + VENC_PIXEL_FMT eCsiColorFormat; + + int envLV; +} VencInputBuffer; + +typedef struct FrameInfo { + int CurrQp; + int avQp; + int nGopIndex; + int nFrameIndex; + int nTotalIndex; +} FrameInfo; + +typedef struct VencOutputBuffer { + int nID; + long long nPts; + unsigned int nFlag; + unsigned int nSize0; + unsigned int nSize1; + unsigned char* pData0; + unsigned char* pData1; + + FrameInfo frame_info; + unsigned int nSize2; + unsigned char* pData2; +} VencOutputBuffer; + +typedef void* VideoEncoder; + +typedef VideoEncoder* (*PFN_VideoEncCreate)(VENC_CODEC_TYPE eCodecType); +typedef void (*PFN_VideoEncDestroy)(VideoEncoder* pEncoder); +typedef int (*PFN_VideoEncInit)(VideoEncoder* pEncoder, VencBaseConfig* pConfig); +typedef int (*PFN_VideoEncUnInit)(VideoEncoder* pEncoder); +typedef 
int (*PFN_AllocInputBuffer)(VideoEncoder* pEncoder, VencAllocateBufferParam* pBufferParam); +typedef int (*PFN_GetOneAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer); +typedef int (*PFN_FlushCacheAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer); +typedef int (*PFN_ReturnOneAllocInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer); +typedef int (*PFN_ReleaseAllocInputBuffer)(VideoEncoder* pEncoder); +typedef int (*PFN_AddOneInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pInputbuffer); +typedef int (*PFN_VideoEncodeOneFrame)(VideoEncoder* pEncoder); +typedef int (*PFN_AlreadyUsedInputBuffer)(VideoEncoder* pEncoder, VencInputBuffer* pBuffer); +typedef int (*PFN_ValidBitstreamFrameNum)(VideoEncoder* pEncoder); +typedef int (*PFN_GetOneBitstreamFrame)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer); +typedef int (*PFN_FreeOneBitStreamFrame)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer); +typedef int (*PFN_VideoEncGetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData); +typedef int (*PFN_VideoEncSetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData); + +} + +static void* libvencoder = 0; + +static PFN_VideoEncCreate VideoEncCreate = 0; +static PFN_VideoEncDestroy VideoEncDestroy = 0; +static PFN_VideoEncInit VideoEncInit = 0; +static PFN_VideoEncUnInit VideoEncUnInit = 0; +static PFN_AllocInputBuffer AllocInputBuffer = 0; +static PFN_GetOneAllocInputBuffer GetOneAllocInputBuffer = 0; +static PFN_FlushCacheAllocInputBuffer FlushCacheAllocInputBuffer = 0; +static PFN_ReturnOneAllocInputBuffer ReturnOneAllocInputBuffer = 0; +static PFN_ReleaseAllocInputBuffer ReleaseAllocInputBuffer = 0; +static PFN_AddOneInputBuffer AddOneInputBuffer = 0; +static PFN_VideoEncodeOneFrame VideoEncodeOneFrame = 0; +static PFN_AlreadyUsedInputBuffer AlreadyUsedInputBuffer = 0; +static PFN_ValidBitstreamFrameNum ValidBitstreamFrameNum = 0; +static PFN_GetOneBitstreamFrame GetOneBitstreamFrame = 0; +static PFN_FreeOneBitStreamFrame FreeOneBitStreamFrame = 0; +static PFN_VideoEncGetParameter VideoEncGetParameter = 0; +static PFN_VideoEncSetParameter VideoEncSetParameter = 0; + +static int load_vencoder_library() +{ + if (libvencoder) + return 0; + + // check device whitelist + bool whitelisted = is_device_whitelisted(); + if (!whitelisted) + { + fprintf(stderr, "this device is not whitelisted for jpeg encoder aw cedarc\n"); + return -1; + } + + libvencoder = dlopen("libvencoder.so", RTLD_LOCAL | RTLD_NOW); + if (!libvencoder) + { + libvencoder = dlopen("/usr/lib/libvencoder.so", RTLD_LOCAL | RTLD_NOW); + } + if (!libvencoder) + { + return -1; + } + + VideoEncCreate = (PFN_VideoEncCreate)dlsym(libvencoder, "VideoEncCreate"); + VideoEncDestroy = (PFN_VideoEncDestroy)dlsym(libvencoder, "VideoEncDestroy"); + VideoEncInit = (PFN_VideoEncInit)dlsym(libvencoder, "VideoEncInit"); + VideoEncUnInit = (PFN_VideoEncUnInit)dlsym(libvencoder, "VideoEncUnInit"); + AllocInputBuffer = (PFN_AllocInputBuffer)dlsym(libvencoder, "AllocInputBuffer"); + GetOneAllocInputBuffer = (PFN_GetOneAllocInputBuffer)dlsym(libvencoder, "GetOneAllocInputBuffer"); + FlushCacheAllocInputBuffer = (PFN_FlushCacheAllocInputBuffer)dlsym(libvencoder, "FlushCacheAllocInputBuffer"); + ReturnOneAllocInputBuffer = (PFN_ReturnOneAllocInputBuffer)dlsym(libvencoder, "ReturnOneAllocInputBuffer"); + ReleaseAllocInputBuffer = (PFN_ReleaseAllocInputBuffer)dlsym(libvencoder, "ReleaseAllocInputBuffer"); + AddOneInputBuffer = 
(PFN_AddOneInputBuffer)dlsym(libvencoder, "AddOneInputBuffer"); + VideoEncodeOneFrame = (PFN_VideoEncodeOneFrame)dlsym(libvencoder, "VideoEncodeOneFrame"); + AlreadyUsedInputBuffer = (PFN_AlreadyUsedInputBuffer)dlsym(libvencoder, "AlreadyUsedInputBuffer"); + ValidBitstreamFrameNum = (PFN_ValidBitstreamFrameNum)dlsym(libvencoder, "ValidBitstreamFrameNum"); + GetOneBitstreamFrame = (PFN_GetOneBitstreamFrame)dlsym(libvencoder, "GetOneBitstreamFrame"); + FreeOneBitStreamFrame = (PFN_FreeOneBitStreamFrame)dlsym(libvencoder, "FreeOneBitStreamFrame"); + VideoEncGetParameter = (PFN_VideoEncGetParameter)dlsym(libvencoder, "VideoEncGetParameter"); + VideoEncSetParameter = (PFN_VideoEncSetParameter)dlsym(libvencoder, "VideoEncSetParameter"); + + return 0; +} + +static int unload_vencoder_library() +{ + if (!libvencoder) + return 0; + + dlclose(libvencoder); + libvencoder = 0; + + VideoEncCreate = 0; + VideoEncDestroy = 0; + VideoEncInit = 0; + VideoEncUnInit = 0; + AllocInputBuffer = 0; + GetOneAllocInputBuffer = 0; + FlushCacheAllocInputBuffer = 0; + ReturnOneAllocInputBuffer = 0; + ReleaseAllocInputBuffer = 0; + AddOneInputBuffer = 0; + VideoEncodeOneFrame = 0; + AlreadyUsedInputBuffer = 0; + ValidBitstreamFrameNum = 0; + GetOneBitstreamFrame = 0; + FreeOneBitStreamFrame = 0; + VideoEncGetParameter = 0; + VideoEncSetParameter = 0; + + return 0; +} + +class vencoder_library_loader +{ +public: + bool ready; + + vencoder_library_loader() + { + ready = (load_vencoder_library() == 0); + } + + ~vencoder_library_loader() + { + unload_vencoder_library(); + } +}; + +static vencoder_library_loader vencoder; + + +static void gray2yuv420sp(const unsigned char* graydata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride) +{ + for (int y = 0; y + 1 < height; y += 2) + { + const unsigned char* p0 = graydata + y * width; + const unsigned char* p1 = graydata + (y + 1) * width; + unsigned char* yptr0 = yptr + y * stride; + unsigned char* yptr1 = yptr + (y + 1) * stride; + unsigned char* uvptr0 = uvptr + (y / 2) * stride; + + memcpy(yptr0, p0, width); + memcpy(yptr1, p1, width); + memset(uvptr0, 128, width); + } +} + +static void bgr2yuv420sp(const unsigned char* bgrdata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride) +{ +#if __ARM_NEON + uint8x8_t _v38 = vdup_n_u8(38); + uint8x8_t _v75 = vdup_n_u8(75); + uint8x8_t _v15 = vdup_n_u8(15); + + uint8x8_t _v127 = vdup_n_u8(127); + uint8x8_t _v84_107 = vzip_u8(vdup_n_u8(84), vdup_n_u8(107)).val[0]; + uint8x8_t _v43_20 = vzip_u8(vdup_n_u8(43), vdup_n_u8(20)).val[0]; + uint16x8_t _v128 = vdupq_n_u16((128 << 8) + 128); +#endif // __ARM_NEON + + for (int y = 0; y + 1 < height; y += 2) + { + const unsigned char* p0 = bgrdata + y * width * 3; + const unsigned char* p1 = bgrdata + (y + 1) * width * 3; + unsigned char* yptr0 = yptr + y * stride; + unsigned char* yptr1 = yptr + (y + 1) * stride; + unsigned char* uvptr0 = uvptr + (y / 2) * stride; + + int x = 0; +#if __ARM_NEON + for (; x + 7 < width; x += 8) + { + uint8x8x3_t _bgr0 = vld3_u8(p0); + uint8x8x3_t _bgr1 = vld3_u8(p1); + + uint16x8_t _y0 = vmull_u8(_bgr0.val[0], _v15); + uint16x8_t _y1 = vmull_u8(_bgr1.val[0], _v15); + _y0 = vmlal_u8(_y0, _bgr0.val[1], _v75); + _y1 = vmlal_u8(_y1, _bgr1.val[1], _v75); + _y0 = vmlal_u8(_y0, _bgr0.val[2], _v38); + _y1 = vmlal_u8(_y1, _bgr1.val[2], _v38); + uint8x8_t _y0_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y0), 7); + uint8x8_t _y1_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y1), 7); + + uint16x4_t _b4 = 
+static void bgra2yuv420sp(const unsigned char* bgradata, int width, int height, unsigned char* yptr, unsigned char* uvptr, int stride)
+{
+#if __ARM_NEON
+    uint8x8_t _v38 = vdup_n_u8(38);
+    uint8x8_t _v75 = vdup_n_u8(75);
+    uint8x8_t _v15 = vdup_n_u8(15);
+
+    uint8x8_t _v127 = vdup_n_u8(127);
+    uint8x8_t _v84_107 = vzip_u8(vdup_n_u8(84), vdup_n_u8(107)).val[0];
+    uint8x8_t _v43_20 = vzip_u8(vdup_n_u8(43), vdup_n_u8(20)).val[0];
+    uint16x8_t _v128 = vdupq_n_u16((128 << 8) + 128);
+#endif // __ARM_NEON
+
+    for (int y = 0; y + 1 < height; y += 2)
+    {
+        const unsigned char* p0 = bgradata + y * width * 4;
+        const unsigned char* p1 = bgradata + (y + 1) * width * 4;
+        unsigned char* yptr0 = yptr + y * stride;
+        unsigned char* yptr1 = yptr + (y + 1) * stride;
+        unsigned char* uvptr0 = uvptr + (y / 2) * stride;
+
+        int x = 0;
+#if __ARM_NEON
+        for (; x + 7 < width; x += 8)
+        {
+            uint8x8x4_t _bgr0 = vld4_u8(p0);
+            uint8x8x4_t _bgr1 = vld4_u8(p1);
+
+            uint16x8_t _y0 = vmull_u8(_bgr0.val[0], _v15);
+            uint16x8_t _y1 = vmull_u8(_bgr1.val[0], _v15);
+            _y0 = vmlal_u8(_y0, _bgr0.val[1], _v75);
+            _y1 = vmlal_u8(_y1, _bgr1.val[1], _v75);
+            _y0 = vmlal_u8(_y0, _bgr0.val[2], _v38);
+            _y1 = vmlal_u8(_y1, _bgr1.val[2], _v38);
+            uint8x8_t _y0_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y0), 7);
+            uint8x8_t _y1_u8 = vqrshrun_n_s16(vreinterpretq_s16_u16(_y1), 7);
+
+            uint16x4_t _b4 = vpaddl_u8(_bgr0.val[0]);
+            uint16x4_t _g4 = vpaddl_u8(_bgr0.val[1]);
+            uint16x4_t _r4 = vpaddl_u8(_bgr0.val[2]);
+            _b4 = vpadal_u8(_b4, _bgr1.val[0]);
+            _g4 = vpadal_u8(_g4, _bgr1.val[1]);
+            _r4 = vpadal_u8(_r4, _bgr1.val[2]);
+            uint16x4x2_t _brbr = vzip_u16(_b4, _r4);
+            uint16x4x2_t _gggg = vzip_u16(_g4, _g4);
+            uint16x4x2_t _rbrb = vzip_u16(_r4, _b4);
+            uint8x8_t _br = vshrn_n_u16(vcombine_u16(_brbr.val[0], _brbr.val[1]), 2);
+            uint8x8_t _gg = vshrn_n_u16(vcombine_u16(_gggg.val[0], _gggg.val[1]), 2);
+            uint8x8_t _rb = vshrn_n_u16(vcombine_u16(_rbrb.val[0], _rbrb.val[1]), 2);
+
+            // uint8x8_t _br = vtrn_u8(_bgr0.val[0], _bgr0.val[2]).val[0];
+            // uint8x8_t _gg = vtrn_u8(_bgr0.val[1], _bgr0.val[1]).val[0];
+            // uint8x8_t _rb = vtrn_u8(_bgr0.val[2], _bgr0.val[0]).val[0];
+
+            uint16x8_t _uv = vmlal_u8(_v128, _br, _v127);
+            _uv = vmlsl_u8(_uv, _gg, _v84_107);
+            _uv = vmlsl_u8(_uv, _rb, _v43_20);
+            uint8x8_t _uv_u8 = vqshrn_n_u16(_uv, 8);
+
+            vst1_u8(yptr0, _y0_u8);
+            vst1_u8(yptr1, _y1_u8);
+            vst1_u8(uvptr0, _uv_u8);
+
+            p0 += 32;
+            p1 += 32;
+            yptr0 += 8;
+            yptr1 += 8;
+            uvptr0 += 8;
+        }
+#endif
+        for (; x + 1 < width; x += 2)
+        {
+            unsigned char b00 = p0[0];
+            unsigned char g00 = p0[1];
+            unsigned char r00 = p0[2];
+
+            unsigned char b01 = p0[4];
+            unsigned char g01 = p0[5];
+            unsigned char r01 = p0[6];
+
+            unsigned char b10 = p1[0];
+            unsigned char g10 = p1[1];
+            unsigned char r10 = p1[2];
+
+            unsigned char b11 = p1[4];
+            unsigned char g11 = p1[5];
+            unsigned char r11 = p1[6];
+
+            // y =  0.29900 * r + 0.58700 * g + 0.11400 * b
+            // u = -0.16874 * r - 0.33126 * g + 0.50000 * b + 128
+            // v =  0.50000 * r - 0.41869 * g - 0.08131 * b + 128
+
+#define SATURATE_CAST_UCHAR(X) (unsigned char)::std::min(::std::max((int)(X), 0), 255);
+            unsigned char y00 = SATURATE_CAST_UCHAR((38 * r00 + 75 * g00 + 15 * b00 + 64) >> 7);
+            unsigned char y01 = SATURATE_CAST_UCHAR((38 * r01 + 75 * g01 + 15 * b01 + 64) >> 7);
+            unsigned char y10 = SATURATE_CAST_UCHAR((38 * r10 + 75 * g10 + 15 * b10 + 64) >> 7);
+            unsigned char y11 = SATURATE_CAST_UCHAR((38 * r11 + 75 * g11 + 15 * b11 + 64) >> 7);
+
+            unsigned char b4 = (b00 + b01 + b10 + b11) / 4;
+            unsigned char g4 = (g00 + g01 + g10 + g11) / 4;
+            unsigned char r4 = (r00 + r01 + r10 + r11) / 4;
+
+            // unsigned char b4 = b00;
+            // unsigned char g4 = g00;
+            // unsigned char r4 = r00;
+
+            unsigned char u = SATURATE_CAST_UCHAR(((-43 * r4 - 84 * g4 + 127 * b4 + 128) >> 8) + 128);
+            unsigned char v = SATURATE_CAST_UCHAR(((127 * r4 - 107 * g4 - 20 * b4 + 128) >> 8) + 128);
+#undef SATURATE_CAST_UCHAR
+
+            yptr0[0] = y00;
+            yptr0[1] = y01;
+            yptr1[0] = y10;
+            yptr1[1] = y11;
+            uvptr0[0] = u;
+            uvptr0[1] = v;
+
+            p0 += 8;
+            p1 += 8;
+            yptr0 += 2;
+            yptr1 += 2;
+            uvptr0 += 2;
+        }
+    }
+}
+
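[editor's note] All three converters fill the layout the encoder is initialized with (VENC_PIXEL_YUV420SP): a stride x height luma plane plus one interleaved chroma plane at half vertical resolution, 3/2 of the luma size in total. The SDK hands back the two planes as separate pointers (pAddrVirY / pAddrVirC), so treating them as one contiguous block is only an assumption in this sketch of the plane arithmetic:

    // plane arithmetic for a semi-planar 4:2:0 frame, assuming one contiguous allocation
    void split_yuv420sp(unsigned char* base, int stride, int aligned_height,
                        unsigned char** yptr, unsigned char** uvptr)
    {
        *yptr = base;                              // stride * aligned_height bytes of Y
        *uvptr = base + stride * aligned_height;   // half that size of interleaved chroma
    }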
+class jpeg_encoder_aw_impl
+{
+public:
+    jpeg_encoder_aw_impl();
+    ~jpeg_encoder_aw_impl();
+
+    int init(int width, int height, int ch, int quality);
+
+    int encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const;
+
+    int encode(const unsigned char* bgrdata, const char* outfilepath) const;
+
+    int deinit();
+
+protected:
+    int inited;
+    int width;
+    int height;
+    int ch;
+
+    VideoEncoder* venc;
+};
+
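[editor's note] The public jpeg_encoder_aw wrapper mirrors this interface one-to-one (see jpeg_encoder_aw.h later in this patch). A hedged usage sketch; the dimensions and quality are made-up values, and supported() gates the whole attempt so unsupported devices can fall back to a software encoder:

    #include "jpeg_encoder_aw.h"
    #include <vector>

    // encode a BGR frame to JPEG in memory, quality 90 (illustrative values)
    std::vector<unsigned char> encode_frame(const unsigned char* bgr, int w, int h)
    {
        std::vector<unsigned char> jpg;
        if (!jpeg_encoder_aw::supported(w, h, 3))
            return jpg; // caller falls back to stb_image_write or similar

        jpeg_encoder_aw e;
        if (e.init(w, h, 3, 90) == 0)
        {
            e.encode(bgr, jpg); // jpg stays empty on failure
            e.deinit();
        }
        return jpg;
    }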
+jpeg_encoder_aw_impl::jpeg_encoder_aw_impl()
+{
+    inited = 0;
+    width = 0;
+    height = 0;
+    ch = 0;
+
+    venc = 0;
+}
+
+jpeg_encoder_aw_impl::~jpeg_encoder_aw_impl()
+{
+    deinit();
+}
+
+int jpeg_encoder_aw_impl::init(int _width, int _height, int _ch, int quality)
+{
+    if (!vencoder.ready)
+    {
+        fprintf(stderr, "vencoder not ready\n");
+        return -1;
+    }
+
+    if (inited)
+    {
+        int ret = deinit();
+        if (ret != 0)
+        {
+            fprintf(stderr, "deinit failed before re-init\n");
+            return -1;
+        }
+    }
+
+    width = _width;
+    height = _height;
+    ch = _ch;
+
+    // fprintf(stderr, "width = %d\n", width);
+    // fprintf(stderr, "height = %d\n", height);
+    // fprintf(stderr, "ch = %d\n", ch);
+
+    const int aligned_width = (width + 15) / 16 * 16;
+    const int aligned_height = (height + 15) / 16 * 16;
+
+    venc = VideoEncCreate(VENC_CODEC_JPEG);
+    if (!venc)
+    {
+        fprintf(stderr, "VideoEncCreate failed\n");
+        goto OUT;
+    }
+
+    {
+        int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
+        if (ret)
+        {
+            fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+            goto OUT;
+        }
+    }
+
+    {
+        int enc_mode = 0;
+        int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
+        if (ret)
+        {
+            fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
+            goto OUT;
+        }
+    }
+
+    {
+        VencBaseConfig config;
+        memset(&config, 0, sizeof(config));
+        config.nInputWidth = width;
+        config.nInputHeight = height;
+        config.nDstWidth = width;
+        config.nDstHeight = height;
+        config.nStride = aligned_width;
+        config.eInputFormat = VENC_PIXEL_YUV420SP;
+
+        int ret = VideoEncInit(venc, &config);
+        if (ret)
+        {
+            fprintf(stderr, "VideoEncInit failed %d\n", ret);
+            goto OUT;
+        }
+    }
+
+    {
+        VencAllocateBufferParam bufferParam;
+        bufferParam.nSizeY = aligned_width * aligned_height;
+        bufferParam.nSizeC = aligned_width * aligned_height / 2;
+        bufferParam.nBufferNum = 1;
+
+        int ret = AllocInputBuffer(venc, &bufferParam);
+        if (ret)
+        {
+            fprintf(stderr, "AllocInputBuffer failed %d\n", ret);
+            goto OUT;
+        }
+    }
+
+    inited = 1;
+
+    return 0;
+
+OUT:
+    deinit();
+
+    return -1;
+}
+
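[editor's note] init() rounds both dimensions up to the 16-pixel macroblock grid and sizes the NV12-style input from that: nSizeY = aligned_width * aligned_height and nSizeC = nSizeY / 2. Worked numbers for a 1920x1080 frame: aligned_height = (1080 + 15) / 16 * 16 = 1088, so nSizeY = 2088960 bytes and nSizeC = 1044480 bytes, about 3 MiB per input frame. The same rounding as a one-liner:

    // round n up to the next multiple of 16 (integer division truncates)
    inline int align16(int n) { return (n + 15) / 16 * 16; }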
fprintf(stderr, "bUseInputBufferRoi = %d\n", input_buffer.bUseInputBufferRoi); + // fprintf(stderr, "bAllocMemSelf = %d\n", input_buffer.bAllocMemSelf); + // fprintf(stderr, "nShareBufFd = %d\n", input_buffer.nShareBufFd); + // fprintf(stderr, "bUseCsiColorFormat = %d\n", input_buffer.bUseCsiColorFormat); + // fprintf(stderr, "eCsiColorFormat = %d\n", input_buffer.eCsiColorFormat); + // fprintf(stderr, "envLV = %d\n", input_buffer.envLV); + + if (ch == 1) + { + gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + if (ch == 3) + { + bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + if (ch == 4) + { + bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + + { + int ret = FlushCacheAllocInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "FlushCacheAllocInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = AddOneInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "AddOneInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = VideoEncodeOneFrame(venc); + if (ret) + { + fprintf(stderr, "VideoEncodeOneFrame failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = AlreadyUsedInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "AlreadyUsedInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = GetOneBitstreamFrame(venc, &output_buffer); + if (ret) + { + fprintf(stderr, "GetOneBitstreamFrame failed %d\n", ret); + ret_val = -1; + goto OUT; + } + + b_output_buffer_got = 1; + } + + outdata.resize(output_buffer.nSize0 + output_buffer.nSize1); + memcpy(outdata.data(), output_buffer.pData0, output_buffer.nSize0); + if (output_buffer.nSize1) + { + memcpy(outdata.data() + output_buffer.nSize0, output_buffer.pData1, output_buffer.nSize1); + } + +OUT: + if (b_output_buffer_got) + { + int ret = FreeOneBitStreamFrame(venc, &output_buffer); + if (ret) + { + fprintf(stderr, "FreeOneBitStreamFrame failed %d\n", ret); + ret_val = -1; + } + } + + if (b_input_buffer_got) + { + int ret = ReturnOneAllocInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "ReturnOneAllocInputBuffer failed %d\n", ret); + ret_val = -1; + } + } + + return ret_val; +} + +int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, const char* outfilepath) const +{ + if (!inited) + { + fprintf(stderr, "not inited\n"); + return -1; + } + + int ret_val = 0; + + VencInputBuffer input_buffer; + VencOutputBuffer output_buffer; + int b_input_buffer_got = 0; + int b_output_buffer_got = 0; + + FILE* fp = 0; + + const int aligned_width = (width + 15) / 16 * 16; + + { + memset(&input_buffer, 0, sizeof(input_buffer)); + int ret = GetOneAllocInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "GetOneAllocInputBuffer failed %d\n", ret); + goto OUT; + } + + b_input_buffer_got = 1; + } + + // fprintf(stderr, "nID = %d\n", input_buffer.nID); + // fprintf(stderr, "nPts = %lld\n", input_buffer.nPts); + // fprintf(stderr, "nFlag = %d\n", input_buffer.nFlag); + // fprintf(stderr, "nWidth = %d\n", input_buffer.nWidth); + // fprintf(stderr, "nHeight = %d\n", input_buffer.nHeight); + // fprintf(stderr, "nAlign = %d\n", input_buffer.nAlign); + // fprintf(stderr, "bEnableCorp = %d\n", input_buffer.bEnableCorp); + // 
fprintf(stderr, "ispPicVar = %d\n", input_buffer.ispPicVar); + // fprintf(stderr, "ispPicVarChroma = %d\n", input_buffer.ispPicVarChroma); + // fprintf(stderr, "bUseInputBufferRoi = %d\n", input_buffer.bUseInputBufferRoi); + // fprintf(stderr, "bAllocMemSelf = %d\n", input_buffer.bAllocMemSelf); + // fprintf(stderr, "nShareBufFd = %d\n", input_buffer.nShareBufFd); + // fprintf(stderr, "bUseCsiColorFormat = %d\n", input_buffer.bUseCsiColorFormat); + // fprintf(stderr, "eCsiColorFormat = %d\n", input_buffer.eCsiColorFormat); + // fprintf(stderr, "envLV = %d\n", input_buffer.envLV); + + if (ch == 1) + { + gray2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + if (ch == 3) + { + bgr2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + if (ch == 4) + { + bgra2yuv420sp(bgrdata, width, height, (unsigned char*)input_buffer.pAddrVirY, (unsigned char*)input_buffer.pAddrVirC, aligned_width); + } + + { + int ret = FlushCacheAllocInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "FlushCacheAllocInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = AddOneInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "AddOneInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = VideoEncodeOneFrame(venc); + if (ret) + { + fprintf(stderr, "VideoEncodeOneFrame failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = AlreadyUsedInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "AlreadyUsedInputBuffer failed %d\n", ret); + ret_val = -1; + goto OUT; + } + } + + { + int ret = GetOneBitstreamFrame(venc, &output_buffer); + if (ret) + { + fprintf(stderr, "GetOneBitstreamFrame failed %d\n", ret); + ret_val = -1; + goto OUT; + } + + b_output_buffer_got = 1; + } + + fp = fopen(outfilepath, "wb"); + if (!fp) + { + fprintf(stderr, "fopen %s failed\n", outfilepath); + ret_val = -1; + goto OUT; + } + + fwrite(output_buffer.pData0, 1, output_buffer.nSize0, fp); + if (output_buffer.nSize1) + { + fwrite(output_buffer.pData1, 1, output_buffer.nSize1, fp); + } + +OUT: + if (b_output_buffer_got) + { + int ret = FreeOneBitStreamFrame(venc, &output_buffer); + if (ret) + { + fprintf(stderr, "FreeOneBitStreamFrame failed %d\n", ret); + ret_val = -1; + } + } + + if (b_input_buffer_got) + { + int ret = ReturnOneAllocInputBuffer(venc, &input_buffer); + if (ret) + { + fprintf(stderr, "ReturnOneAllocInputBuffer failed %d\n", ret); + ret_val = -1; + } + } + + if (fp) + { + fclose(fp); + } + + return ret_val; +} + +int jpeg_encoder_aw_impl::deinit() +{ + if (!inited) + return 0; + + int ret_val = 0; + + if (venc) + { + VideoEncDestroy(venc); + venc = 0; + } + + width = 0; + height = 0; + ch = 0; + + inited = 0; + + return ret_val; +} + +bool jpeg_encoder_aw::supported(int width, int height, int ch) +{ + if (!vencoder.ready) + return false; + + if (ch != 1 && ch != 3 && ch != 4) + return false; + + if (width % 2 != 0 || height % 2 != 0) + return false; + + if (width < 8 || height < 8) + return false; + + if (width * height > 4000 * 4000) + return false; + + return true; +} + +jpeg_encoder_aw::jpeg_encoder_aw() : d(new jpeg_encoder_aw_impl) +{ +} + +jpeg_encoder_aw::~jpeg_encoder_aw() +{ + delete d; +} + +int jpeg_encoder_aw::init(int width, int height, int ch, int quality) +{ + return d->init(width, height, ch, quality); +} + +int 
+int jpeg_encoder_aw::encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const
+{
+    return d->encode(bgrdata, outdata);
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* bgrdata, const char* outfilepath) const
+{
+    return d->encode(bgrdata, outfilepath);
+}
+
+int jpeg_encoder_aw::deinit()
+{
+    return d->deinit();
+}
+
+#else // defined __linux__
+
+bool jpeg_encoder_aw::supported(int /*width*/, int /*height*/, int /*ch*/)
+{
+    return false;
+}
+
+jpeg_encoder_aw::jpeg_encoder_aw() : d(0)
+{
+}
+
+jpeg_encoder_aw::~jpeg_encoder_aw()
+{
+}
+
+int jpeg_encoder_aw::init(int /*width*/, int /*height*/, int /*ch*/, int /*quality*/)
+{
+    return -1;
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* /*bgrdata*/, std::vector<unsigned char>& /*outdata*/) const
+{
+    return -1;
+}
+
+int jpeg_encoder_aw::encode(const unsigned char* /*bgrdata*/, const char* /*outfilepath*/) const
+{
+    return -1;
+}
+
+int jpeg_encoder_aw::deinit()
+{
+    return -1;
+}
+
+#endif // defined __linux__
diff --git a/highgui/src/jpeg_encoder_aw.h b/highgui/src/jpeg_encoder_aw.h
new file mode 100644
index 00000000..c70dfa2a
--- /dev/null
+++ b/highgui/src/jpeg_encoder_aw.h
@@ -0,0 +1,43 @@
+//
+// Copyright (C) 2024 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef JPEG_ENCODER_AW_H
+#define JPEG_ENCODER_AW_H
+
+#include <vector>
+
+class jpeg_encoder_aw_impl;
+class jpeg_encoder_aw
+{
+public:
+    static bool supported(int width, int height, int ch);
+
+    jpeg_encoder_aw();
+    ~jpeg_encoder_aw();
+
+    int init(int width, int height, int ch, int quality);
+
+    int encode(const unsigned char* bgrdata, std::vector<unsigned char>& outdata) const;
+
+    int encode(const unsigned char* bgrdata, const char* outfilepath) const;
+
+    int deinit();
+
+private:
+    jpeg_encoder_aw_impl* const d;
+};
+
+#endif // JPEG_ENCODER_AW_H

From 0f2cb68fc7f0a8154db2bc5e64b1905436e5deac Mon Sep 17 00:00:00 2001
From: nihui
Date: Mon, 12 Feb 2024 15:46:52 +0800
Subject: [PATCH 04/10] wip

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c6b27242..f024f384 100644
--- a/README.md
+++ b/README.md
@@ -332,7 +332,7 @@ https://github.com/nihui/opencv-mobile/releases/latest
arm-linux-uclibcgnueabihf
✅ HW JPG decoder
- ❌ HW JPG encoder (WIP)
+ ✅ HW JPG encoder
✅ MIPI CSI camera
opencv4-tinyvision

From b8f3399bf591809a987cdf8e00ce5d0f6625c2c8 Mon Sep 17 00:00:00 2001
From: nihui
Date: Thu, 15 Feb 2024 13:06:14 +0800
Subject: [PATCH 05/10] wip

---
 highgui/src/jpeg_decoder_aw.cpp | 30 +++++++++++++++--------------
 highgui/src/jpeg_encoder_aw.cpp | 34 ++++++++++++++++-----------------
 2 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/highgui/src/jpeg_decoder_aw.cpp b/highgui/src/jpeg_decoder_aw.cpp
index c3671c2c..d01ab4b6 100644
--- a/highgui/src/jpeg_decoder_aw.cpp
+++ b/highgui/src/jpeg_decoder_aw.cpp
@@ -37,7 +37,8 @@
 #include "kanna_rotate.h"
 
 // 0 = unknown
-// 1 = tinyvision
+// 1 = t113-i
+// 2 = tinyvision
 static int get_device_model()
 {
     static int device_model = -1;
@@ -54,10 +55,15 @@ static int get_device_model()
         fgets(buf, 1024, fp);
         fclose(fp);
 
+        if (strncmp(buf, "sun8iw20", 8) == 0)
+        {
+            // t113-i
+            device_model = 1;
+        }
         if (strncmp(buf, "sun8iw21", 8) == 0)
         {
             // tinyvision
-            device_model = 1;
+            device_model = 2;
         }
     }
 
@@ -69,6 +75,11 @@ static bool is_device_whitelisted()
     const int device_model = get_device_model();
 
     if (device_model == 1)
+    {
+        // t113-i
+        return true;
+    }
+    if (device_model == 2)
     {
         // tinyvision
         return true;
@@ -793,7 +804,7 @@ jpeg_decoder_aw_impl::jpeg_decoder_aw_impl()
     components = 0;
     sampling_factor = -1;
     progressive = 0;
-    orientation = -1;
+    orientation = 0;
 }
 
 jpeg_decoder_aw_impl::~jpeg_decoder_aw_impl()
@@ -901,7 +912,6 @@ int jpeg_decoder_aw_impl::init(const unsigned char* jpgdata, int jpgsize, int* _
             orientation = 1;
         }
     }
-    // orientation = 7;
 
     if (corrupted)
         return -1;
@@ -1002,8 +1012,6 @@ int jpeg_decoder_aw_impl::decode(const unsigned char* jpgdata, int jpgsize, unsi
     vconfig.nDisplayHoldingFrameBufferNum = 1;
     vconfig.nRotateHoldingFrameBufferNum = 0;
     vconfig.nDecodeSmoothFrameBufferNum = 1;
-    vconfig.nSupportMaxWidth = src_width;
-    vconfig.nSupportMaxHeight = src_height;
 
     int ret = InitializeVideoDecoder(vdec, &videoInfo, &vconfig);
     if (ret != 0)
@@ -1044,17 +1052,11 @@ int jpeg_decoder_aw_impl::decode(const unsigned char* jpgdata, int jpgsize, unsi
     {
         VideoStreamDataInfo dataInfo;
+        memset(&dataInfo, 0, sizeof(dataInfo));
         dataInfo.pData = pBuf;
         dataInfo.nLength = jpgsize;
-        dataInfo.nPts = 0;
-        dataInfo.nPcr = 0;
         dataInfo.bIsFirstPart = 1;
         dataInfo.bIsLastPart = 1;
-        dataInfo.nID = 0;
-        dataInfo.nStreamIndex = 0;
-        dataInfo.bValid = 0;
-        dataInfo.bVideoInfoFlag = 0;
-        dataInfo.pVideoInfo = 0;
 
         int ret = SubmitVideoStreamData(vdec, &dataInfo, 0);
         if (ret != 0)
@@ -1161,7 +1163,7 @@ int jpeg_decoder_aw_impl::deinit()
     components = 0;
     sampling_factor = -1;
     progressive = 0;
-    orientation = -1;
+    orientation = 0;
 
     return 0;
 }
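[editor's note] get_device_model() keys off the SoC name at the start of /proc/device-tree/model (sun8iw20 maps to t113-i, sun8iw21 to tinyvision). One caveat worth noting: fgets() leaves buf untouched when the read fails, so the subsequent strncmp would compare uninitialized memory. A defensive variant, a sketch and not part of the patch:

    #include <cstdio>
    #include <cstddef>

    // returns 0 and fills model[] on success, -1 otherwise (defensive variant)
    static int read_devicetree_model(char* model, size_t size)
    {
        FILE* fp = fopen("/proc/device-tree/model", "rb");
        if (!fp)
            return -1;

        size_t n = fread(model, 1, size - 1, fp);
        fclose(fp);
        if (n == 0)
            return -1;

        model[n] = '\0'; // device-tree strings are NUL-terminated, but don't rely on it
        return 0;
    }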
diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
index 840ede29..57cf2874 100644
--- a/highgui/src/jpeg_encoder_aw.cpp
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -500,23 +500,23 @@ static int unload_vencoder_library()
     dlclose(libvencoder);
     libvencoder = 0;
 
-        VideoEncCreate = 0;
-        VideoEncDestroy = 0;
-        VideoEncInit = 0;
-        VideoEncUnInit = 0;
-        AllocInputBuffer = 0;
-        GetOneAllocInputBuffer = 0;
-        FlushCacheAllocInputBuffer = 0;
-        ReturnOneAllocInputBuffer = 0;
-        ReleaseAllocInputBuffer = 0;
-        AddOneInputBuffer = 0;
-        VideoEncodeOneFrame = 0;
-        AlreadyUsedInputBuffer = 0;
-        ValidBitstreamFrameNum = 0;
-        GetOneBitstreamFrame = 0;
-        FreeOneBitStreamFrame = 0;
-        VideoEncGetParameter = 0;
-        VideoEncSetParameter = 0;
+    VideoEncCreate = 0;
+    VideoEncDestroy = 0;
+    VideoEncInit = 0;
+    VideoEncUnInit = 0;
+    AllocInputBuffer = 0;
+    GetOneAllocInputBuffer = 0;
+    FlushCacheAllocInputBuffer = 0;
+    ReturnOneAllocInputBuffer = 0;
+    ReleaseAllocInputBuffer = 0;
+    AddOneInputBuffer = 0;
+    VideoEncodeOneFrame = 0;
+    AlreadyUsedInputBuffer = 0;
+    ValidBitstreamFrameNum = 0;
+    GetOneBitstreamFrame = 0;
+    FreeOneBitStreamFrame = 0;
+    VideoEncGetParameter = 0;
+    VideoEncSetParameter = 0;
 
     return 0;
 }

From 410a9c388f81439265419d1be539b3777d6e8647 Mon Sep 17 00:00:00 2001
From: nihui
Date: Mon, 19 Feb 2024 21:29:17 +0800
Subject: [PATCH 06/10] fix v85x decoder

---
 highgui/src/jpeg_decoder_aw.cpp | 126 ++++++++++++++++++++++++++++----
 1 file changed, 112 insertions(+), 14 deletions(-)

diff --git a/highgui/src/jpeg_decoder_aw.cpp b/highgui/src/jpeg_decoder_aw.cpp
index d01ab4b6..fda0f1d0 100644
--- a/highgui/src/jpeg_decoder_aw.cpp
+++ b/highgui/src/jpeg_decoder_aw.cpp
@@ -95,11 +95,32 @@ typedef void (*PFN_AddVDPluginSingle)(const char* lib);
 
 }
 
+static void* libcdc_base = 0;
 static void* libvideoengine = 0;
 
 static PFN_AddVDPlugin AddVDPlugin = 0;
 static PFN_AddVDPluginSingle AddVDPluginSingle = 0;
 
+static int unload_videoengine_library()
+{
+    if (libcdc_base)
+    {
+        dlclose(libcdc_base);
+        libcdc_base = 0;
+    }
+
+    if (libvideoengine)
+    {
+        dlclose(libvideoengine);
+        libvideoengine = 0;
+    }
+
+    AddVDPlugin = 0;
+    AddVDPluginSingle = 0;
+
+    return 0;
+}
+
 static int load_videoengine_library()
 {
     if (libvideoengine)
@@ -113,6 +134,16 @@ static int load_videoengine_library()
         return -1;
     }
 
+    libcdc_base = dlopen("libcdc_base.so", RTLD_GLOBAL | RTLD_LAZY);
+    if (!libcdc_base)
+    {
+        libcdc_base = dlopen("/usr/lib/libcdc_base.so", RTLD_GLOBAL | RTLD_LAZY);
+    }
+    if (!libcdc_base)
+    {
+        goto OUT;
+    }
+
     libvideoengine = dlopen("libvideoengine.so", RTLD_LOCAL | RTLD_NOW);
     if (!libvideoengine)
     {
@@ -120,27 +151,18 @@ static int load_videoengine_library()
     }
     if (!libvideoengine)
     {
-        return -1;
+        goto OUT;
     }
 
     AddVDPlugin = (PFN_AddVDPlugin)dlsym(libvideoengine, "AddVDPlugin");
     AddVDPluginSingle = (PFN_AddVDPluginSingle)dlsym(libvideoengine, "AddVDPluginSingle");
 
     return 0;
-}
 
-static int unload_videoengine_library()
-{
-    if (!libvideoengine)
-        return 0;
-
-    dlclose(libvideoengine);
-    libvideoengine = 0;
-
-    AddVDPlugin = 0;
-    AddVDPluginSingle = 0;
+OUT:
+    unload_videoengine_library();
 
-    return 0;
+    return -1;
 }
 
 class videoengine_library_loader
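[editor's note] The v85x fix preloads libcdc_base.so with RTLD_GLOBAL before libvideoengine.so, so the symbols the video engine expects to bind lazily are already in the global namespace when it loads. The flag choice is what matters; a sketch of the dependency-first pattern with hypothetical library names:

    #include <dlfcn.h>

    // load the dependency into the global symbol table first...
    void* dep = dlopen("libdep.so", RTLD_GLOBAL | RTLD_LAZY);
    // ...then the consumer, which can now resolve dep's symbols during its
    // eager (RTLD_NOW) relocation
    void* consumer = dep ? dlopen("libconsumer.so", RTLD_LOCAL | RTLD_NOW) : nullptr;

With RTLD_LOCAL the dependency's symbols would stay invisible to later dlopen'd objects and the consumer's relocation would fail.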
@@ -374,6 +396,70 @@ typedef struct VCONFIG
     int bATMFlag;
 }VConfig;
 
+typedef enum eVeLbcMode
+{
+    LBC_MODE_DISABLE  = 0,
+    LBC_MODE_1_5X     = 1,
+    LBC_MODE_2_0X     = 2,
+    LBC_MODE_2_5X     = 3,
+    LBC_MODE_NO_LOSSY = 4,
+}eVeLbcMode;
+
+typedef struct VCONFIG_v85x
+{
+    int bScaleDownEn;
+    int bRotationEn;
+    int bSecOutputEn;
+    int nHorizonScaleDownRatio;
+    int nVerticalScaleDownRatio;
+    int nSecHorizonScaleDownRatio;
+    int nSecVerticalScaleDownRatio;
+    int nRotateDegree;
+    int bThumbnailMode;
+    int eOutputPixelFormat;
+    int eSecOutputPixelFormat;
+    int bNoBFrames;
+    int bDisable3D;
+    int bSupportMaf;    //not use
+    int bDispErrorFrame;
+    int nVbvBufferSize;
+    int nFrameBufferNum;
+    int bSecureosEn;
+    int bGpuBufValid;
+    int nAlignStride;
+    int bIsSoftDecoderFlag;
+    int bVirMallocSbm;
+    int bSupportPallocBufBeforeDecode;
+    //only used for xuqi, set this flag to 1 meaning palloc the fbm buffer before
+    //decode the sequence, to shorten the first-frame decoding time
+    int nDeInterlaceHoldingFrameBufferNum;
+    int nDisplayHoldingFrameBufferNum;
+    int nRotateHoldingFrameBufferNum;
+    int nDecodeSmoothFrameBufferNum;
+    int bIsTvStream;
+    eVeLbcMode nLbcLossyComMod; //1:1.5x; 2:2x; 3:2.5x;
+
+    struct ScMemOpsS *memops;
+    eControlAfbcMode eCtlAfbcMode;
+    eControlIptvMode eCtlIptvMode;
+
+    VeOpsS* veOpsS;
+    void* pVeOpsSelf;
+    int bConvertVp910bitTo8bit;
+    unsigned int nVeFreq;
+
+    int bCalledByOmxFlag;
+
+    int bSetProcInfoEnable; //* for check the decoder info by cat devices-note
+    int nSetProcInfoFreq;
+    int nChannelNum;
+    int nSupportMaxWidth;  //the max width of mjpeg continue decode
+    int nSupportMaxHeight; //the max height of mjpeg continue decode
+
+    unsigned int bIsLossy; //lossy compression or not
+    unsigned int bRcEn;    //compact storage or not
+}VConfig_v85x;
+
 typedef struct VIDEOSTREAMDATAINFO
 {
     char* pData;
@@ -1013,7 +1099,19 @@ int jpeg_decoder_aw_impl::decode(const unsigned char* jpgdata, int jpgsize, unsi
     vconfig.nRotateHoldingFrameBufferNum = 0;
     vconfig.nDecodeSmoothFrameBufferNum = 1;
 
-    int ret = InitializeVideoDecoder(vdec, &videoInfo, &vconfig);
+    VConfig_v85x vconfig_v85x;
+    memset(&vconfig_v85x, 0, sizeof(vconfig_v85x));
+    vconfig_v85x.eOutputPixelFormat = PIXEL_FORMAT_NV21;
+    vconfig_v85x.eSecOutputPixelFormat = PIXEL_FORMAT_NV21;
+    vconfig_v85x.bSupportPallocBufBeforeDecode = 1;
+    vconfig_v85x.nDeInterlaceHoldingFrameBufferNum = 1;
+    vconfig_v85x.nDisplayHoldingFrameBufferNum = 1;
+    vconfig_v85x.nRotateHoldingFrameBufferNum = 0;
+    vconfig_v85x.nDecodeSmoothFrameBufferNum = 1;
+
+    VConfig* p_vconfig = get_device_model() == 2 ? (VConfig*)&vconfig_v85x : &vconfig;
+
+    int ret = InitializeVideoDecoder(vdec, &videoInfo, p_vconfig);
     if (ret != 0)
     {
         fprintf(stderr, "InitializeVideoDecoder failed %d\n", ret);
From eba9e91d85f420a6513a0d958c176a092ba55557 Mon Sep 17 00:00:00 2001
From: nihui
Date: Tue, 20 Feb 2024 00:19:47 +0800
Subject: [PATCH 07/10] wip

---
 highgui/src/jpeg_encoder_aw.cpp | 814 +++++++++++++++++++++++---------
 1 file changed, 587 insertions(+), 227 deletions(-)

diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
index 57cf2874..e752f449 100644
--- a/highgui/src/jpeg_encoder_aw.cpp
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -87,6 +87,9 @@ static bool is_device_whitelisted()
     return false;
 }
 
+
+
+
 extern "C" {
 
 typedef enum VENC_CODEC_TYPE {
@@ -315,6 +318,15 @@ typedef enum VENC_INDEXTYPE {
 struct ScMemOpsS;
 struct VeOpsS;
 
+typedef enum eVeLbcMode
+{
+    LBC_MODE_DISABLE  = 0,
+    LBC_MODE_1_5X     = 1,
+    LBC_MODE_2_0X     = 2,
+    LBC_MODE_2_5X     = 3,
+    LBC_MODE_NO_LOSSY = 4,
+}eVeLbcMode;
+
 typedef struct VencBaseConfig {
     unsigned char bEncH264Nalu;
     unsigned int nInputWidth;
@@ -336,6 +348,38 @@ typedef struct VencBaseConfig {
     //* end
 } VencBaseConfig;
 
+typedef struct VencBaseConfig_v85x {
+    unsigned char bEncH264Nalu;
+    unsigned int nInputWidth;
+    unsigned int nInputHeight;
+    unsigned int nDstWidth;
+    unsigned int nDstHeight;
+    unsigned int nStride;
+    VENC_PIXEL_FMT eInputFormat;
+    struct ScMemOpsS *memops;
+    VeOpsS* veOpsS;
+    void* pVeOpsSelf;
+
+    unsigned char bOnlyWbFlag;
+
+    //* for v5v200 and newer ic
+    unsigned char bLbcLossyComEnFlag1_5x;
+    unsigned char bLbcLossyComEnFlag2x;
+    unsigned char bLbcLossyComEnFlag2_5x;
+    unsigned char bIsVbvNoCache;
+    //* end
+
+    unsigned int bOnlineMode;        //* 1: online mode, 0: offline mode;
+    unsigned int bOnlineChannel;     //* 1: online channel, 0: offline channel;
+    unsigned int nOnlineShareBufNum; //* share buffer num
+
+    //*for debug
+    unsigned int extend_flag; //* flag&0x1: printf reg before interrupt
+                              //* flag&0x2: printf reg after interrupt
+    eVeLbcMode rec_lbc_mode;  //*0: disable, 1:1.5x , 2: 2.0x, 3: 2.5x, 4: no_lossy
+    //*for debug(end)
+} VencBaseConfig_v85x;
+
 typedef struct VencAllocateBufferParam {
     unsigned int nBufferNum;
     unsigned int nSizeY;
@@ -384,6 +428,30 @@ typedef struct VencInputBuffer {
     int envLV;
 } VencInputBuffer;
 
+typedef struct VencInputBuffer_v85x {
+    unsigned long nID;
+    long long nPts;
+    unsigned int nFlag;
+    unsigned char* pAddrPhyY;
+    unsigned char* pAddrPhyC;
+    unsigned char* pAddrVirY;
+    unsigned char* pAddrVirC;
+    int bEnableCorp;
+    VencRect sCropInfo;
+
+    int ispPicVar;
+    int ispPicVarChroma;  //chroma filter coef[0-63], from isp
+    int bUseInputBufferRoi;
+    VencROIConfig roi_param[8];
+    int bAllocMemSelf;
+    int nShareBufFd;
+    unsigned char bUseCsiColorFormat;
+    VENC_PIXEL_FMT eCsiColorFormat;
+
+    int envLV;
+    int bNeedFlushCache;
+}VencInputBuffer_v85x;
+
 typedef struct FrameInfo {
     int CurrQp;
     int avQp;
@@ -426,6 +494,59 @@ typedef int (*PFN_FreeOneBitStreamFrame)(VideoEncoder* pEncoder, VencOutputBuffe
 typedef int (*PFN_VideoEncGetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
 typedef int (*PFN_VideoEncSetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
 
+// v85x
+typedef VideoEncoder* (*PFN_VencCreate)(VENC_CODEC_TYPE eCodecType);
+typedef void (*PFN_VencDestroy)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencInit)(VideoEncoder* pEncoder, VencBaseConfig_v85x* pConfig);
+typedef int (*PFN_VencStart)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencPause)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencReset)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencAllocateInputBuf)(VideoEncoder* pEncoder, VencAllocateBufferParam *pBufferParam, VencInputBuffer_v85x* dst_inputBuf);
+typedef int (*PFN_VencGetValidInputBufNum)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencQueueInputBuf)(VideoEncoder* pEncoder, VencInputBuffer_v85x* inputbuffer);
+typedef int (*PFN_VencGetValidOutputBufNum)(VideoEncoder* pEncoder);
+typedef int (*PFN_VencDequeueOutputBuf)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_VencQueueOutputBuf)(VideoEncoder* pEncoder, VencOutputBuffer* pBuffer);
+typedef int (*PFN_VencGetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+typedef int (*PFN_VencSetParameter)(VideoEncoder* pEncoder, VENC_INDEXTYPE indexType, void* paramData);
+
+typedef enum
+{
+    VencEvent_FrameFormatNotMatch  = 0, // frame format is not match to initial setting.
+    VencEvent_UpdateMbModeInfo     = 1,
+    VencEvent_UpdateMbStatInfo     = 2,
+    VencEvent_UpdateSharpParam     = 3,
+    VencEvent_UpdateIspMotionParam = 4,
+    VencEvent_UpdateVeToIspParam   = 5,
+    VencEvent_Max = 0x7FFFFFFF
+} VencEventType;
+
+typedef struct
+{
+    int nResult;
+    VencInputBuffer *pInputBuffer;
+    //other information about this frame encoding can be added below.
+
+} VencCbInputBufferDoneInfo;
+
+typedef struct
+{
+    int (*EventHandler)(
+        VideoEncoder* pEncoder,
+        void* pAppData,
+        VencEventType eEvent,
+        unsigned int nData1,
+        unsigned int nData2,
+        void* pEventData);
+
+    int (*InputBufferDone)(
+        VideoEncoder* pEncoder,
+        void* pAppData,
+        VencCbInputBufferDoneInfo* pBufferDoneInfo);
+} VencCbType;
+
+typedef int (*PFN_VencSetCallbacks)(VideoEncoder* pEncoder, VencCbType* pCallbacks, void* pAppData);
+
 }
 
 static void* libvencoder = 0;
@@ -448,6 +569,23 @@ static PFN_FreeOneBitStreamFrame FreeOneBitStreamFrame = 0;
 static PFN_VideoEncGetParameter VideoEncGetParameter = 0;
 static PFN_VideoEncSetParameter VideoEncSetParameter = 0;
 
+// v85x
+static PFN_VencCreate VencCreate = 0;
+static PFN_VencDestroy VencDestroy = 0;
+static PFN_VencInit VencInit = 0;
+static PFN_VencStart VencStart = 0;
+static PFN_VencPause VencPause = 0;
+static PFN_VencReset VencReset = 0;
+static PFN_VencAllocateInputBuf VencAllocateInputBuf = 0;
+static PFN_VencGetValidInputBufNum VencGetValidInputBufNum = 0;
+static PFN_VencQueueInputBuf VencQueueInputBuf = 0;
+static PFN_VencGetValidOutputBufNum VencGetValidOutputBufNum = 0;
+static PFN_VencDequeueOutputBuf VencDequeueOutputBuf = 0;
+static PFN_VencQueueOutputBuf VencQueueOutputBuf = 0;
+static PFN_VencGetParameter VencGetParameter = 0;
+static PFN_VencSetParameter VencSetParameter = 0;
+static PFN_VencSetCallbacks VencSetCallbacks = 0;
+
 static int load_vencoder_library()
 {
     if (libvencoder)
@@ -471,23 +609,44 @@ static int load_vencoder_library()
         return -1;
     }
 
-    VideoEncCreate = (PFN_VideoEncCreate)dlsym(libvencoder, "VideoEncCreate");
-    VideoEncDestroy = (PFN_VideoEncDestroy)dlsym(libvencoder, "VideoEncDestroy");
-    VideoEncInit = (PFN_VideoEncInit)dlsym(libvencoder, "VideoEncInit");
-    VideoEncUnInit = (PFN_VideoEncUnInit)dlsym(libvencoder, "VideoEncUnInit");
-    AllocInputBuffer = (PFN_AllocInputBuffer)dlsym(libvencoder, "AllocInputBuffer");
-    GetOneAllocInputBuffer = (PFN_GetOneAllocInputBuffer)dlsym(libvencoder, "GetOneAllocInputBuffer");
-    FlushCacheAllocInputBuffer = (PFN_FlushCacheAllocInputBuffer)dlsym(libvencoder, "FlushCacheAllocInputBuffer");
-    ReturnOneAllocInputBuffer = (PFN_ReturnOneAllocInputBuffer)dlsym(libvencoder, "ReturnOneAllocInputBuffer");
-    ReleaseAllocInputBuffer = (PFN_ReleaseAllocInputBuffer)dlsym(libvencoder, "ReleaseAllocInputBuffer");
-    AddOneInputBuffer = (PFN_AddOneInputBuffer)dlsym(libvencoder, "AddOneInputBuffer");
-    VideoEncodeOneFrame = (PFN_VideoEncodeOneFrame)dlsym(libvencoder, "VideoEncodeOneFrame");
-    AlreadyUsedInputBuffer = (PFN_AlreadyUsedInputBuffer)dlsym(libvencoder, "AlreadyUsedInputBuffer");
-    ValidBitstreamFrameNum = (PFN_ValidBitstreamFrameNum)dlsym(libvencoder, "ValidBitstreamFrameNum");
-    GetOneBitstreamFrame = (PFN_GetOneBitstreamFrame)dlsym(libvencoder, "GetOneBitstreamFrame");
-    FreeOneBitStreamFrame = (PFN_FreeOneBitStreamFrame)dlsym(libvencoder, "FreeOneBitStreamFrame");
-    VideoEncGetParameter = (PFN_VideoEncGetParameter)dlsym(libvencoder, "VideoEncGetParameter");
-    VideoEncSetParameter = (PFN_VideoEncSetParameter)dlsym(libvencoder, "VideoEncSetParameter");
+    if (get_device_model() == 2)
+    {
+        VencCreate = (PFN_VencCreate)dlsym(libvencoder, "VencCreate");
+        VencDestroy = (PFN_VencDestroy)dlsym(libvencoder, "VencDestroy");
+        VencInit = (PFN_VencInit)dlsym(libvencoder, "VencInit");
+        VencStart = (PFN_VencStart)dlsym(libvencoder, "VencStart");
+        VencPause = (PFN_VencPause)dlsym(libvencoder, "VencPause");
+        VencReset = (PFN_VencReset)dlsym(libvencoder, "VencReset");
+        VencAllocateInputBuf = (PFN_VencAllocateInputBuf)dlsym(libvencoder, "VencAllocateInputBuf");
+        VencGetValidInputBufNum = (PFN_VencGetValidInputBufNum)dlsym(libvencoder, "VencGetValidInputBufNum");
+        VencQueueInputBuf = (PFN_VencQueueInputBuf)dlsym(libvencoder, "VencQueueInputBuf");
+        VencGetValidOutputBufNum = (PFN_VencGetValidOutputBufNum)dlsym(libvencoder, "VencGetValidOutputBufNum");
+        VencDequeueOutputBuf = (PFN_VencDequeueOutputBuf)dlsym(libvencoder, "VencDequeueOutputBuf");
+        VencQueueOutputBuf = (PFN_VencQueueOutputBuf)dlsym(libvencoder, "VencQueueOutputBuf");
+        VencGetParameter = (PFN_VencGetParameter)dlsym(libvencoder, "VencGetParameter");
+        VencSetParameter = (PFN_VencSetParameter)dlsym(libvencoder, "VencSetParameter");
+        VencSetCallbacks = (PFN_VencSetCallbacks)dlsym(libvencoder, "VencSetCallbacks");
+    }
+    else
+    {
+        VideoEncCreate = (PFN_VideoEncCreate)dlsym(libvencoder, "VideoEncCreate");
+        VideoEncDestroy = (PFN_VideoEncDestroy)dlsym(libvencoder, "VideoEncDestroy");
+        VideoEncInit = (PFN_VideoEncInit)dlsym(libvencoder, "VideoEncInit");
+        VideoEncUnInit = (PFN_VideoEncUnInit)dlsym(libvencoder, "VideoEncUnInit");
+        AllocInputBuffer = (PFN_AllocInputBuffer)dlsym(libvencoder, "AllocInputBuffer");
+        GetOneAllocInputBuffer = (PFN_GetOneAllocInputBuffer)dlsym(libvencoder, "GetOneAllocInputBuffer");
+        FlushCacheAllocInputBuffer = (PFN_FlushCacheAllocInputBuffer)dlsym(libvencoder, "FlushCacheAllocInputBuffer");
+        ReturnOneAllocInputBuffer = (PFN_ReturnOneAllocInputBuffer)dlsym(libvencoder, "ReturnOneAllocInputBuffer");
+        ReleaseAllocInputBuffer = (PFN_ReleaseAllocInputBuffer)dlsym(libvencoder, "ReleaseAllocInputBuffer");
+        AddOneInputBuffer = (PFN_AddOneInputBuffer)dlsym(libvencoder, "AddOneInputBuffer");
+        VideoEncodeOneFrame = (PFN_VideoEncodeOneFrame)dlsym(libvencoder, "VideoEncodeOneFrame");
+        AlreadyUsedInputBuffer = (PFN_AlreadyUsedInputBuffer)dlsym(libvencoder, "AlreadyUsedInputBuffer");
+        ValidBitstreamFrameNum = (PFN_ValidBitstreamFrameNum)dlsym(libvencoder, "ValidBitstreamFrameNum");
+        GetOneBitstreamFrame = (PFN_GetOneBitstreamFrame)dlsym(libvencoder, "GetOneBitstreamFrame");
+        FreeOneBitStreamFrame = (PFN_FreeOneBitStreamFrame)dlsym(libvencoder, "FreeOneBitStreamFrame");
+        VideoEncGetParameter = (PFN_VideoEncGetParameter)dlsym(libvencoder, "VideoEncGetParameter");
+        VideoEncSetParameter = (PFN_VideoEncSetParameter)dlsym(libvencoder, "VideoEncSetParameter");
+    }
 
     return 0;
 }
@@ -518,6 +677,22 @@ static int unload_vencoder_library()
     VideoEncGetParameter = 0;
     VideoEncSetParameter = 0;
 
+    VencCreate = 0;
+    VencDestroy = 0;
+    VencInit = 0;
+    VencStart = 0;
+    VencPause = 0;
+    VencReset = 0;
+    VencAllocateInputBuf = 0;
+    VencGetValidInputBufNum = 0;
+    VencQueueInputBuf = 0;
+    VencGetValidOutputBufNum = 0;
+    VencDequeueOutputBuf = 0;
+    VencQueueOutputBuf = 0;
+    VencGetParameter = 0;
+    VencSetParameter = 0;
+    VencSetCallbacks = 0;
+
     return 0;
 }
@@ -822,13 +997,17 @@ class jpeg_encoder_aw_impl
 
     int deinit();
 
-protected:
+public:
     int inited;
     int width;
    int height;
     int ch;
 
     VideoEncoder* venc;
+
+    mutable VencInputBuffer input_buffer;
+    mutable VencInputBuffer_v85x input_buffer_v85x;
+    int b_input_buffer_got;
 };
@@ -839,6 +1018,8 @@ jpeg_encoder_aw_impl::jpeg_encoder_aw_impl()
     ch = 0;
 
     venc = 0;
+
+    b_input_buffer_got = 0;
 }
 
 jpeg_encoder_aw_impl::~jpeg_encoder_aw_impl()
@@ -846,6 +1027,22 @@ jpeg_encoder_aw_impl::~jpeg_encoder_aw_impl()
     deinit();
 }
 
+static int EventHandler(VideoEncoder* pEncoder, void* pAppData, VencEventType eEvent, unsigned int nData1, unsigned int nData2, void* pEventData)
+{
+    fprintf(stderr, "EventHandler event = %d\n", eEvent);
+    return 0;
+}
+
+static int InputBufferDone(VideoEncoder* pEncoder, void* pAppData, VencCbInputBufferDoneInfo* pBufferDoneInfo)
+{
+    fprintf(stderr, "InputBufferDone\n");
+    jpeg_encoder_aw_impl* pthis = (jpeg_encoder_aw_impl*)pAppData;
+
+    memcpy(&pthis->input_buffer_v85x, pBufferDoneInfo->pInputBuffer, sizeof(VencInputBuffer_v85x));
+
+    return 0;
+}
+
 int jpeg_encoder_aw_impl::init(int _width, int _height, int _ch, int quality)
 {
     if (!vencoder.ready)
@@ -875,61 +1072,169 @@ int jpeg_encoder_aw_impl::init(int _width, int _height, int _ch, int quality)
     const int aligned_width = (width + 15) / 16 * 16;
     const int aligned_height = (height + 15) / 16 * 16;
 
-    venc = VideoEncCreate(VENC_CODEC_JPEG);
-    if (!venc)
+    if (get_device_model() == 2)
     {
-        fprintf(stderr, "VideoEncCreate failed\n");
-        goto OUT;
-    }
-
-    {
-        int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
-        if (ret)
+        venc = VencCreate(VENC_CODEC_JPEG);
+        if (!venc)
         {
-            fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+            fprintf(stderr, "VencCreate failed\n");
             goto OUT;
         }
-    }
 
-    {
-        int enc_mode = 0;
-        int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
-        if (ret)
+        // {
+        //     int vbv_size = aligned_width * aligned_height * 3 / 2;
+        //     int ret = VencSetParameter(venc, VENC_IndexParamSetVbvSize, (void*)&vbv_size);
+        //     if (ret)
+        //     {
+        //         fprintf(stderr, "VencSetParameter VENC_IndexParamSetVbvSize failed %d\n", ret);
+        //         goto OUT;
+        //     }
+        // }
+
         {
-            fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
-            goto OUT;
+            int ret = VencSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
+            if (ret)
+            {
+                fprintf(stderr, "VencSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+                goto OUT;
+            }
         }
-    }
 
+        {
+            int enc_mode = 0;
+            int ret = VencSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
+            if (ret)
+            {
+                fprintf(stderr, "VencSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            VencBaseConfig_v85x config;
+            memset(&config, 0, sizeof(config));
+            config.nInputWidth = width;
+            config.nInputHeight = height;
+            config.nDstWidth = width;
+            config.nDstHeight = height;
+            config.nStride = aligned_width;
+            config.eInputFormat = VENC_PIXEL_YUV420SP;
+
+            int ret = VencInit(venc, &config);
+            if (ret)
+            {
+                fprintf(stderr, "VencInit failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            VencAllocateBufferParam bufferParam;
+            bufferParam.nSizeY = aligned_width * aligned_height;
+            bufferParam.nSizeC = aligned_width * aligned_height / 2;
+            bufferParam.nBufferNum = 1;
+
+            int ret = VencAllocateInputBuf(venc, &bufferParam, &input_buffer_v85x);
+            if (ret)
+            {
+                fprintf(stderr, "VencAllocateInputBuf failed %d\n", ret);
+                goto OUT;
+            }
+
+            b_input_buffer_got = 1;
+        }
+
+        {
+            VencCbType vencCallBack;
+            vencCallBack.EventHandler = EventHandler;
+            vencCallBack.InputBufferDone = InputBufferDone;
+
+            int ret = VencSetCallbacks(venc, &vencCallBack, this);
+            if (ret)
+            {
+                fprintf(stderr, "VencSetCallbacks failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            int ret = VencStart(venc);
+            if (ret)
+            {
+                fprintf(stderr, "VencStart failed %d\n", ret);
+                goto OUT;
+            }
+        }
+    }
+    else
     {
-        VencBaseConfig config;
-        memset(&config, 0, sizeof(config));
-        config.nInputWidth = width;
-        config.nInputHeight = height;
-        config.nDstWidth = width;
-        config.nDstHeight = height;
-        config.nStride = aligned_width;
-        config.eInputFormat = VENC_PIXEL_YUV420SP;
-
-        int ret = VideoEncInit(venc, &config);
-        if (ret)
+        venc = VideoEncCreate(VENC_CODEC_JPEG);
+        if (!venc)
         {
-            fprintf(stderr, "VideoEncInit failed %d\n", ret);
+            fprintf(stderr, "VideoEncCreate failed\n");
             goto OUT;
         }
-    }
 
-    {
-        VencAllocateBufferParam bufferParam;
-        bufferParam.nSizeY = aligned_width * aligned_height;
-        bufferParam.nSizeC = aligned_width * aligned_height / 2;
-        bufferParam.nBufferNum = 1;
+        {
+            int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegQuality, (void*)&quality);
+            if (ret)
+            {
+                fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegQuality failed %d\n", ret);
+                goto OUT;
+            }
+        }
 
-        int ret = AllocInputBuffer(venc, &bufferParam);
-        if (ret)
         {
-            fprintf(stderr, "AllocInputBuffer failed %d\n", ret);
-            goto OUT;
+            int enc_mode = 0;
+            int ret = VideoEncSetParameter(venc, VENC_IndexParamJpegEncMode, (void*)&enc_mode);
+            if (ret)
+            {
+                fprintf(stderr, "VideoEncSetParameter VENC_IndexParamJpegEncMode failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            VencBaseConfig config;
+            memset(&config, 0, sizeof(config));
+            config.nInputWidth = width;
+            config.nInputHeight = height;
+            config.nDstWidth = width;
+            config.nDstHeight = height;
+            config.nStride = aligned_width;
+            config.eInputFormat = VENC_PIXEL_YUV420SP;
+
+            int ret = VideoEncInit(venc, &config);
+            if (ret)
+            {
+                fprintf(stderr, "VideoEncInit failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            VencAllocateBufferParam bufferParam;
+            bufferParam.nSizeY = aligned_width * aligned_height;
+            bufferParam.nSizeC = aligned_width * aligned_height / 2;
+            bufferParam.nBufferNum = 1;
+
+            int ret = AllocInputBuffer(venc, &bufferParam);
+            if (ret)
+            {
+                fprintf(stderr, "AllocInputBuffer failed %d\n", ret);
+                goto OUT;
+            }
+        }
+
+        {
+            int ret = GetOneAllocInputBuffer(venc, &input_buffer);
+            if (ret)
+            {
+                fprintf(stderr, "GetOneAllocInputBuffer failed %d\n", ret);
+                goto OUT;
+            }
+
+            b_input_buffer_got = 1;
         }
     }
@@ -955,104 +1260,114 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, std::vector

Date: Sat, 24 Feb 2024 23:01:24 +0800
Subject: [PATCH 08/10] wait encoder completion the ugly way

---
 highgui/src/jpeg_encoder_aw.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
index e752f449..e48d6f34 100644
--- a/highgui/src/jpeg_encoder_aw.cpp
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -1291,8 +1291,16 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, std::vector
 
+        while (1)
         {
             int ret = VencDequeueOutputBuf(venc, &output_buffer);
+            if (ret == 5)
+            {
+                // VENC_RESULT_BITSTREAM_IS_EMPTY
+                // wait encoder complete
+                usleep(10*1000);
+                continue;
+            }
             if (ret)
             {
                 fprintf(stderr, "VencDequeueOutputBuf failed %d\n", ret);
@@ -1465,6 +1473,7 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, std::vector
             }
 
             b_output_buffer_got = 1;
+            break;
         }

From: nihui
Date: Sat, 24 Feb 2024 23:06:09 +0800
Subject: [PATCH 09/10] ooops

---
 highgui/src/jpeg_encoder_aw.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
index e48d6f34..0b048a10 100644
--- a/highgui/src/jpeg_encoder_aw.cpp
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -1456,8 +1456,16 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, const char* outfi
         }
     }
 
+        while (1)
         {
             int ret = VencDequeueOutputBuf(venc, &output_buffer);
+            if (ret == 5)
+            {
+                // VENC_RESULT_BITSTREAM_IS_EMPTY
+                // wait encoder complete
+                usleep(10*1000);
+                continue;
+            }
             if (ret)
             {
                 fprintf(stderr, "VencDequeueOutputBuf failed %d\n", ret);
@@ -1465,6 +1473,7 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, const char* outfi
             }
 
             b_output_buffer_got = 1;
+            break;
         }
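[editor's note] Patches 08 and 09 handle VencDequeueOutputBuf returning 5 (VENC_RESULT_BITSTREAM_IS_EMPTY) by sleeping 10 ms and retrying forever. If the encoder ever faults without producing a bitstream, that loop never exits; a bounded variant of the same wait, where the retry count and sleep are arbitrary choices and not from the patch:

    // poll with a cap instead of while(1); assumes ret == 5 still means "not ready yet"
    // (usleep comes from <unistd.h>, already used by the surrounding code)
    int ret = -1;
    for (int attempt = 0; attempt < 100; attempt++) // give up after roughly one second
    {
        ret = VencDequeueOutputBuf(venc, &output_buffer);
        if (ret != 5)
            break;
        usleep(10 * 1000);
    }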
From 6c22dcdcc7037ac09c75d29f8f98902a16ffb3b4 Mon Sep 17 00:00:00 2001
From: nihui
Date: Sat, 24 Feb 2024 23:11:09 +0800
Subject: [PATCH 10/10] wip

---
 highgui/src/jpeg_encoder_aw.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/highgui/src/jpeg_encoder_aw.cpp b/highgui/src/jpeg_encoder_aw.cpp
index 0b048a10..476e15b5 100644
--- a/highgui/src/jpeg_encoder_aw.cpp
+++ b/highgui/src/jpeg_encoder_aw.cpp
@@ -1027,15 +1027,15 @@ jpeg_encoder_aw_impl::~jpeg_encoder_aw_impl()
     deinit();
 }
 
-static int EventHandler(VideoEncoder* pEncoder, void* pAppData, VencEventType eEvent, unsigned int nData1, unsigned int nData2, void* pEventData)
+static int EventHandler(VideoEncoder* /*pEncoder*/, void* /*pAppData*/, VencEventType /*eEvent*/, unsigned int /*nData1*/, unsigned int /*nData2*/, void* /*pEventData*/)
 {
-    fprintf(stderr, "EventHandler event = %d\n", eEvent);
+    // fprintf(stderr, "EventHandler event = %d\n", eEvent);
     return 0;
 }
 
-static int InputBufferDone(VideoEncoder* pEncoder, void* pAppData, VencCbInputBufferDoneInfo* pBufferDoneInfo)
+static int InputBufferDone(VideoEncoder* /*pEncoder*/, void* pAppData, VencCbInputBufferDoneInfo* pBufferDoneInfo)
 {
-    fprintf(stderr, "InputBufferDone\n");
+    // fprintf(stderr, "InputBufferDone\n");
     jpeg_encoder_aw_impl* pthis = (jpeg_encoder_aw_impl*)pAppData;
 
     memcpy(&pthis->input_buffer_v85x, pBufferDoneInfo->pInputBuffer, sizeof(VencInputBuffer_v85x));
@@ -1544,8 +1544,6 @@ int jpeg_encoder_aw_impl::encode(const unsigned char* bgrdata, const char* outfi
     }
 
-    fprintf(stderr, "encode 3\n");
-
     fp = fopen(outfilepath, "wb");
     if (!fp)
     {