Skip to content

Commit

Permalink
fixed simd options to work on all targets
Browse files Browse the repository at this point in the history
  • Loading branch information
Laurence Bank authored and Laurence Bank committed Jan 28, 2024
1 parent 445f175 commit 649ee19
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/jpeg.inl
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@
extern "C" {
void s3_ycbcr_convert_444(uint8_t *pY, uint8_t *pCB, uint8_t *pCR, uint16_t *pOut, int16_t *pConsts, uint8_t ucPixelType);
void s3_ycbcr_convert_420(uint8_t *pY, uint8_t *pCB, uint8_t *pCR, uint16_t *pOut, int16_t *pConsts, uint8_t ucPixelType);
void s3_dequant(int16_t *pMCU, int16_t *pQuant);
}
int16_t i16_Consts[8] = {0x80, 113, 90, 22, 46, 1,32,2048};
#endif // S3 SIMD
#endif // ESP32

#if !defined(HAS_SIMD) && (defined(__arm__) || defined(__arm64__) || defined(__aarch64__))
#if !defined(HAS_SIMD) && (defined(__arm64__) || defined(__aarch64__))
#include <arm_neon.h>
#define HAS_NEON
#endif
Expand Down
2 changes: 2 additions & 0 deletions src/s3_simd_420.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ARCH_ESP32
#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
.text
Expand Down Expand Up @@ -124,3 +125,4 @@
bnez.n a8,.convert_420_loop
retw.n
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ESP32
3 changes: 3 additions & 0 deletions src/s3_simd_444.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ARCH_ESP32

#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
.text
Expand Down Expand Up @@ -104,3 +106,4 @@ s3_ycbcr_convert_444:
ee.vst.128.ip q2,a5,0
retw.n # done
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ESP32
64 changes: 64 additions & 0 deletions src/s3_simd_dequant.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
//
// ESP32-S3 SIMD optimized code
// Written by Larry Bank
// Copyright (c) 2024 BitBank Software, Inc.
// Project started Jan 21, 2024
//
#ifdef ARDUINO_ARCH_ESP32

#include "dsps_fft2r_platform.h"
#if (dsps_fft2r_sc16_aes3_enabled == 1)
.text
.align 4
//
// s3_dequant - JPEG dequantization of one 8x8 coefficient block (ESP32-S3 SIMD)
//
// C prototype: void s3_dequant(int16_t *pMCU, int16_t *pQuant);
//   a2 = pMCU   : 64 signed 16-bit coefficients, multiplied in place
//   a3 = pQuant : 64 signed 16-bit quantization values (read only)
//
// Every 128-bit Q register holds one row of eight 16-bit values. The block is
// processed as two halves of four rows (64 bytes) each: load, multiply, rewind
// the MCU pointer by 64 bytes, store.
//
// Scratch: a4, SAR, q0-q7. a2/a3 are advanced by 128 bytes on exit.
// NOTE(review): the 128-bit vector loads/stores are expected to require
// 16-byte aligned pointers - confirm that every caller guarantees this.
//
    .global s3_dequant
    .type   s3_dequant,@function

s3_dequant:
    entry           a1,16
    // ee.vmul.s16 right-shifts each product by SAR, so SAR must be 0 to get
    // the plain coeff * quant result. Write the zero in a4, not the pointer.
    movi.n          a4,0
    wsr.sar         a4

    // ---- rows 0-3 ----
    ee.vld.128.ip   q0,a2,16        // MCU rows 0-3 -> q0..q3 (a2 post-incremented)
    ee.vld.128.ip   q4,a3,16        // quant rows 0-3 -> q4..q7 (a3 post-incremented)
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16
    ee.vmul.s16     q0,q0,q4        // de-quantize each row
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7
    // The four loads moved a2 forward by 64 bytes; step back to row 0 so the
    // results overwrite the coefficients they came from. (-64 does not fit
    // the narrow addi.n encoding, hence the wide addi.)
    addi            a2,a2,-64
    ee.vst.128.ip   q0,a2,16        // write back dequantized rows 0-3
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16

    // ---- rows 4-7 ---- (a2 and a3 now both point at row 4)
    ee.vld.128.ip   q0,a2,16        // MCU rows 4-7 -> q0..q3
    ee.vld.128.ip   q4,a3,16        // quant rows 4-7 -> q4..q7
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16
    ee.vmul.s16     q0,q0,q4        // de-quantize rows 4-7
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7
    addi            a2,a2,-64       // rewind to row 4 for the write-back
    ee.vst.128.ip   q0,a2,16        // write back dequantized rows 4-7
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16
    retw.n                          // done
#endif // dsps_fft2r_sc16_aes3_enabled
#endif // ESP32

0 comments on commit 649ee19

Please sign in to comment.