fixed simd options to work on all targets

bitbank2 · Jan 28, 2024 · 649ee19 · 649ee19
1 parent 445f175
commit 649ee19
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 1 deletion.
diff --git a/src/jpeg.inl b/src/jpeg.inl
@@ -38,12 +38,13 @@
 extern "C" {
 void s3_ycbcr_convert_444(uint8_t *pY, uint8_t *pCB, uint8_t *pCR, uint16_t *pOut, int16_t *pConsts, uint8_t ucPixelType);
 void s3_ycbcr_convert_420(uint8_t *pY, uint8_t *pCB, uint8_t *pCR, uint16_t *pOut, int16_t *pConsts, uint8_t ucPixelType);
+void s3_dequant(int16_t *pMCU, int16_t *pQuant);
 }
 int16_t i16_Consts[8] = {0x80, 113, 90, 22, 46, 1,32,2048};
 #endif // S3 SIMD
 #endif // ESP32
 
-#if !defined(HAS_SIMD) && (defined(__arm__) || defined(__arm64__) || defined(__aarch64__))
+#if !defined(HAS_SIMD) && (defined(__arm64__) || defined(__aarch64__))
 #include <arm_neon.h>
 #define HAS_NEON
 #endif

diff --git a/src/s3_simd_420.S b/src/s3_simd_420.S
@@ -4,6 +4,7 @@
 // Copyright (c) 2024 BitBank Software, Inc.
 // Project started Jan 21, 2024
 //
+#ifdef ARDUINO_ARCH_ESP32
 #include "dsps_fft2r_platform.h"
 #if (dsps_fft2r_sc16_aes3_enabled == 1)
  .text
@@ -124,3 +125,4 @@
  bnez.n a8,.convert_420_loop
  retw.n
 #endif // dsps_fft2r_sc16_aes3_enabled
+#endif // ESP32
diff --git a/src/s3_simd_444.S b/src/s3_simd_444.S
@@ -4,6 +4,8 @@
 // Copyright (c) 2024 BitBank Software, Inc.
 // Project started Jan 21, 2024
 //
+#ifdef ARDUINO_ARCH_ESP32
+
 #include "dsps_fft2r_platform.h"
 #if (dsps_fft2r_sc16_aes3_enabled == 1)
  .text
@@ -104,3 +106,4 @@ s3_ycbcr_convert_444:
  ee.vst.128.ip q2,a5,0
  retw.n # done
 #endif // dsps_fft2r_sc16_aes3_enabled
+#endif // ESP32
diff --git a/src/s3_simd_dequant.S b/src/s3_simd_dequant.S
@@ -0,0 +1,64 @@
+//
+// ESP32-S3 SIMD optimized code
+// Written by Larry Bank
+// Copyright (c) 2024 BitBank Software, Inc.
+// Project started Jan 21, 2024
+//
+#ifdef ARDUINO_ARCH_ESP32
+
+#include "dsps_fft2r_platform.h"
+#if (dsps_fft2r_sc16_aes3_enabled == 1)
+ .text
+ .align 4
+//
+// Inverse DCT dequantization for JPEG decompression: multiplies each of the
+// 64 coefficients at pMCU (A2) by the matching entry at pQuant (A3), in place.
+// Call as void s3_dequant(int16_t *pMCU, int16_t *pQuant);
+ .global s3_dequant
+ .type s3_dequant,@function
+// NOTE(review): the 128-bit loads/stores presumably want 16-byte aligned buffers - confirm
+s3_dequant:
+ # ee.vmul.s16 right-shifts each product by SAR, so SAR must be 0 for a plain multiply.
+ # Each load/store post-increments its pointer by 16; a2 is rewound by 64 before storing.
+ entry a1,16
+ ee.vld.128.ip q0,a2,16 # load MCU rows 0-3 into Q0,Q1,Q2,Q3
+ ee.vld.128.ip q4,a3,16 # load quantization values into Q4,Q5,Q6,Q7
+ ee.vld.128.ip q1,a2,16
+ ee.vld.128.ip q5,a3,16
+ ee.vld.128.ip q2,a2,16
+ ee.vld.128.ip q6,a3,16
+ ee.vld.128.ip q3,a2,16
+ ee.vld.128.ip q7,a3,16
+ movi.n a4,0 # shift amount of 0 (no scaling of the products)
+ wsr.sar a4 # put it in the SAR (shift amount register)
+ ee.vmul.s16 q0,q0,q4 # de-quantize each row
+ ee.vmul.s16 q1,q1,q5
+ ee.vmul.s16 q2,q2,q6
+ ee.vmul.s16 q3,q3,q7
+ addi a2,a2,-64 # rewind to row 0 of the MCU to store dequantized values
+ ee.vst.128.ip q0,a2,16 # write back dequantized rows 0-3
+ ee.vst.128.ip q1,a2,16
+ ee.vst.128.ip q2,a2,16
+ ee.vst.128.ip q3,a2,16
+// repeat for rows 4-7 (a2 now points at row 4, a3 at the matching quant row)
+ ee.vld.128.ip q0,a2,16 # load MCU rows 4-7 into Q0,Q1,Q2,Q3
+ ee.vld.128.ip q4,a3,16 # load quantization values into Q4,Q5,Q6,Q7
+ ee.vld.128.ip q1,a2,16
+ ee.vld.128.ip q5,a3,16
+ ee.vld.128.ip q2,a2,16
+ ee.vld.128.ip q6,a3,16
+ ee.vld.128.ip q3,a2,16
+ ee.vld.128.ip q7,a3,16
+
+ ee.vmul.s16 q0,q0,q4 # de-quantize rows 4-7
+ ee.vmul.s16 q1,q1,q5
+ ee.vmul.s16 q2,q2,q6
+ ee.vmul.s16 q3,q3,q7
+ addi a2,a2,-64 # rewind to row 4 of the MCU
+ ee.vst.128.ip q0,a2,16 # write back dequantized rows 4-7
+ ee.vst.128.ip q1,a2,16
+ ee.vst.128.ip q2,a2,16
+ ee.vst.128.ip q3,a2,16
+ retw.n # done
+#endif // dsps_fft2r_sc16_aes3_enabled
+#endif // ESP32