-
-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixed simd options to work on all targets
- Loading branch information
Laurence Bank
authored and
Laurence Bank
committed
Jan 28, 2024
1 parent
445f175
commit 649ee19
Showing
4 changed files
with
71 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
// | ||
// ESP32-S3 SIMD optimized code | ||
// Written by Larry Bank | ||
// Copyright (c) 2024 BitBank Software, Inc. | ||
// Project started Jan 21, 2024 | ||
// | ||
#ifdef ARDUINO_ARCH_ESP32 | ||
|
||
#include "dsps_fft2r_platform.h" | ||
#if (dsps_fft2r_sc16_aes3_enabled == 1) | ||
.text
.align 4
//
// s3_dequant - JPEG de-quantization of one 8x8 coefficient block, in place
//
// C prototype:
//   void s3_dequant(int16_t *pMCU, int16_t *pQuant);
//
// Target: ESP32-S3 (Xtensa LX7 with the PIE "ee.*" SIMD extension)
// ABI:    Xtensa windowed call convention (entry / retw.n)
// In:     a2 = pMCU   - 64 x int16_t coefficients (8 rows of 8), overwritten
//         a3 = pQuant - 64 x int16_t quantization values, read only
// Out:    pMCU[i] = pMCU[i] * pQuant[i]   for i = 0..63
// Uses:   a2, a3, a4, SAR, q0-q7
//
// Each q register holds one row (8 x int16_t = 16 bytes). The block is
// processed as two halves of four rows (64 bytes) each.
//
// NOTE(review): the 128-bit vector loads/stores presumably require both
// pointers to be 16-byte aligned - confirm against the callers.
//
    .global s3_dequant
    .type   s3_dequant,@function

s3_dequant:
// The .frequency directive is intentionally left disabled.
//  .frequency 1.000 0.000
    entry           a1,16

    // ee.vmul.s16 shifts every product right by SAR, so SAR must be 0 to
    // get a plain multiply. Load it from a4 (the zeroed scratch register),
    // never from a2, which holds the MCU pointer.
    movi.n          a4,0
    wsr.sar         a4

    // ---- rows 0-3 ----------------------------------------------------
    // Post-increment loads: afterwards a2 = pMCU + 64, a3 = pQuant + 64.
    ee.vld.128.ip   q0,a2,16        // q0-q3 = MCU rows 0-3
    ee.vld.128.ip   q4,a3,16        // q4-q7 = matching quantization rows
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16

    ee.vmul.s16     q0,q0,q4        // de-quantize each row
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7

    // Rewind a2 by the 64 bytes just read so the results overwrite rows
    // 0-3. Wide addi is used because -64 does not fit the addi.n immediate.
    addi            a2,a2,-64
    ee.vst.128.ip   q0,a2,16        // write back rows 0-3
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16        // a2 = pMCU + 64 again (start of row 4)

    // ---- rows 4-7 ----------------------------------------------------
    ee.vld.128.ip   q0,a2,16        // q0-q3 = MCU rows 4-7
    ee.vld.128.ip   q4,a3,16        // q4-q7 = matching quantization rows
    ee.vld.128.ip   q1,a2,16
    ee.vld.128.ip   q5,a3,16
    ee.vld.128.ip   q2,a2,16
    ee.vld.128.ip   q6,a3,16
    ee.vld.128.ip   q3,a2,16
    ee.vld.128.ip   q7,a3,16

    ee.vmul.s16     q0,q0,q4        // de-quantize each row
    ee.vmul.s16     q1,q1,q5
    ee.vmul.s16     q2,q2,q6
    ee.vmul.s16     q3,q3,q7

    addi            a2,a2,-64       // rewind to row 4 for the write-back
    ee.vst.128.ip   q0,a2,16        // write back rows 4-7
    ee.vst.128.ip   q1,a2,16
    ee.vst.128.ip   q2,a2,16
    ee.vst.128.ip   q3,a2,16

    retw.n                          // done
    .size   s3_dequant, . - s3_dequant
#endif // dsps_fft2r_sc16_aes3_enabled | ||
#endif // ESP32 |