-
Notifications
You must be signed in to change notification settings - Fork 27
/
esl_avx512.h
164 lines (140 loc) · 6.7 KB
/
esl_avx512.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/* Vectorized routines for x86 AVX-512 instructions.
*
* This header file, unusually, provides many complete function
* implementations so they can be inlined by the compiler.
*
* Contents:
* 1. Function declarations for esl_avx512.c
* 2. Inlined functions: horizontal max, sum
* 3. Inlined functions: left, right shift
*/
#ifndef eslAVX512_INCLUDED
#define eslAVX512_INCLUDED
#include <esl_config.h>
#ifdef eslENABLE_AVX512
#include "easel.h"
#include <stdio.h>
#include <x86intrin.h>
/*****************************************************************
* 1. Function declarations for esl_avx512.c
*****************************************************************/
/* Implemented in esl_avx512.c, not in this header. Takes one 512-bit integer
 * vector by value and returns nothing.
 * NOTE(review): the name suggests it prints <v> as 8-bit hex values for
 * debugging -- confirm against the definition in esl_avx512.c.
 */
extern void esl_avx512_dump_512i_hex8(__m512i v);
/*****************************************************************
* 2. Inlined functions: horizontal max, sum
*****************************************************************/
/* Function:  esl_avx512_hmax_epu8()
 * Synopsis:  Horizontal max over the 64 unsigned 8-bit elements of a vector.
 *
 * Purpose:   Reduce <a> to its largest byte, treating every byte as
 *            unsigned. The two 256-bit halves are max'ed together, then
 *            the surviving 256-bit vector is repeatedly max'ed against a
 *            rearranged copy of itself (128-bit, 64-bit, 32-bit, 16-bit
 *            and finally 8-bit distances) so that byte 0 ends up holding
 *            the overall maximum.
 *
 * Returns:   the maximum element, as a <uint8_t>.
 */
static inline uint8_t
esl_avx512_hmax_epu8(__m512i a)
{
  // The tail of the reduction runs on 256-bit AVX registers because AVX-512
  // cannot extract 8-bit quantities. Per the original author's note, Intel has
  // stated there is no performance penalty for mixing AVX-512 and AVX.
  __m256i lo_half = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi_half = _mm512_extracti32x8_epi32(a, 1);
  __m256i best    = _mm256_max_epu8(lo_half, hi_half);
  __m256i moved;

  moved = _mm256_permute2x128_si256(best, best, 0x01);  // swap the two 128-bit lanes
  best  = _mm256_max_epu8(best, moved);

  moved = _mm256_shuffle_epi32(best, 0x4e);             // swap 64-bit halves of each lane
  best  = _mm256_max_epu8(best, moved);

  moved = _mm256_shuffle_epi32(best, 0xb1);             // swap adjacent 32-bit elements
  best  = _mm256_max_epu8(best, moved);

  moved = _mm256_shufflelo_epi16(best, 0xb1);           // swap adjacent 16-bit elements (low 64 bits of each lane)
  best  = _mm256_max_epu8(best, moved);

  moved = _mm256_srli_si256(best, 1);                   // bring byte 1 down onto byte 0
  best  = _mm256_max_epu8(best, moved);

  // The extract intrinsic hands back an int; narrowing to uint8_t keeps the byte.
  return (uint8_t) _mm256_extract_epi8(best, 0);
}
/* Function:  esl_avx512_hmax_epi8()
 * Synopsis:  Horizontal max over the 64 signed 8-bit elements of a vector.
 * Incept:    SRE, Thu May 25 13:20:45 2017 [Old 97's, Oppenheimer]
 *
 * Purpose:   Reduce <a> to its largest byte, treating every byte as
 *            signed. Same folding scheme as the unsigned variant: max the
 *            two 256-bit halves, then keep max'ing the 256-bit vector
 *            against rearranged copies of itself at 128-, 64-, 32-, 16-
 *            and 8-bit distances until byte 0 holds the overall maximum.
 *
 * Returns:   the maximum element, as an <int8_t>.
 */
static inline int8_t
esl_avx512_hmax_epi8(__m512i a)
{
  __m256i lo_half = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi_half = _mm512_extracti32x8_epi32(a, 1);
  __m256i best    = _mm256_max_epi8(lo_half, hi_half);
  __m256i moved;

  moved = _mm256_permute2x128_si256(best, best, 0x01);  // swap the two 128-bit lanes
  best  = _mm256_max_epi8(best, moved);

  moved = _mm256_shuffle_epi32(best, 0x4e);             // swap 64-bit halves of each lane
  best  = _mm256_max_epi8(best, moved);

  moved = _mm256_shuffle_epi32(best, 0xb1);             // swap adjacent 32-bit elements
  best  = _mm256_max_epi8(best, moved);

  moved = _mm256_shufflelo_epi16(best, 0xb1);           // swap adjacent 16-bit elements (low 64 bits of each lane)
  best  = _mm256_max_epi8(best, moved);

  moved = _mm256_srli_si256(best, 1);                   // bring byte 1 down onto byte 0
  best  = _mm256_max_epi8(best, moved);

  // The extract intrinsic hands back an int; narrowing to int8_t recovers the signed byte.
  return (int8_t) _mm256_extract_epi8(best, 0);
}
/* Function:  esl_avx512_hmax_epi16()
 * Synopsis:  Horizontal max over the 32 signed 16-bit elements of a vector.
 *
 * Purpose:   Reduce <a> to its largest 16-bit element, treating every
 *            element as signed. The two 256-bit halves are max'ed
 *            together, then the 256-bit vector is max'ed against
 *            rearranged copies of itself at 128-, 64-, 32- and 16-bit
 *            distances until element 0 holds the overall maximum.
 *
 * Returns:   the maximum element, as an <int16_t>.
 */
static inline int16_t
esl_avx512_hmax_epi16(__m512i a)
{
  __m256i lo_half = _mm512_extracti32x8_epi32(a, 0);
  __m256i hi_half = _mm512_extracti32x8_epi32(a, 1);
  __m256i best    = _mm256_max_epi16(lo_half, hi_half);
  __m256i moved;

  moved = _mm256_permute2x128_si256(best, best, 0x01);  // swap the two 128-bit lanes
  best  = _mm256_max_epi16(best, moved);

  moved = _mm256_shuffle_epi32(best, 0x4e);             // swap 64-bit halves of each lane
  best  = _mm256_max_epi16(best, moved);

  moved = _mm256_shuffle_epi32(best, 0xb1);             // swap adjacent 32-bit elements
  best  = _mm256_max_epi16(best, moved);

  moved = _mm256_shufflelo_epi16(best, 0xb1);           // swap adjacent 16-bit elements (low 64 bits of each lane)
  best  = _mm256_max_epi16(best, moved);

  // The extract intrinsic hands back an int; narrowing to int16_t recovers the signed value.
  return (int16_t) _mm256_extract_epi16(best, 0);
}
/* Function:  esl_avx512_hsum_ps()
 * Synopsis:  Horizontal sum of the 16 floats in an __m512 vector.
 *
 * Purpose:   Add together the sixteen 32-bit float elements of <a> and
 *            store the total in <*ret_sum>.
 *
 *            The reduction is a four-step butterfly. At each step the
 *            running vector is added to a copy of itself with blocks
 *            swapped: 256-bit halves, then 128-bit lanes within each
 *            half, then 64-bit pairs within each lane, then adjacent
 *            floats. After the last add every element holds the sum of
 *            all sixteen inputs; element 0 is the one returned.
 *
 * Args:      a        - vector of 16 floats to sum
 *            ret_sum  - RETURN: the sum; must point to a writable float
 *
 * Returns:   (void)
 */
static inline void
esl_avx512_hsum_ps(__m512 a, float *ret_sum)
{
  __m512 swapped;
  __m512 sums;

  swapped = _mm512_shuffle_f32x4(a, a, 0x4e);        // swap the high and low 256-bit halves
  sums    = _mm512_add_ps(a, swapped);

  swapped = _mm512_shuffle_f32x4(sums, sums, 0xb1);  // swap the two 128-bit lanes inside each half
  sums    = _mm512_add_ps(sums, swapped);

  swapped = _mm512_shuffle_ps(sums, sums, 0x4e);     // swap the two 64-bit pairs inside each lane
  sums    = _mm512_add_ps(sums, swapped);

  swapped = _mm512_shuffle_ps(sums, sums, 0xb1);     // swap adjacent floats
  sums    = _mm512_add_ps(sums, swapped);

  // Read element 0 as a float. Reinterpreting the vector as its low 128 bits
  // and taking the scalar avoids writing the caller's float through an int
  // pointer (a strict-aliasing violation) and needs no vector-type casts.
  *ret_sum = _mm_cvtss_f32(_mm512_castps512_ps128(sums));
}
/*****************************************************************
* 3. Inlined functions: left and right shifts
*****************************************************************/
/* Function:  esl_avx512_rightshift_int8()
 * Synopsis:  Move every int8 element up one position, then OR in a mask.
 * Incept:    SRE, Sun Jun  4 17:43:14 2017
 * See:       esl_sse.h::esl_sse_rightshift_int8()
 *
 * Purpose:   Returns a vector whose element i (i >= 1) is element i-1 of
 *            <v>. Element 0 is zero after the shift, and <neginfmask> is
 *            then OR'ed over the whole result.
 *            NOTE(review): presumably callers pass a mask that holds the
 *            -infinity value in element 0 and zero elsewhere, which is
 *            how "-inf is shifted on" -- confirm against callers.
 *
 * Args:      v          - vector of 64 int8 elements
 *            neginfmask - vector OR'ed into the shifted result
 *
 * Returns:   the shifted-and-masked vector.
 */
static inline __m512i
esl_avx512_rightshift_int8(__m512i v, __m512i neginfmask)
{
  // alignr only shifts inside each 128-bit lane, and (per the original note)
  // there is no permute2x128 to lean on as in the AVX version. So first build
  // a helper whose lanes are [0, v0, v1, v2]: lane i then supplies the byte
  // that has to cross into lane i of the result.
  __m512i carry   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);
  __m512i shifted = _mm512_alignr_epi8(v, carry, 15);

  return _mm512_or_si512(shifted, neginfmask);
}
/* Function:  esl_avx512_rightshift_int16()
 * Synopsis:  Move every int16 element up one position, then OR in a mask.
 * Incept:    SRE, Sun Jun  4 17:49:02 2017
 * See:       esl_sse.h::esl_sse_rightshift_int16()
 *
 * Purpose:   Returns a vector whose element i (i >= 1) is element i-1 of
 *            <v>. Element 0 is zero after the shift, and <neginfmask> is
 *            then OR'ed over the whole result.
 *            NOTE(review): presumably callers pass a mask that holds the
 *            -infinity value in element 0 and zero elsewhere, which is
 *            how "-inf is shifted on" -- confirm against callers.
 *
 * Args:      v          - vector of 32 int16 elements
 *            neginfmask - vector OR'ed into the shifted result
 *
 * Returns:   the shifted-and-masked vector.
 */
static inline __m512i
esl_avx512_rightshift_int16(__m512i v, __m512i neginfmask)
{
  // Helper lanes are [0, v0, v1, v2]; lane i supplies the 16-bit element
  // that crosses into lane i of the result, since alignr works per 128-bit lane.
  __m512i carry   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);
  __m512i shifted = _mm512_alignr_epi8(v, carry, 14);

  return _mm512_or_si512(shifted, neginfmask);
}
/* Function:  esl_avx512_rightshiftz_float()
 * Synopsis:  Move every float element up one position, shifting zero on.
 * Incept:    SRE, Sun Jun  4 17:59:54 2017
 * See:       esl_sse.h::esl_sse_rightshiftz_float()
 *
 * Purpose:   Returns a vector whose element i (i >= 1) is element i-1 of
 *            <v>, and whose element 0 is all-zero bits (+0.0f). The top
 *            element of <v> is dropped. The shift is done on the raw
 *            bits, so values are moved unchanged.
 *
 * Args:      v - vector of 16 floats
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_rightshiftz_float(__m512 v)
{
  // Reinterpret with the cast intrinsics rather than C-style casts between
  // vector types, which are a compiler extension.
  __m512i bits  = _mm512_castps_si512(v);

  // Helper lanes are [0, v0, v1, v2]; lane i supplies the float that crosses
  // into lane i of the result, since alignr works per 128-bit lane.
  __m512i carry = _mm512_maskz_shuffle_i32x4(0xfff0, bits, bits, 0x90);

  return _mm512_castsi512_ps(_mm512_alignr_epi8(bits, carry, 12));
}
/* Function:  esl_avx512_leftshiftz_float()
 * Synopsis:  Move every float element down one position, shifting zero on.
 * Incept:    SRE, Sun Jun  4 18:04:34 2017
 * See:       esl_sse.h::esl_sse_leftshiftz_float()
 *
 * Purpose:   Returns a vector whose element i (i <= 14) is element i+1 of
 *            <v>, and whose element 15 is all-zero bits (+0.0f). Element
 *            0 of <v> is dropped. The shift is done on the raw bits, so
 *            values are moved unchanged.
 *
 * Args:      v - vector of 16 floats
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_leftshiftz_float(__m512 v)
{
  // Reinterpret with the cast intrinsics rather than C-style casts between
  // vector types, which are a compiler extension.
  __m512i bits  = _mm512_castps_si512(v);

  // Helper lanes are [v1, v2, v3, 0]; lane i supplies the float that crosses
  // into the top of lane i of the result, since alignr works per 128-bit lane.
  __m512i carry = _mm512_maskz_shuffle_i32x4(0x0fff, bits, bits, 0x39);

  return _mm512_castsi512_ps(_mm512_alignr_epi8(carry, bits, 4));
}
#endif // eslENABLE_AVX512
#endif // eslAVX512_INCLUDED