Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX512 GCC<9 gather fix #1050

Merged
merged 2 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RUN \
binfmt-support qemu-user-static \
creduce screen htop parallel nano rsync strace \
npm libsleef-dev \
pipx wget
pipx wget curl

# Meson on stable is too old, and we want to make sure we keep 0.55
# working for a while.
Expand All @@ -62,7 +62,7 @@ ENV PATH="/usr/lib/ccache:$PATH:/root/.local/bin:/root/.jsvu/bin"
RUN \
apt-get update -y && \
apt-get upgrade -y && \
apt-get install -yq curl gpg && \
apt-get install -yq gpg && \
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list && \
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN \
binfmt-support qemu-user-static \
creduce screen htop parallel nano rsync strace \
npm libsleef-dev \
pipx wget
pipx wget curl

# Meson on stable is too old, and we want to make sure we keep 0.55
# working for a while.
Expand All @@ -54,7 +54,7 @@ ENV PATH="/usr/lib/ccache:$PATH:/root/.local/bin:/root/.jsvu/bin"
RUN \
apt-get update -y && \
apt-get upgrade -y && \
apt-get install -yq curl gpg && \
apt-get install -yq gpg && \
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" > /etc/apt/sources.list.d/oneAPI.list && \
Expand Down
12 changes: 6 additions & 6 deletions docker/Dockerfile.ubuntu_bionic
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ ENV PATH="/usr/lib/ccache:$PATH:/root/.local/bin:/root/.jsvu/bin"
# /opt/ibm/xlC/${XLC_VERSION}/bin/xlc_configure <<< 1 >/dev/null

# Intel SDE
# COPY test/download-sde.sh /tmp/simde-bin/download-sde.sh
# RUN \
# "/tmp/simde-bin/download-sde.sh" "/opt/intel/sde" && \
# for executable in sde sde64; do \
# ln -s "/opt/intel/sde/${executable}" "/usr/bin/${executable}"; \
# done
COPY test/download-sde.sh /tmp/simde-bin/download-sde.sh
RUN \
"/tmp/simde-bin/download-sde.sh" "/opt/intel/sde" && \
for executable in sde sde64; do \
ln -s "/opt/intel/sde/${executable}" "/usr/bin/${executable}"; \
done

# Emscripten
# RUN \
Expand Down
18 changes: 18 additions & 0 deletions docker/cross-files/intel-all-gcc-8.cross
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[binaries]
c = 'gcc-8'
cpp = 'g++-8'
ar = 'ar'
strip = 'strip'
objcopy = 'objcopy'
ld = 'ld'
exe_wrapper = ['sde64', '-skx', '--']

[properties]
c_args = ['-march=icelake-server', '-Wextra', '-Werror']
cpp_args = ['-march=icelake-server', '-Wextra', '-Werror']

[host_machine]
system = 'linux'
cpu_family = 'x86_64-all'
cpu = 'x86_64'
endian = 'little'
4 changes: 2 additions & 2 deletions simde/x86/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -2269,11 +2269,11 @@ simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, co
return simde__m256_from_private(r_);
}
#if defined(SIMDE_X86_AVX2_NATIVE)
#define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale)
#define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale))
#endif
#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES)
#undef _mm256_i32gather_ps
#define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale)
#define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale))
#endif

SIMDE_FUNCTION_ATTRIBUTES
Expand Down
44 changes: 42 additions & 2 deletions simde/x86/avx512/gather.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "types.h"
#include "../avx2.h"
#include "extract.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
Expand Down Expand Up @@ -65,6 +66,13 @@ simde_mm512_i32gather_ps(simde__m512i vindex, const void* base_addr, const int32
HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[1], (scale)); \
simde__m512_from_private(simde_mm512_i32gather_ps_r_); \
}))
#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm512_i32gather_ps(vindex, base_addr, scale) \
simde_x_mm512_set_m256( \
_mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \
simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \
_mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \
simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) )
#endif
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_i32gather_ps
Expand Down Expand Up @@ -104,6 +112,15 @@ simde_mm512_i64gather_epi32(simde__m512i vindex, const void* base_addr, const in
HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[1], (scale)); \
simde__m256i_from_private(simde_mm512_i64gather_epi32_r_); \
}))
#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm512_i64gather_epi32(vindex, base_addr, scale) \
_mm256_insertf128_si256( \
_mm256_castsi128_si256( \
_mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \
_mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \
1)
#endif
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_i64gather_epi32
Expand Down Expand Up @@ -147,11 +164,18 @@ simde_mm512_i64gather_epi64(simde__m512i vindex, const void* base_addr, const in
simde__m512i_private simde_mm512_i64gather_epi64_r_, \
simde_mm512_i64gather_epi64_vindex_ = simde__m512i_to_private((vindex)); \
simde_mm512_i64gather_epi64_r_.m256i[0] = _mm256_i64gather_epi64( \
HEDLEY_STATIC_CAST(int64_t const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[0], (scale)); \
HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[0], (scale)); \
simde_mm512_i64gather_epi64_r_.m256i[1] = _mm256_i64gather_epi64( \
HEDLEY_STATIC_CAST(int64_t const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[1], (scale)); \
HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[1], (scale)); \
simde__m512i_from_private(simde_mm512_i64gather_epi64_r_); \
}))
#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm512_i64gather_epi64(vindex, base_addr, scale) \
simde_x_mm512_set_m256i( \
_mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \
simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \
_mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \
simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) )
#endif
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_i64gather_epi64
Expand Down Expand Up @@ -201,6 +225,13 @@ simde_mm512_i64gather_pd(simde__m512i vindex, const void* base_addr, const int32
HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[1], (scale)); \
simde__m512d_from_private(simde_mm512_i64gather_pd_r_); \
}))
#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm512_i64gather_pd(vindex, base_addr, scale) \
simde_x_mm512_set_m256d( \
_mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \
_mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 0), (scale)) )
#endif
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_i64gather_pd
Expand Down Expand Up @@ -250,6 +281,15 @@ simde_mm512_i64gather_ps(simde__m512i vindex, const void* base_addr, const int32
HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[1], (scale)); \
simde__m256_from_private(simde_mm512_i64gather_ps_r_); \
}))
#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_)
#define simde_mm512_i64gather_ps(vindex, base_addr, scale) \
_mm256_insertf128_ps( \
_mm256_castps128_ps256( \
_mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \
_mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \
simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \
1)
#endif
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
#undef _mm512_i64gather_ps
Expand Down
34 changes: 33 additions & 1 deletion simde/x86/avx512/set.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,11 +415,27 @@ simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_x_mm512_set_m256 (simde__m256 a, simde__m256 b) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
SIMDE_ALIGN_TO_64 simde__m256 v[] = { b, a };
return simde_mm512_load_ps(HEDLEY_STATIC_CAST(__m512 *, HEDLEY_STATIC_CAST(void *, v)));
#else
simde__m512_private r_;

r_.m256[0] = b;
r_.m256[1] = a;

return simde__m512_from_private(r_);
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
SIMDE_ALIGN_LIKE_32(simde__m256i) simde__m256i v[] = { b, a };
SIMDE_ALIGN_TO_64 simde__m256i v[] = { b, a };
return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v)));
#else
simde__m512i_private r_;
Expand All @@ -431,6 +447,22 @@ simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) {
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_x_mm512_set_m256d (simde__m256d a, simde__m256d b) {
#if defined(SIMDE_X86_AVX512F_NATIVE)
SIMDE_ALIGN_TO_64 simde__m256d v[] = { b, a };
return simde_mm512_load_pd(HEDLEY_STATIC_CAST(__m512d *, HEDLEY_STATIC_CAST(void *, v)));
#else
simde__m512d_private r_;

r_.m256d[0] = b;
r_.m256d[1] = a;

return simde__m512d_from_private(r_);
#endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_set_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12,
Expand Down
Loading