From 1898b14e9c390944519ca3cea17fff268aac98fd Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Thu, 15 Nov 2018 10:34:42 +0200 Subject: [PATCH 1/4] issue: 1557652 Update atomic operations - Removed not used operations - Refactored organization of atomic code - Introduced atomic_store_explicit/atomic_load_explicit - Fixed atomic_read/atomic_set Signed-off-by: Igor Ivanov --- src/utils/asm-arm64.h | 10 --- src/utils/asm-ppc64.h | 17 ----- src/utils/asm-x86.h | 44 ++--------- src/utils/asm.h | 10 +++ src/utils/atomic.h | 168 +++++++++++++++++++++++++++++------------- 5 files changed, 131 insertions(+), 118 deletions(-) diff --git a/src/utils/asm-arm64.h b/src/utils/asm-arm64.h index ce28cedda..e3a845448 100644 --- a/src/utils/asm-arm64.h +++ b/src/utils/asm-arm64.h @@ -52,16 +52,6 @@ #define wmb() asm volatile("dsb st" ::: "memory") #define wc_wmb() wmb() -/** - * Add to the atomic variable. - * @param i integer value to add. - * @param v pointer of type atomic_t. - * @return Value before add. - */ -static inline int atomic_fetch_and_add(int i, volatile int *ptr) -{ - return __atomic_fetch_add(ptr, i, __ATOMIC_ACQUIRE); -} /** * Read RDTSC register diff --git a/src/utils/asm-ppc64.h b/src/utils/asm-ppc64.h index c4cda5762..336d6d46a 100644 --- a/src/utils/asm-ppc64.h +++ b/src/utils/asm-ppc64.h @@ -52,21 +52,6 @@ #define wmb() rmb() #define wc_wmb() mb() -/** - * Add to the atomic variable. - * @param i integer value to add. - * @param v pointer of type atomic_t. - * @return Value before add. 
- */ -static inline int atomic_fetch_and_add(int i, volatile int *ptr) -{ -#ifdef __ATOMIC_ACQUIRE - return __atomic_fetch_add(ptr, i, __ATOMIC_ACQUIRE); -#else - return __sync_fetch_and_add(ptr, i); -#endif -} - /** * Read RDTSC register @@ -97,6 +82,4 @@ static inline void prefetch_range(void *addr, size_t len) prefetch(cp); } - - #endif diff --git a/src/utils/asm-x86.h b/src/utils/asm-x86.h index 9880a9753..f14880e04 100644 --- a/src/utils/asm-x86.h +++ b/src/utils/asm-x86.h @@ -36,9 +36,7 @@ #include #include -#include "utils/bullseye.h" -#define __xg(x) ((volatile long *)(x)) #define mb() asm volatile("" ::: "memory") #define rmb() mb() @@ -59,36 +57,6 @@ dst += 8; \ src += 8 -#if _BullseyeCoverage - #pragma BullseyeCoverage off -#endif -/** - * Atomic swap - */ -static inline unsigned long xchg(unsigned long x, volatile void *ptr) -{ - __asm__ __volatile__("xchg %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - return x; -} - -/** - * Atomic compare-and-swap - */ -static inline bool cmpxchg(unsigned long old_value, unsigned long new_value, volatile void *ptr) -{ - unsigned long prev_value = old_value; - __asm__ __volatile__("lock; cmpxchg %1,%2" - : "=a"(prev_value) - : "r"(new_value), "m"(*__xg(ptr)), "0"(old_value) - : "memory"); - return prev_value == old_value; -} -#if _BullseyeCoverage - #pragma BullseyeCoverage on -#endif /** * Add to the atomic variable. @@ -96,13 +64,15 @@ static inline bool cmpxchg(unsigned long old_value, unsigned long new_value, vol * @param v pointer of type atomic_t. * @return Value before add. 
*/ -static inline int atomic_fetch_and_add(int x, volatile int *ptr) +#define __vma_atomic_fetch_add_explicit __x86_atomic_fetch_and_add +static inline int __x86_atomic_fetch_and_add(atomic_t *obj, int val, int order) { + (void)order; __asm__ __volatile__("lock; xaddl %0,%1" - : "=r"(x) - : "m"(*ptr), "0"(x) + : "=r"(val) + : "m"(obj->value), "0"(val) : "memory"); - return x; + return val; } /** @@ -147,6 +117,4 @@ static inline void prefetch_range(void *addr, size_t len) prefetch(cp); } - - #endif diff --git a/src/utils/asm.h b/src/utils/asm.h index 78530cc42..d9353cd57 100644 --- a/src/utils/asm.h +++ b/src/utils/asm.h @@ -34,6 +34,16 @@ #ifndef ASM_H_ #define ASM_H_ +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +#define vma_atomic_type(_type) \ + struct { volatile __typeof__(_type) value; } + +typedef vma_atomic_type(int) atomic_int; +typedef atomic_int atomic_t; + #if defined(__aarch64__) #include "asm-arm64.h" #elif defined(__powerpc64__) diff --git a/src/utils/atomic.h b/src/utils/atomic.h index 6ba552d36..e34d2e110 100644 --- a/src/utils/atomic.h +++ b/src/utils/atomic.h @@ -35,89 +35,151 @@ #define ATOMIC_H_ #include "asm.h" -#include "utils/bullseye.h" -struct atomic_t { - __volatile__ int counter; + +/* + * C++11 memory model + */ +#ifndef __ATOMIC_RELAXED +#define __ATOMIC_RELAXED 0 +#endif +#ifndef __ATOMIC_CONSUME +#define __ATOMIC_CONSUME 1 +#endif +#ifndef __ATOMIC_ACQUIRE +#define __ATOMIC_ACQUIRE 2 +#endif +#ifndef __ATOMIC_RELEASE +#define __ATOMIC_RELEASE 3 +#endif +#ifndef __ATOMIC_ACQ_REL +#define __ATOMIC_ACQ_REL 4 +#endif +#ifndef __ATOMIC_SEQ_CST +#define __ATOMIC_SEQ_CST 5 +#endif + +enum memory_order { + /* memory_order_relaxed: + * Only atomicity is provided there are no constraints on reordering of memory + * accesses around the atomic variable. 
+	 */
+	memory_order_relaxed = __ATOMIC_RELAXED,
+	memory_order_consume = __ATOMIC_CONSUME,
+	/* memory_order_acquire:
+	 * No reads in the current thread can be reordered before this load.
+	 * This ensures that all writes in other threads that release the same atomic variable
+	 * are visible in the current thread.
+	 */
+	memory_order_acquire = __ATOMIC_ACQUIRE,
+	memory_order_release = __ATOMIC_RELEASE,
+	memory_order_acq_rel = __ATOMIC_ACQ_REL,
+	/* memory_order_seq_cst:
+	 * Enforces total ordering. The operation has the same semantics as acquire-release operation
+	 * (memory_order_acq_rel), and additionally has sequentially-consistent operation ordering.
+	 */
+	memory_order_seq_cst = __ATOMIC_SEQ_CST
 };
 
 #define ATOMIC_INIT(i)	{ (i) }
 
+#ifndef __vma_atomic_fetch_add_explicit
+	#if defined(__ATOMIC_RELAXED)
+		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
+			__atomic_fetch_add(&(_obj)->value, _operand, _order)
+	#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
+			__sync_fetch_and_add(&(_obj)->value, _operand)
+	#else
+		#error "atomic_fetch_add_explicit() is not supported"
+	#endif
+#else
+	#define atomic_fetch_add_explicit __vma_atomic_fetch_add_explicit
+#endif /* __vma_atomic_fetch_add_explicit */
+
+
 /**
- * Read atomic variable.
- * @param v pointer of type atomic_t
- * @return Value of the atomic.
- *
- * Atomically reads the value of @v.
+ * Atomically stores 'value' into '*object', respecting the given memory order.
+ * @param _obj pointer of type atomic_t.
+ * @param _val required value.
+ * @param _order memory order. 
*/ -#define atomic_read(v) ((v)->counter) +#ifndef __vma_atomic_store_explicit + #if defined(__ATOMIC_RELAXED) + #define atomic_store_explicit(_obj, _val, _order) \ + __atomic_store_n(&(_obj)->value, (_val), (_order)) + #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) + #define atomic_store_explicit(_obj, _val, _order) \ + do { \ + __sync_synchronize(); \ + (_obj)->value = (_val); \ + __sync_synchronize(); \ + } \ + while (0) + #else + #error "atomic_store_explicit() is not supported" + #endif +#else + #define atomic_store_explicit __vma_atomic_store_explicit +#endif /* atomic_store_explicit */ /** - * Set atomic variable. - * @param v pointer of type atomic_t. - * @param i required value. + * Atomically loads 'value' from '*object', respecting the given memory order. + * @param _obj pointer of type atomic_t. + * @param _order memory order. + * @return Value before add. */ -#define atomic_set(v,i) (((v)->counter) = (i)) - -#if 0 +#ifndef __vma_atomic_load_explicit + #if defined(__ATOMIC_RELAXED) + #define atomic_load_explicit(_obj, _order) \ + __atomic_load_n(&(_obj)->value, (_order)) + #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) + #define atomic_load_explicit(_obj, _order) \ + __sync_fetch_and_add(&(object)->value, 0) + #else + #error "atomic_load_explicit() is not supported" + #endif +#else + #define atomic_load_explicit __vma_atomic_load_explicit +#endif /* atomic_load_explicit */ /** - * Returns current contents of addr and replaces contents with value. - * @param value Values to set. - * @param addr Address to set. - * @return Previous value of *addr. + * Read atomic variable. + * @param _obj pointer of type atomic_t + * @return Value of the atomic. + * + * Atomically reads the value of @v. 
*/ -template -static inline T atomic_swap(T new_value, T *addr) -{ - return (T)xchg((unsigned long)new_value, (void*)addr); -} - +#define atomic_read(_obj) \ + atomic_load_explicit((_obj), memory_order_relaxed) /** - * Replaces *addr with new_value if it equals old_value. - * @param old_value Expected value. - * @param new_value Value to set. - * @param addr Address to set. - * @return true if was set, false if not. + * Set atomic variable. + * @param _obj pointer of type atomic_t. + * @param _val required value. */ -template -static bool atomic_cas(T old_value, T new_value, T *addr) -{ - return cmpxchg((unsigned long)old_value, (unsigned long)new_value, (void*)addr); -} -#if _BullseyeCoverage - #pragma BullseyeCoverage on -#endif - -#endif +#define atomic_set(_obj, _val) \ + atomic_store_explicit((_obj), (_val), memory_order_relaxed) /** * Add to the atomic variable. * @param i integer value to add. - * @param v pointer of type atomic_t. + * @param obj pointer of type atomic_t. * @return Value before add. */ -static inline int atomic_fetch_and_inc(atomic_t *v) +static inline int atomic_fetch_and_inc(atomic_t *obj) { - return atomic_fetch_and_add(1, &v->counter); + return atomic_fetch_add_explicit(obj, 1, memory_order_acquire); } -#if _BullseyeCoverage - #pragma BullseyeCoverage off -#endif /** * Add to the atomic variable. * @param i integer value to add. - * @param v pointer of type atomic_t. + * @param obj pointer of type atomic_t. * @return Value before add. 
*/ -static inline int atomic_fetch_and_dec(atomic_t *v) +static inline int atomic_fetch_and_dec(atomic_t *obj) { - return atomic_fetch_and_add(-1, &v->counter); + return atomic_fetch_add_explicit(obj, -1, memory_order_acquire); } -#if _BullseyeCoverage - #pragma BullseyeCoverage on -#endif - #endif /* ATOMIC_H_ */ From c75f756e6c90b195867bfb2e72153a3b32573c48 Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Thu, 15 Nov 2018 15:01:38 +0200 Subject: [PATCH 2/4] issue: 1557652 Introduce compiler.m4 Use no optimization for asm functions Signed-off-by: Igor Ivanov --- config/m4/compiler.m4 | 53 +++++++++++++++++++++++++++++++++++++++++ configure.ac | 18 ++------------ src/utils/Makefile.am | 3 ++- src/utils/compiler.h | 43 +++++++++++++++++++++++++++++++++ src/vma/util/sys_vars.h | 5 ++-- 5 files changed, 103 insertions(+), 19 deletions(-) create mode 100644 config/m4/compiler.m4 create mode 100644 src/utils/compiler.h diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 new file mode 100644 index 000000000..76b1a1db2 --- /dev/null +++ b/config/m4/compiler.m4 @@ -0,0 +1,53 @@ +# compiler.m4 - Parsing compiler capabilities +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + + +# Check compiler specific attributes +# Usage: CHECK_COMPILER_ATTRIBUTE([attribute], [program], [definition]) +# Note: +# - [definition] can be omitted if it is equal to attribute +# +AC_DEFUN([CHECK_COMPILER_ATTRIBUTE], [ + AC_CACHE_VAL(vma_cv_attribute_[$1], [ + # + # Try to compile using the C compiler + # + AC_TRY_COMPILE([$2],[], + [vma_cv_attribute_$1=yes], + [vma_cv_attribute_$1=no]) + AS_IF([test "x$vma_cv_attribute_$1" = "xyes"], [ + AC_LANG_PUSH(C++) + AC_TRY_COMPILE([extern "C" { + $2 + }],[], + [vma_cv_attribute_$1=yes], + [vma_cv_attribute_$1=no]) + AC_LANG_POP(C++) + ]) + ]) + + AC_MSG_CHECKING([for attribute $1]) + AC_MSG_RESULT([$vma_cv_attribute_$1]) + AS_IF([test "x$vma_cv_attribute_$1" = "xyes"], [ + AS_IF([test "x$3" = "x"], + [AC_DEFINE_UNQUOTED([DEFINED_$1], [1], [Define to 1 if attribute $1 is supported])], + [AC_DEFINE_UNQUOTED([DEFINED_$3], [1], [Define to 1 if attribute $1 is supported])] + ) + ]) +]) + + + +########################## +# Set compiler capabilities +# +AC_DEFUN([COMPILER_CAPABILITY_SETUP], +[ + +CHECK_COMPILER_ATTRIBUTE([optimize], + [int foo (int arg) __attribute__ ((optimize("O0")));], + [ATTRIBUTE_OPTIMIZE]) +]) diff --git a/configure.ac b/configure.ac index 72abb6690..fd4dce224 100644 --- a/configure.ac +++ b/configure.ac @@ -88,6 +88,7 @@ m4_include([config/m4/opt.m4]) m4_include([config/m4/verbs.m4]) m4_include([config/m4/nl.m4]) m4_include([config/m4/prof.m4]) +m4_include([config/m4/compiler.m4]) FUNC_CONFIGURE_INIT() @@ -163,6 +164,7 @@ case $CC in ;; esac +COMPILER_CAPABILITY_SETUP() dnl===-----------------------------------------------------------------------=== dnl=== @@ -376,22 +378,6 @@ dnl=== dnl===-----------------------------------------------------------------------=== show_section_title "Check for functions, types and structures" -# Does this compiler have built-in functions for atomic memory access? 
-AC_MSG_CHECKING([for atomic memory access (__sync_bool_compare_and_swap) support]) -AC_TRY_LINK(, -[ - int variable = 1; - return (__sync_bool_compare_and_swap(&variable, 1, 2) - && __sync_add_and_fetch(&variable, 1)) ? 1 : 0; -], -[ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Define to 1 if gcc supports __sync_bool_compare_and_swap() a.o.]) -], -[ - AC_MSG_RESULT([no]) -]) - AC_MSG_CHECKING([for SOF_TIMESTAMPING_SOFTWARE support]) AC_TRY_LINK( #include diff --git a/src/utils/Makefile.am b/src/utils/Makefile.am index 5808729ab..5ec3a3968 100644 --- a/src/utils/Makefile.am +++ b/src/utils/Makefile.am @@ -13,7 +13,8 @@ libutils_la_SOURCES = \ clock.h \ lock_wrapper.h \ rdtsc.h \ - types.h + types.h \ + compiler.h noinst_PROGRAMS = timetest timetest_LDADD = -lrt libutils.la diff --git a/src/utils/compiler.h b/src/utils/compiler.h new file mode 100644 index 000000000..b97de24ec --- /dev/null +++ b/src/utils/compiler.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2001-2018 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef SRC_UTILS_COMPILER_H_ +#define SRC_UTILS_COMPILER_H_ + +#if defined(DEFINED_ATTRIBUTE_OPTIMIZE) + #define VMA_ATTRIBUTE_OPTIMIZE_NONE __attribute__((optimize("O0"))) +#else + #define VMA_ATTRIBUTE_OPTIMIZE_NONE +#endif + +#endif /* SRC_UTILS_COMPILER_H_ */ diff --git a/src/vma/util/sys_vars.h b/src/vma/util/sys_vars.h index ed4bb71b8..6578d0c39 100644 --- a/src/vma/util/sys_vars.h +++ b/src/vma/util/sys_vars.h @@ -42,6 +42,7 @@ #include "vtypes.h" #include "config.h" +#include "utils/compiler.h" #include "vma/ib/base/verbs_extra.h" #include "vma/util/sysctl_reader.h" #include "vma/vma_extra.h" @@ -429,8 +430,8 @@ struct mce_sys_var { int env_to_cpuset(char *orig_start, cpu_set_t *cpu_set); void read_env_variable_with_pid(char* mce_sys_name, size_t mce_sys_max_size, char* env_ptr); bool check_cpuinfo_flag(const char* flag); - bool cpuid_hv(); - const char* cpuid_hv_vendor(); + bool cpuid_hv() VMA_ATTRIBUTE_OPTIMIZE_NONE; + const char* cpuid_hv_vendor() VMA_ATTRIBUTE_OPTIMIZE_NONE; void read_hv(); // prevent unautothrized creation of objects From 90ba94a47c35060c088af732a5e7949a5233a1ab Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Thu, 15 Nov 2018 18:29:55 +0200 Subject: [PATCH 3/4] issue: 1557652 Improve memory copy for Blue Flame usage Use write to BF operation based on CPU command set. 
Signed-off-by: Igor Ivanov --- src/utils/asm-arm64.h | 12 --- src/utils/asm-ppc64.h | 9 -- src/utils/asm-x86.h | 146 ++++++++++++++++++++++++++++---- src/utils/asm.h | 16 ++++ src/utils/atomic.h | 58 +++++++------ src/vma/dev/qp_mgr_eth_mlx5.cpp | 5 +- src/vma/proto/mem_buf_desc.h | 2 +- 7 files changed, 182 insertions(+), 66 deletions(-) diff --git a/src/utils/asm-arm64.h b/src/utils/asm-arm64.h index e3a845448..12fe2c93b 100644 --- a/src/utils/asm-arm64.h +++ b/src/utils/asm-arm64.h @@ -37,22 +37,12 @@ #include #include -#define COPY_64B_NT(dst, src) \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++ #define mb() asm volatile("dsb sy" ::: "memory") #define rmb() asm volatile("dsb ld" ::: "memory") #define wmb() asm volatile("dsb st" ::: "memory") #define wc_wmb() wmb() - /** * Read RDTSC register */ @@ -84,6 +74,4 @@ static inline void prefetch_range(void *addr, size_t len) prefetch(cp); } - - #endif diff --git a/src/utils/asm-ppc64.h b/src/utils/asm-ppc64.h index 336d6d46a..32b571130 100644 --- a/src/utils/asm-ppc64.h +++ b/src/utils/asm-ppc64.h @@ -37,15 +37,6 @@ #include #include -#define COPY_64B_NT(dst, src) \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++; \ - *dst++ = *src++ #define mb() asm volatile("sync" ::: "memory") #define rmb() asm volatile("lwsync" ::: "memory") diff --git a/src/utils/asm-x86.h b/src/utils/asm-x86.h index f14880e04..0a0a9381c 100644 --- a/src/utils/asm-x86.h +++ b/src/utils/asm-x86.h @@ -43,21 +43,6 @@ #define wmb() asm volatile("" ::: "memory") #define wc_wmb() asm volatile("sfence" ::: "memory") -#define COPY_64B_NT(dst, src) \ - __asm__ __volatile__ ( \ - " movdqa (%1),%%xmm0\n" \ - " movdqa 16(%1),%%xmm1\n" \ - " movdqa 32(%1),%%xmm2\n" \ - " movdqa 48(%1),%%xmm3\n" \ - " movntdq %%xmm0, (%0)\n" \ - " movntdq %%xmm1, 
16(%0)\n" \ - " movntdq %%xmm2, 32(%0)\n" \ - " movntdq %%xmm3, 48(%0)\n" \ - : : "r" (dst), "r" (src) : "memory"); \ - dst += 8; \ - src += 8 - - /** * Add to the atomic variable. * @param i integer value to add. @@ -117,4 +102,135 @@ static inline void prefetch_range(void *addr, size_t len) prefetch(cp); } +enum { + CPU_FLAG_CMOV = (1 << 0), + CPU_FLAG_MMX = (1 << 1), + CPU_FLAG_MMX2 = (1 << 2), + CPU_FLAG_SSE = (1 << 3), + CPU_FLAG_SSE2 = (1 << 4), + CPU_FLAG_SSE3 = (1 << 5), + CPU_FLAG_SSSE3 = (1 << 6), + CPU_FLAG_SSE41 = (1 << 7), + CPU_FLAG_SSE42 = (1 << 8), + CPU_FLAG_AVX = (1 << 9), + CPU_FLAG_AVX2 = (1 << 10) +}; + +#define X86_CPUID_GET_MODEL 0x00000001u +#define X86_CPUID_GET_BASE_VALUE 0x00000000u +#define X86_CPUID_GET_EXTD_VALUE 0x00000007u +#define X86_CPUID_GET_MAX_VALUE 0x80000000u + +VMA_ATTRIBUTE_OPTIMIZE_NONE + static inline void __x86_cpuid(uint32_t level, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) +{ + asm volatile ("cpuid\n\t" + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) + : "0" (level)); +} + +/* This allows the CPU detection to work with assemblers not supporting + * the xgetbv mnemonic. 
+ */ +#define __x86_xgetbv(_index, _eax, _edx) \ + asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index)) + +/** + * Read CPU instruction set + */ +VMA_ATTRIBUTE_OPTIMIZE_NONE + static inline int cpuid_flags() +{ + static int cpu_flag = -1; + + if (cpu_flag < 0) { + uint32_t result = 0; + uint32_t base_value; + uint32_t _eax, _ebx, _ecx, _edx; + + __x86_cpuid(X86_CPUID_GET_BASE_VALUE, &_eax, &_ebx, &_ecx, &_edx); + base_value = _eax; + + if (base_value >= 1) { + __x86_cpuid(X86_CPUID_GET_MODEL, &_eax, &_ebx, &_ecx, &_edx); + if (_edx & (1 << 15)) { + result |= CPU_FLAG_CMOV; + } + if (_edx & (1 << 23)) { + result |= CPU_FLAG_MMX; + } + if (_edx & (1 << 25)) { + result |= CPU_FLAG_MMX2; + } + if (_edx & (1 << 25)) { + result |= CPU_FLAG_SSE; + } + if (_edx & (1 << 26)) { + result |= CPU_FLAG_SSE2; + } + if (_ecx & 1) { + result |= CPU_FLAG_SSE3; + } + if (_ecx & (1 << 9)) { + result |= CPU_FLAG_SSSE3; + } + if (_ecx & (1 << 19)) { + result |= CPU_FLAG_SSE41; + } + if (_ecx & (1 << 20)) { + result |= CPU_FLAG_SSE42; + } + if ((_ecx & 0x18000000) == 0x18000000) { + __x86_xgetbv(0, _eax, _edx); + if ((_eax & 0x6) == 0x6) { + result |= CPU_FLAG_AVX; + } + } + } + if (base_value >= 7) { + __x86_cpuid(X86_CPUID_GET_EXTD_VALUE, &_eax, &_ebx, &_ecx, &_edx); + if ((result & CPU_FLAG_AVX) && (_ebx & (1 << 5))) { + result |= CPU_FLAG_AVX2; + } + } + cpu_flag = result; + } + + return cpu_flag; +} + +#define __vma_memory_copy64(_dst, _src) \ +{ \ + static int is_wc_simd = cpuid_flags() & \ + (CPU_FLAG_SSE3 | CPU_FLAG_SSSE3 | \ + CPU_FLAG_SSE41 | CPU_FLAG_SSE42 | \ + CPU_FLAG_AVX | CPU_FLAG_AVX2); \ + \ + if (is_wc_simd) { \ + __asm__ __volatile__ ( \ + " movdqa (%1), %%xmm0\n" \ + " movdqa 16(%1), %%xmm1\n" \ + " movdqa 32(%1), %%xmm2\n" \ + " movdqa 48(%1), %%xmm3\n" \ + \ + " movntdq %%xmm0, (%0)\n" \ + " movntdq %%xmm1, 16(%0)\n" \ + " movntdq %%xmm2, 32(%0)\n" \ + " movntdq %%xmm3, 48(%0)\n" \ + : : "r" (_dst), "r" (_src) : "memory"); \ + _dst += 8; 
\ + _src += 8; \ + } else { \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + *_dst++ = *_src++; \ + } \ +} + #endif diff --git a/src/utils/asm.h b/src/utils/asm.h index d9353cd57..4d8421105 100644 --- a/src/utils/asm.h +++ b/src/utils/asm.h @@ -34,6 +34,8 @@ #ifndef ASM_H_ #define ASM_H_ +#include "utils/compiler.h" + #ifndef __has_builtin #define __has_builtin(x) 0 #endif @@ -54,4 +56,18 @@ typedef atomic_int atomic_t; #error No architecture specific memory barrier definitions found! #endif +#ifndef __vma_memory_copy64 + #define memory_copy64(dst, src) \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++ +#else + #define memory_copy64 __vma_memory_copy64 +#endif /* atomic_load_explicit */ + #endif diff --git a/src/utils/atomic.h b/src/utils/atomic.h index e34d2e110..62f547fd9 100644 --- a/src/utils/atomic.h +++ b/src/utils/atomic.h @@ -36,29 +36,33 @@ #include "asm.h" +#if defined(__clang__) && __has_builtin(__atomic_load_n) \ + && __has_builtin(__atomic_store_n) \ + && __has_builtin(__atomic_add_fetch) \ + && __has_builtin(__atomic_exchange_n) \ + && __has_builtin(__atomic_compare_exchange_n) \ + && defined(__ATOMIC_RELAXED) \ + && defined(__ATOMIC_CONSUME) \ + && defined(__ATOMIC_ACQUIRE) \ + && defined(__ATOMIC_RELEASE) \ + && defined(__ATOMIC_ACQ_REL) \ + && defined(__ATOMIC_SEQ_CST) + #define USE_BUILTIN_ATOMIC +#elif defined(__GNUC__) && \ + ((__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 7)) + #define USE_BUILTIN_ATOMIC +#else + #define __ATOMIC_RELAXED 0 + #define __ATOMIC_CONSUME 1 + #define __ATOMIC_ACQUIRE 2 + #define __ATOMIC_RELEASE 3 + #define __ATOMIC_ACQ_REL 4 + #define __ATOMIC_SEQ_CST 5 +#endif /* * C++11 memory model */ -#ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#endif -#ifndef __ATOMIC_CONSUME 
-#define __ATOMIC_CONSUME 1
-#endif
-#ifndef __ATOMIC_ACQUIRE
-#define __ATOMIC_ACQUIRE 2
-#endif
-#ifndef __ATOMIC_RELEASE
-#define __ATOMIC_RELEASE 3
-#endif
-#ifndef __ATOMIC_ACQ_REL
-#define __ATOMIC_ACQ_REL 4
-#endif
-#ifndef __ATOMIC_SEQ_CST
-#define __ATOMIC_SEQ_CST 5
-#endif
-
 enum memory_order {
 	/* memory_order_relaxed:
 	 * Only atomicity is provided there are no constraints on reordering of memory
@@ -84,11 +88,11 @@ enum memory_order {
 #define ATOMIC_INIT(i)	{ (i) }
 
 #ifndef __vma_atomic_fetch_add_explicit
-	#if defined(__ATOMIC_RELAXED)
-		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
+	#if defined(USE_BUILTIN_ATOMIC)
+		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
 			__atomic_fetch_add(&(_obj)->value, _operand, _order)
 	#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
-		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
+		#define atomic_fetch_add_explicit(_obj, _operand, _order)	\
 			__sync_fetch_and_add(&(_obj)->value, _operand)
 	#else
 		#error "atomic_fetch_add_explicit() is not supported"
@@ -105,7 +109,7 @@ enum memory_order {
  * @param _order memory order.
  */
 #ifndef __vma_atomic_store_explicit
-	#if defined(__ATOMIC_RELAXED)
+	#if defined(USE_BUILTIN_ATOMIC)
 		#define atomic_store_explicit(_obj, _val, _order) \
 			__atomic_store_n(&(_obj)->value, (_val), (_order))
 	#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
@@ -130,12 +134,12 @@ enum memory_order {
  * @return Value before add. 
*/ #ifndef __vma_atomic_load_explicit - #if defined(__ATOMIC_RELAXED) - #define atomic_load_explicit(_obj, _order) \ + #if defined(USE_BUILTIN_ATOMIC) + #define atomic_load_explicit(_obj, _order) \ __atomic_load_n(&(_obj)->value, (_order)) #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) - #define atomic_load_explicit(_obj, _order) \ - __sync_fetch_and_add(&(object)->value, 0) + #define atomic_load_explicit(_obj, _order) \ + __sync_fetch_and_add(&(_obj)->value, 0) #else #error "atomic_load_explicit() is not supported" #endif diff --git a/src/vma/dev/qp_mgr_eth_mlx5.cpp b/src/vma/dev/qp_mgr_eth_mlx5.cpp index f6dfaaeca..5408dac96 100644 --- a/src/vma/dev/qp_mgr_eth_mlx5.cpp +++ b/src/vma/dev/qp_mgr_eth_mlx5.cpp @@ -34,6 +34,7 @@ #if defined(DEFINED_DIRECT_VERBS) #include +#include "utils/asm.h" #include "cq_mgr_mlx5.h" #include "vma/util/utils.h" #include "vlogger/vlogger.h" @@ -323,11 +324,11 @@ inline void qp_mgr_eth_mlx5::ring_doorbell(uint64_t* wqe, int num_wqebb, int num * which do not guarantee order of copying. 
*/ while (num_wqebb--) { - COPY_64B_NT(dst, src); + memory_copy64(dst, src); } src = (uint64_t*)m_sq_wqes; while (num_wqebb_top--) { - COPY_64B_NT(dst, src); + memory_copy64(dst, src); } } else { *dst = *src; diff --git a/src/vma/proto/mem_buf_desc.h b/src/vma/proto/mem_buf_desc.h index ec1915090..4107ea39f 100644 --- a/src/vma/proto/mem_buf_desc.h +++ b/src/vma/proto/mem_buf_desc.h @@ -130,7 +130,7 @@ class mem_buf_desc_t { // Rx: cq_mgr owns the mem_buf_desc and the associated data buffer ring_slave* p_desc_owner; - inline int get_ref_count() const {return atomic_read(&n_ref_count);} + inline int get_ref_count() {return atomic_read(&n_ref_count);} inline void reset_ref_count() {atomic_set(&n_ref_count, 0);} inline int inc_ref_count() {return atomic_fetch_and_inc(&n_ref_count);} inline int dec_ref_count() {return atomic_fetch_and_dec(&n_ref_count);} From 84bcc053feec511ea888c3536b6cbe7a0b9ab008 Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Mon, 19 Nov 2018 13:23:06 +0200 Subject: [PATCH 4/4] issue: 1557652 BF is supported for upstream on VMs This reverts commit cf04c899da65f10d9fde1e2c1fca29ab063a1099. 
--- src/utils/asm-x86.h | 4 +--- src/vma/dev/qp_mgr_eth_mlx5.cpp | 7 ------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/utils/asm-x86.h b/src/utils/asm-x86.h index 0a0a9381c..5d0385412 100644 --- a/src/utils/asm-x86.h +++ b/src/utils/asm-x86.h @@ -203,9 +203,7 @@ VMA_ATTRIBUTE_OPTIMIZE_NONE #define __vma_memory_copy64(_dst, _src) \ { \ static int is_wc_simd = cpuid_flags() & \ - (CPU_FLAG_SSE3 | CPU_FLAG_SSSE3 | \ - CPU_FLAG_SSE41 | CPU_FLAG_SSE42 | \ - CPU_FLAG_AVX | CPU_FLAG_AVX2); \ + (CPU_FLAG_SSE41 | CPU_FLAG_SSE42); \ \ if (is_wc_simd) { \ __asm__ __volatile__ ( \ diff --git a/src/vma/dev/qp_mgr_eth_mlx5.cpp b/src/vma/dev/qp_mgr_eth_mlx5.cpp index 5408dac96..7a6d5db3d 100644 --- a/src/vma/dev/qp_mgr_eth_mlx5.cpp +++ b/src/vma/dev/qp_mgr_eth_mlx5.cpp @@ -90,13 +90,6 @@ static bool is_bf(struct ibv_context *ib_ctx) static off_t offset = VMA_MLX5_MMAP_GET_WC_PAGES_CMD << VMA_MLX5_IB_MMAP_CMD_SHIFT; char *env; -#if defined(DEFINED_VERBS_VERSION) && (DEFINED_VERBS_VERSION == 3) - /* This limitation is done for RM: 1557652 */ - if (safe_mce_sys().hypervisor != mce_sys_var::HYPER_NONE) { - return false; - } -#endif - env = getenv("MLX5_SHUT_UP_BF"); if (!env || !strcmp(env, "0")) { /*