From 14d10feb44ab1edd75853517406ab65fd9211ea5 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 14 Sep 2023 19:59:01 +0000 Subject: [PATCH 1/5] Add ruy submodule --- .gitmodules | 3 +++ src/3rd_party/ruy | 1 + 2 files changed, 4 insertions(+) create mode 160000 src/3rd_party/ruy diff --git a/.gitmodules b/.gitmodules index a1a876d8b..07791f94f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,3 +20,6 @@ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/google/ruy.git diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy new file mode 160000 index 000000000..c04e5e52a --- /dev/null +++ b/src/3rd_party/ruy @@ -0,0 +1 @@ +Subproject commit c04e5e52ae6b144f74ac032652e3c538bda15c9b From 300a8e9b3e0be338459d5bdb60d9a17eb39a99b9 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 14 Sep 2023 22:12:45 +0000 Subject: [PATCH 2/5] Initial bits --- CMakeLists.txt | 25 +++++- src/3rd_party/CMakeLists.txt | 12 ++- src/tensors/cpu/integer_common.h | 14 ++-- src/tensors/cpu/prod.cpp | 8 -- src/tensors/cpu/prod_blas.h | 135 +++++++++++++++++++++++++++++-- 5 files changed, 173 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c674e68d..eb51bb8ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,29 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() +# ARM bits +if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + # Define that we are using ARM + add_compile_definitions(ARM) + set(ARM ON) + option(USE_RUY "Use Ruy" ON) # For 8 bit code, later on + set(EXT_LIBS ${EXT_LIBS} ruy) + + # Apple M1 has Apple Accelerate. 
Otherwise fallback to RUY + if(APPLE) + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) + else(APPLE) + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" ON) + endif(APPLE) + + set(USE_SIMD_UTILS ON) + + # Some warnings as errors. I don't feel comfortable about the strict aliasing. + set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment") + +endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + + ############################################################################### # Set compilation flags if(MSVC) @@ -221,7 +244,7 @@ else(MSVC) # Clang-10.0.0 complains when CUDA is newer than 10.1 set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version") endif() - set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}") + set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${ARM_WARNINGS}") # These are used in src/CMakeLists.txt on a per-target basis list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 838951c50..5fc2c285f 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -9,10 +9,20 @@ add_subdirectory(./faiss) include_directories(./faiss) if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + if((NOT ARM) AND (NOT GENERATE_MARIAN_INSTALL_TARGETS)) set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") add_subdirectory(./intgemm) endif() + + if(USE_RUY) + set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE) + add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL) + add_subdirectory(ruy EXCLUDE_FROM_ALL) + endif(USE_RUY) endif(COMPILE_CPU) if(USE_FBGEMM) diff --git a/src/tensors/cpu/integer_common.h 
b/src/tensors/cpu/integer_common.h index f4e632b5c..8a00a7870 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -5,7 +5,7 @@ #include "tensors/cpu/aligned.h" #include "common/io_item.h" -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) #include "3rd_party/intgemm/intgemm/intgemm.h" #else namespace intgemm { @@ -31,10 +31,12 @@ namespace intgemm { } #endif +#ifndef ARM #include #include #include #include +#endif #include #include @@ -98,7 +100,7 @@ template <> struct intgemm_ { template static inline float& getQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type()); typedef typename intgemm_::type Integer; return *(reinterpret_cast(val->data() + val->shape().elements())); @@ -109,7 +111,7 @@ static inline float& getQuantMult(marian::Tensor val) { } static inline Type getIntgemmType(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8) { if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) { return Type::intgemm8avx512vnni; @@ -142,7 +144,7 @@ static inline Type getIntgemmType(Type vtype) { } static inline bool passOrAbort(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8 || vtype == Type::intgemm16) { return true; } else if (vtype == Type::intgemm16sse2) { @@ -166,7 +168,7 @@ static inline bool passOrAbort(Type vtype) { template static inline float computeQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if(sizeOf(vtype) == 1) return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements()); else if(sizeOf(vtype) == 2) @@ -186,7 +188,7 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias); // in our binary format. 
Then we copy the quantizationMultiplier information at the end template void prepareAndTransposeB(io::Item& item, const char * input) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) typedef typename intgemm_::type Integer; Integer * output_tensor = reinterpret_cast(&(*item.bytes.begin())); // Sometimes we will end up with misaligned intput (and output) so we can't use them directly. diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 8fcca924b..639027d05 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -7,14 +7,6 @@ #include "tensors/tensor.h" #include "tensors/tensor_allocator.h" -#if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif -#endif - #include "integer_common.h" #include "prod_blas.h" diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index a591fdd26..85234c05b 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -1,11 +1,122 @@ +#pragma once #if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif + #include +#elif BLAS_FOUND + #include +#elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" +#pragma GCC diagnostic pop #endif +#if USE_RUY_SGEMM +// AlignedVector allocates aligned memory and cleans up after itself. RAII +// wrapper similar to intgemm's AlignedVector. 
+template +class AlignedVector { +public: + AlignedVector(size_t num_elem) + : size_(num_elem), + storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} + + T *begin() { return storage_; } + T *data() { return storage_; } + size_t size() const { return size_; } + size_t memSize() const { return sizeof(T) * size_; } + + // Forbid copy + AlignedVector(const AlignedVector &) = delete; + AlignedVector &operator=(const AlignedVector &) = delete; + + ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } + +private: + size_t size_; + T *storage_; +}; + + +inline void GemmRuy(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const float alpha, + const float *A, + const int lda, + const float *B, + const int ldb, + const float beta, + float *C, + const int ldc) { + ruy::Context context; + + // If we need to transpose, we can swap the dimensions in the layout and claim + // the matrix is column-major, so the storage order alone expresses the transpose. + const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + + ruy::Matrix lhs; + ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout()); + lhs.set_data(A); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout()); + rhs.set_data(B); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout()); + + if(beta == 0) { + // For beta = 0, we want to avoid the additional allocation. This covers a + // large share of our inference use-cases. sgemm is called with `beta` for + // accumulating gradients in backpropagation; `beta` is 0.0 during + // inference. + + dst.set_data(C); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? 
+ // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } + } + + } else { + // @jerinphilip has not yet been able to find a ruy primitive that does in + // place addition to obtain full gemm. + // + // Safe bet is to make an additional allocation to store the result of + // multiply and use the existing values in C. + // + // See also: https://github.com/google/ruy/issues/307 + + AlignedVector intermediate(M * N); + dst.set_data(intermediate.data()); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = intermediate.data(); + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // USE_RUY_SGEMM + inline void sgemm(bool transA, bool transB, int rows_a, @@ -34,6 +145,20 @@ inline void sgemm(bool transA, beta, c, ldc); +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); #else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); From 8f8fa562bcdddc5b8c86f0b73693bbef7e5a99ac Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Fri, 15 Sep 2023 15:40:28 +0000 Subject: [PATCH 3/5] Add simd utils --- .gitmodules | 3 +++ src/3rd_party/CMakeLists.txt | 2 +- src/3rd_party/simd_utils | 1 + src/tensors/cpu/prod_blas.h | 7 +------ 4 files changed, 6 insertions(+), 7 deletions(-) create mode 160000 src/3rd_party/simd_utils diff --git a/.gitmodules b/.gitmodules index 07791f94f..37b125076 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,3 +23,6 @@ [submodule "src/3rd_party/ruy"] path = 
src/3rd_party/ruy url = https://github.com/google/ruy.git +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/JishinMaster/simd_utils.git diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 5fc2c285f..1ab9a53b2 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(./faiss) include_directories(./faiss) if(COMPILE_CPU) - if((NOT ARM) AND (NOT GENERATE_MARIAN_INSTALL_TARGETS)) + if((NOT ${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") AND (NOT GENERATE_MARIAN_INSTALL_TARGETS)) set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") add_subdirectory(./intgemm) endif() diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..c092ef9dd --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit c092ef9dd406cd9b9d54da1ff30cc86c39b4c0a5 diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 85234c05b..a281aa7bf 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -91,12 +91,7 @@ inline void GemmRuy(const bool transA, } } else { - // @jerinphilip has not yet been able to find a ruy primitive that does in - // place addition to obtain full gemm. - // - // Safe bet is to make an additional allocation to store the result of - // multiply and use the existing values in C. 
- // + // No multiply-add in Ruy // See also: https://github.com/google/ruy/issues/307 AlignedVector intermediate(M * N); From 70ab9c6db53fcd7010a9f92c8497f21dfe241526 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Fri, 15 Sep 2023 16:30:18 +0000 Subject: [PATCH 4/5] First bits working --- CMakeLists.txt | 27 +++- cmake/TargetArch.cmake | 142 ++++++++++++++++++++ src/3rd_party/faiss/VectorTransform.cpp | 4 + src/3rd_party/simd_utils | 2 +- src/common/types.h | 6 +- src/functional/operators.h | 5 +- src/tensors/cpu/expression_graph_packable.h | 2 +- src/tensors/cpu/fbgemm/packed_gemm.cpp | 8 +- src/tensors/cpu/intgemm_interface.h | 4 +- 9 files changed, 189 insertions(+), 11 deletions(-) create mode 100644 cmake/TargetArch.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index eb51bb8ec..dca8e69d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,22 @@ if (POLICY CMP0074) endif () project(marian CXX C) + +######### ARCH DETECTION ######### +# Architecture detection +include(TargetArch) + +target_architecture(CMAKE_TARGET_ARCHITECTURES) +list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len) +if(NOT "${cmake_target_arch_len}" STREQUAL "1") + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "universal") +else() + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}") +endif() +######### ARCH DETECTION ######### + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.") @@ -100,6 +116,15 @@ if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") # Some warnings as errors. I don't feel comfortable about the strict aliasing. set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment") + set(USE_SIMD_UTILS ON) + # @TODO this assumes ArmV8. 
We should also look at armv7 + add_compile_definitions(ARM FMA SSE) #added for ARM + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) + endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") @@ -534,7 +559,7 @@ endif() ############################################################################### # Find BLAS library if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU add_definitions(-DCOMPILE_CPU=1) endif() diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..6e0bb3953 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,142 @@ +# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7, v8), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." 
+ +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__) + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\ + || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\ + || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\ + || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) + #error cmake_ARCH armv8 + #elif defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer 
supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. + + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. 
The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp index 103b0910e..c6bf9d4a1 100644 --- a/src/3rd_party/faiss/VectorTransform.cpp +++ b/src/3rd_party/faiss/VectorTransform.cpp @@ -19,6 +19,10 @@ using namespace faiss; +#ifdef ARM +#include "3rd_party/simd_utils/simd_utils.h" +#endif + extern "C" { diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils index c092ef9dd..696036258 160000 --- a/src/3rd_party/simd_utils +++ b/src/3rd_party/simd_utils @@ -1 +1 @@ -Subproject commit c092ef9dd406cd9b9d54da1ff30cc86c39b4c0a5 +Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb diff --git a/src/common/types.h b/src/common/types.h index a0930a0f8..763edb09b 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -17,7 +17,11 @@ #include #ifndef 
__CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code -#include + #ifndef ARM + #include + #else + #include "3rd_party/simd_utils/simd_utils.h" + #endif #endif #ifdef __CUDACC__ // nvcc is compiling this code diff --git a/src/functional/operators.h b/src/functional/operators.h index 3628fdcb9..6ecc02bd8 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -217,8 +217,11 @@ struct Ops { // __CUDACC__ is defined when compiling with NVCC regardless of device type // __CUDA_ARCH__ is defined when compiling device (GPU) code #ifndef __CUDACC__ - +#ifndef ARM #include "3rd_party/sse_mathfun.h" +#else +#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON +#endif namespace marian { namespace functional { diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index 1a233372c..4af69fac9 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph { #endif } else if (isIntgemm(gemmElementType) && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) using cpu::integer::cols; using cpu::integer::rows; auto allocator = New(getBackend()); diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp index dd81d0f7f..23ed559f1 100644 --- a/src/tensors/cpu/fbgemm/packed_gemm.cpp +++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp @@ -2,16 +2,16 @@ #include "tensors/tensor_allocator.h" #include "tensors/tensor_operators.h" -#include -#include -#include -#include #include #include #include //#include #if USE_FBGEMM +#include +#include +#include +#include #ifdef _MSC_VER #pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: 
unreferenced local function has been removed (missing 'static inline') #pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn' diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 88408aa18..80784e0f6 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -9,7 +9,7 @@ namespace marian { namespace cpu { namespace integer { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) /* * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized. * Expr input: The input tensor @@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) { */ template static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type()); ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type()); From 3ac5facd45934c472561ad733953df625707e561 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Fri, 15 Sep 2023 16:50:46 +0000 Subject: [PATCH 5/5] Remove m64 --- CMakeLists.txt | 8 ++++---- src/3rd_party/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dca8e69d9..71ae4cac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,9 +289,9 @@ else(MSVC) endif(CMAKE_COMPILER_IS_GNUCC) set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") 
- set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") + set(CMAKE_CXX_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}") set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg") set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction") @@ -299,9 +299,9 @@ else(MSVC) # these need to be set separately set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") - set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") - set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") + set(CMAKE_C_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}") set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg") set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction") diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 1ab9a53b2..2cec739ba 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -9,7 +9,7 @@ add_subdirectory(./faiss) include_directories(./faiss) if(COMPILE_CPU) - if((NOT ${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") AND (NOT GENERATE_MARIAN_INSTALL_TARGETS)) + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") add_subdirectory(./intgemm) endif()