From 14d10feb44ab1edd75853517406ab65fd9211ea5 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev <nheart@gmail.com>
Date: Thu, 14 Sep 2023 19:59:01 +0000
Subject: [PATCH 1/5] Add ruy submodule

---
 .gitmodules       | 3 +++
 src/3rd_party/ruy | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 src/3rd_party/ruy

diff --git a/.gitmodules b/.gitmodules
index a1a876d8b..07791f94f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -20,3 +20,6 @@
 [submodule "src/3rd_party/simple-websocket-server"]
 	path = src/3rd_party/simple-websocket-server
 	url = https://github.com/marian-nmt/Simple-WebSocket-Server
+[submodule "src/3rd_party/ruy"]
+	path = src/3rd_party/ruy
+	url = https://github.com/google/ruy.git
diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy
new file mode 160000
index 000000000..c04e5e52a
--- /dev/null
+++ b/src/3rd_party/ruy
@@ -0,0 +1 @@
+Subproject commit c04e5e52ae6b144f74ac032652e3c538bda15c9b

From 300a8e9b3e0be338459d5bdb60d9a17eb39a99b9 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev <nheart@gmail.com>
Date: Thu, 14 Sep 2023 22:12:45 +0000
Subject: [PATCH 2/5] Initial bits

---
 CMakeLists.txt                   |  25 +++++-
 src/3rd_party/CMakeLists.txt     |  12 ++-
 src/tensors/cpu/integer_common.h |  14 ++--
 src/tensors/cpu/prod.cpp         |   8 --
 src/tensors/cpu/prod_blas.h      | 135 +++++++++++++++++++++++++++++--
 5 files changed, 173 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c674e68d..eb51bb8ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,29 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "Release")
 endif()
 
+# ARM bits: enabled when the detected target architecture name contains "arm".
+if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
+  # Define that we are using ARM
+  add_compile_definitions(ARM)
+  set(ARM ON)
+  option(USE_RUY "Use Ruy" ON) # For 8 bit code, later on
+  if(USE_RUY) # the ruy target is only created when USE_RUY adds its subdirectory
+    list(APPEND EXT_LIBS ruy)
+  endif()
+  # Apple M1 has Apple Accelerate. Otherwise fallback to RUY
+  if(APPLE)
+    option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF)
+  else(APPLE)
+    option(USE_RUY_SGEMM "Compile with Ruy SGEMM" ON)
+  endif(APPLE)
+  set(USE_SIMD_UTILS ON)
+
+  # Some warnings as errors. I don't feel comfortable about the strict aliasing.
+  set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")
+
+endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
+
+
 ###############################################################################
 # Set compilation flags
 if(MSVC)
@@ -221,7 +244,7 @@ else(MSVC)
     # Clang-10.0.0 complains when CUDA is newer than 10.1
     set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version")
   endif()
-  set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}")
+  set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${ARM_WARNINGS}")
 
   # These are used in src/CMakeLists.txt on a per-target basis
   list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index 838951c50..5fc2c285f 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -9,10 +9,20 @@ add_subdirectory(./faiss)
 include_directories(./faiss)
 
 if(COMPILE_CPU)
-  if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
+  if((NOT ARM) AND (NOT GENERATE_MARIAN_INSTALL_TARGETS))
     set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
     add_subdirectory(./intgemm)
   endif()
+
+  if(USE_RUY)
+    set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE)
+    set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE)
+    set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE)
+    set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE)
+    set(CPUINFO_BUILD_TOOLS      OFF CACHE BOOL " " FORCE)
+    add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL)
+    add_subdirectory(ruy EXCLUDE_FROM_ALL)
+  endif(USE_RUY)
 endif(COMPILE_CPU)
 
 if(USE_FBGEMM)
diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index f4e632b5c..8a00a7870 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -5,7 +5,7 @@
 #include "tensors/cpu/aligned.h"
 #include "common/io_item.h"
 
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
 #include "3rd_party/intgemm/intgemm/intgemm.h"
 #else
 namespace intgemm {
@@ -31,10 +31,12 @@ namespace intgemm {
 }
 #endif
 
+#ifndef ARM
 #include <emmintrin.h>
 #include <immintrin.h>
 #include <tmmintrin.h>
 #include <xmmintrin.h>
+#endif
 #include <cassert>
 #include <cstddef>
 
@@ -98,7 +100,7 @@ template <> struct intgemm_<Type::intgemm16avx512> {
 
 template <Type vtype>
 static inline float& getQuantMult(marian::Tensor val) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type());
   typedef typename intgemm_<vtype>::type Integer;
   return *(reinterpret_cast<float*>(val->data<Integer>() + val->shape().elements()));
@@ -109,7 +111,7 @@ static inline float& getQuantMult(marian::Tensor val) {
 }
 
 static inline Type getIntgemmType(Type vtype) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   if (vtype == Type::intgemm8) {
     if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) {
       return Type::intgemm8avx512vnni;
@@ -142,7 +144,7 @@ static inline Type getIntgemmType(Type vtype) {
 }
 
 static inline bool passOrAbort(Type vtype) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   if (vtype == Type::intgemm8 || vtype == Type::intgemm16) {
     return true;
   } else if (vtype == Type::intgemm16sse2) {
@@ -166,7 +168,7 @@ static inline bool passOrAbort(Type vtype) {
 
 template <Type vtype>
 static inline float computeQuantMult(marian::Tensor val) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   if(sizeOf(vtype) == 1)
     return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
   else if(sizeOf(vtype) == 2)
@@ -186,7 +188,7 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias);
 // in our binary format. Then we copy the quantizationMultiplier information at the end
 template<Type vtype>
 void prepareAndTransposeB(io::Item& item, const char * input) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
     typedef typename intgemm_<vtype>::type Integer;
     Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
     // Sometimes we will end up with misaligned intput (and output) so we can't use them directly.
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 8fcca924b..639027d05 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -7,14 +7,6 @@
 #include "tensors/tensor.h"
 #include "tensors/tensor_allocator.h"
 
-#if MKL_FOUND
-#include <mkl.h>
-#else
-#if BLAS_FOUND
-#include <cblas.h>
-#endif
-#endif
-
 #include "integer_common.h"
 #include "prod_blas.h"
 
diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h
index a591fdd26..85234c05b 100644
--- a/src/tensors/cpu/prod_blas.h
+++ b/src/tensors/cpu/prod_blas.h
@@ -1,11 +1,122 @@
+#pragma once
 #if MKL_FOUND
-#include <mkl.h>
-#else
-#if BLAS_FOUND
-#include <cblas.h>
-#endif
+    #include <mkl.h>
+#elif BLAS_FOUND
+    #include <cblas.h>
+#elif USE_RUY_SGEMM // ruy is only pulled in when neither MKL nor BLAS was found
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcomment"
+    #include "ruy/ruy.h"
+    #include "ruy/system_aligned_alloc.h"
+#pragma GCC diagnostic pop
 #endif
 
+#if USE_RUY_SGEMM && !MKL_FOUND && !BLAS_FOUND // mirrors the include chain: ruy headers are only included in this case
+// AlignedVector owns uninitialized storage for num_elem objects of type T obtained from ruy's
+// SystemAlignedAlloc and frees it in its destructor. Similar to intgemm's AlignedVector.
+template <class T>
+class AlignedVector {
+public:
+  explicit AlignedVector(size_t num_elem)
+      : size_(num_elem),
+        storage_(static_cast<T *>(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {}
+
+  T *begin() { return storage_; }
+  T *data() { return storage_; }
+  size_t size() const { return size_; }
+  size_t memSize() const { return sizeof(T) * size_; }
+
+  // Forbid copy
+  AlignedVector(const AlignedVector &) = delete;
+  AlignedVector &operator=(const AlignedVector &) = delete;
+
+  ~AlignedVector() { ruy::detail::SystemAlignedFree(storage_); }
+
+private:
+  size_t size_;
+  T *storage_;
+};
+
+// SGEMM on top of ruy: C = alpha * op(A) * op(B) + beta * C, with C written row-major.
+inline void GemmRuy(const bool transA,
+                    const bool transB,
+                    const int M,
+                    const int N,
+                    const int K,
+                    const float alpha,
+                    const float *A,
+                    const int lda,
+                    const float *B,
+                    const int ldb,
+                    const float beta,
+                    float *C,
+                    const int ldc) {
+  // op(A) is M x K, op(B) is K x N and C is M x N. The matrices are described to
+  // ruy with simple (densely packed) layouts, so lda, ldb and ldc are not consulted.
+  //
+  // The context is kept per thread and reused, rather than constructed and
+  // destroyed on every call; thread_local keeps concurrent callers apart.
+  static thread_local ruy::Context context;
+  const size_t size = static_cast<size_t>(M) * N;  // element count of C, widened before multiplying
+
+  // A transposed operand is presented to ruy as column-major with the same
+  // logical dimensions, which avoids physically transposing the data.
+  const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+  const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+
+  ruy::Matrix<float> lhs;
+  ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout());
+  lhs.set_data(A);
+
+  ruy::Matrix<float> rhs;
+  ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout());
+  rhs.set_data(B);
+
+  ruy::Matrix<float> dst;
+  ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout());
+
+  if(beta == 0) {
+    // For beta = 0 we multiply straight into C and avoid the additional
+    // allocation. sgemm is called with `beta` for accumulating gradients in
+    // backpropagation, which is 0.0 during inference.
+
+    dst.set_data(C);
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    if(alpha != 1.0) {
+        // C holds op(A) * op(B) at this point; scale it in place by alpha.
+        // TODO: Come back and explicitly use SIMD.
+        for(size_t i = 0; i < size; i++) {
+          C[i] = alpha * C[i];
+        }
+    }
+
+  } else {
+    // @jerinphilip has not yet been able to find a ruy primitive that does in
+    // place addition to obtain full gemm.
+    //
+    // Safe bet is to make an additional allocation to store the result of
+    // multiply  and use the existing values in C.
+    //
+    // See also: https://github.com/google/ruy/issues/307
+
+    AlignedVector<float> intermediate(M * N);
+    dst.set_data(intermediate.data());
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    // Write out C as C = alpha * [op(A) * op(B)] + beta * C
+    // TODO: Come back and explicitly use SIMD.
+    const float *opA_opB = intermediate.data();
+    for(size_t i = 0; i < size; i++) {
+      C[i] = alpha * opA_opB[i] + beta * C[i];
+    }
+  }
+}
+
+#endif // RUY_SGEMM
+
 inline void sgemm(bool transA,
                   bool transB,
                   int rows_a,
@@ -34,6 +145,20 @@ inline void sgemm(bool transA,
               beta,
               c,
               ldc);
+#elif USE_RUY_SGEMM
+        GemmRuy(transA,
+                transB,
+                rows_a,
+                rows_b,
+                width,
+                alpha,
+                a,
+                lda,
+                b,
+                ldb,
+                beta,
+                c,
+                ldc);
 #else
     transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy
     ABORT("Marian must be compiled with a BLAS library");

From 8f8fa562bcdddc5b8c86f0b73693bbef7e5a99ac Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev <nheart@gmail.com>
Date: Fri, 15 Sep 2023 15:40:28 +0000
Subject: [PATCH 3/5] Add simd utils

---
 .gitmodules                  | 3 +++
 src/3rd_party/CMakeLists.txt | 2 +-
 src/3rd_party/simd_utils     | 1 +
 src/tensors/cpu/prod_blas.h  | 7 +------
 4 files changed, 6 insertions(+), 7 deletions(-)
 create mode 160000 src/3rd_party/simd_utils

diff --git a/.gitmodules b/.gitmodules
index 07791f94f..37b125076 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,3 +23,6 @@
 [submodule "src/3rd_party/ruy"]
 	path = src/3rd_party/ruy
 	url = https://github.com/google/ruy.git
+[submodule "src/3rd_party/simd_utils"]
+	path = src/3rd_party/simd_utils
+	url = https://github.com/JishinMaster/simd_utils.git
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index 5fc2c285f..1ab9a53b2 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -9,7 +9,7 @@ add_subdirectory(./faiss)
 include_directories(./faiss)
 
 if(COMPILE_CPU)
-  if((NOT ARM) AND (NOT GENERATE_MARIAN_INSTALL_TARGETS))
+  if((NOT ${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") AND (NOT GENERATE_MARIAN_INSTALL_TARGETS))
     set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
     add_subdirectory(./intgemm)
   endif()
diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils
new file mode 160000
index 000000000..c092ef9dd
--- /dev/null
+++ b/src/3rd_party/simd_utils
@@ -0,0 +1 @@
+Subproject commit c092ef9dd406cd9b9d54da1ff30cc86c39b4c0a5
diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h
index 85234c05b..a281aa7bf 100644
--- a/src/tensors/cpu/prod_blas.h
+++ b/src/tensors/cpu/prod_blas.h
@@ -91,12 +91,7 @@ inline void GemmRuy(const bool transA,
     }
 
   } else {
-    // @jerinphilip has not yet been able to find a ruy primitive that does in
-    // place addition to obtain full gemm.
-    //
-    // Safe bet is to make an additional allocation to store the result of
-    // multiply  and use the existing values in C.
-    //
+    // No multiply-add in Ruy
     // See also: https://github.com/google/ruy/issues/307
 
     AlignedVector<float> intermediate(M * N);

From 70ab9c6db53fcd7010a9f92c8497f21dfe241526 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev <nheart@gmail.com>
Date: Fri, 15 Sep 2023 16:30:18 +0000
Subject: [PATCH 4/5] First bits working

---
 CMakeLists.txt                              |  27 +++-
 cmake/TargetArch.cmake                      | 142 ++++++++++++++++++++
 src/3rd_party/faiss/VectorTransform.cpp     |   4 +
 src/3rd_party/simd_utils                    |   2 +-
 src/common/types.h                          |   6 +-
 src/functional/operators.h                  |   5 +-
 src/tensors/cpu/expression_graph_packable.h |   2 +-
 src/tensors/cpu/fbgemm/packed_gemm.cpp      |   8 +-
 src/tensors/cpu/intgemm_interface.h         |   4 +-
 9 files changed, 189 insertions(+), 11 deletions(-)
 create mode 100644 cmake/TargetArch.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb51bb8ec..dca8e69d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,22 @@ if (POLICY CMP0074)
 endif ()
 
 project(marian CXX C)
+
+######### ARCH DETECTION #########
+# Architecture detection
+include(TargetArch)
+
+target_architecture(CMAKE_TARGET_ARCHITECTURES)
+list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len)
+if(NOT "${cmake_target_arch_len}" STREQUAL "1")
+    set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE)
+    set(CMAKE_TARGET_ARCHITECTURE_CODE "universal")
+else()
+    set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE)
+    set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}")
+endif()
+######### ARCH DETECTION #########
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
@@ -100,6 +116,15 @@ if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
   # Some warnings as errors. I don't feel comfortable about the strict aliasing.
   set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")
 
+  set(USE_SIMD_UTILS ON)
+  # @TODO this assumes ArmV8. We should also look at armv7
+  add_compile_definitions(ARM FMA SSE) #added for ARM
+  # -flax-vector-conversions is a GCC/Clang option. It has no "/" spelling
+  # for MSVC, so it is only added for non-MSVC toolchains.
+  if(NOT MSVC)
+    add_compile_options(-flax-vector-conversions)
+  endif()
+
 endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
 
 
@@ -534,7 +559,7 @@ endif()
 ###############################################################################
 # Find BLAS library
 if(COMPILE_CPU)
-  if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
+  if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
     set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
     add_definitions(-DCOMPILE_CPU=1)
   endif()
diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake
new file mode 100644
index 000000000..6e0bb3953
--- /dev/null
+++ b/cmake/TargetArch.cmake
@@ -0,0 +1,142 @@
+# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake
+# Based on the Qt 5 processor detection code, so should be very accurate
+# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
+# Currently handles arm (v5, v6, v7, v8), x86 (32/64), ia64, and ppc (32/64)
+
+# Regarding POWER/PowerPC, just as is noted in the Qt source,
+# "There are many more known variants/revisions that we do not handle/detect."
+
+set(archdetect_c_code "
+#if defined(__arm__) || defined(__TARGET_ARCH_ARM)  || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__)
+    #if defined(__ARM_ARCH_8__)     || defined(__ARM_ARCH_8)          \\
+        || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A)         \\
+        || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R)         \\
+        || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M)         \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8)
+        #error cmake_ARCH armv8
+    #elif defined(__ARM_ARCH_7__)                                     \\
+        || defined(__ARM_ARCH_7A__)                                   \\
+        || defined(__ARM_ARCH_7R__)                                   \\
+        || defined(__ARM_ARCH_7M__)                                   \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
+        #error cmake_ARCH armv7
+    #elif defined(__ARM_ARCH_6__)                                      \\
+        || defined(__ARM_ARCH_6J__)                                    \\
+        || defined(__ARM_ARCH_6T2__)                                   \\
+        || defined(__ARM_ARCH_6Z__)                                    \\
+        || defined(__ARM_ARCH_6K__)                                    \\
+        || defined(__ARM_ARCH_6ZK__)                                   \\
+        || defined(__ARM_ARCH_6M__)                                    \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
+        #error cmake_ARCH armv6
+    #elif defined(__ARM_ARCH_5TEJ__) \\
+        || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
+        #error cmake_ARCH armv5
+    #else
+        #error cmake_ARCH arm
+    #endif
+#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
+    #error cmake_ARCH i386
+#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
+    #error cmake_ARCH x86_64
+#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
+    #error cmake_ARCH ia64
+#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
+      || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC)  \\
+      || defined(_M_MPPC) || defined(_M_PPC)
+    #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
+        #error cmake_ARCH ppc64
+    #else
+        #error cmake_ARCH ppc
+    #endif
+#endif
+
+#error cmake_ARCH unknown
+")
+
+
+# Set ppc_support to TRUE before including this file or ppc and ppc64
+# will be treated as invalid architectures since they are no longer supported by Apple
+
+function(target_architecture output_var)
+    # Stores the detected target architecture name(s) in the caller's variable
+    # named by output_var: the normalized CMAKE_OSX_ARCHITECTURES list on Apple
+    # when that is set, otherwise a single name parsed from the compiler output.
+
+    if(APPLE AND CMAKE_OSX_ARCHITECTURES)
+        # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
+        # First let's normalize the order of the values
+
+        # Note that it's not possible to compile PowerPC applications if you are using
+        # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
+        # disable it by default
+        # See this page for more information:
+        # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
+
+        # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
+        # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
+
+        # ppc and ppc64 are only accepted when the caller has set ppc_support.
+        set(ARCH "") # start clean; do not append to an ARCH inherited from the caller
+        foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
+            if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
+                set(osx_arch_ppc TRUE)
+            elseif("${osx_arch}" STREQUAL "i386")
+                set(osx_arch_i386 TRUE)
+            elseif("${osx_arch}" STREQUAL "x86_64")
+                set(osx_arch_x86_64 TRUE)
+            elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
+                set(osx_arch_ppc64 TRUE)
+            elseif("${osx_arch}" STREQUAL "arm64")
+                # Apple Silicon. The name contains "arm", which is what the
+                # top-level CMakeLists.txt matches on to enable the ARM build.
+                set(osx_arch_arm64 TRUE)
+            else()
+                message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
+            endif()
+        endforeach()
+
+        # Now add all the architectures in our normalized order
+        foreach(known_arch ppc i386 x86_64 ppc64 arm64)
+            if(osx_arch_${known_arch})
+                list(APPEND ARCH ${known_arch})
+            endif()
+        endforeach()
+    else()
+        file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
+
+        enable_language(C)
+
+        # Detect the architecture in a rather creative way...
+        # This compiles a small C program which is a series of ifdefs that selects a
+        # particular #error preprocessor directive whose message string contains the
+        # target architecture. The program will always fail to compile (both because
+        # file is not a valid C program, and obviously because of the presence of the
+        # #error preprocessor directives... but by exploiting the preprocessor in this
+        # way, we can detect the correct target architecture even when cross-compiling,
+        # since the program itself never needs to be run (only the compiler/preprocessor)
+        try_run(
+            run_result_unused
+            compile_result_unused
+            "${CMAKE_BINARY_DIR}"
+            "${CMAKE_BINARY_DIR}/arch.c"
+            COMPILE_OUTPUT_VARIABLE ARCH
+            CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
+        )
+
+        # Parse the architecture name from the compiler output
+        string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
+
+        # Get rid of the value marker leaving just the architecture name
+        string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
+
+        # If we are compiling with an unknown architecture this variable should
+        # already be set to "unknown" but in the case that it's empty (i.e. due
+        # to a typo in the code), then set it to unknown
+        if (NOT ARCH)
+            set(ARCH unknown)
+        endif()
+    endif()
+
+    set(${output_var} "${ARCH}" PARENT_SCOPE)
+endfunction()
\ No newline at end of file
diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp
index 103b0910e..c6bf9d4a1 100644
--- a/src/3rd_party/faiss/VectorTransform.cpp
+++ b/src/3rd_party/faiss/VectorTransform.cpp
@@ -19,6 +19,10 @@
 
 using namespace faiss;
 
+#ifdef ARM
+#include "3rd_party/simd_utils/simd_utils.h"
+#endif
+
 
 extern "C" {
 
diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils
index c092ef9dd..696036258 160000
--- a/src/3rd_party/simd_utils
+++ b/src/3rd_party/simd_utils
@@ -1 +1 @@
-Subproject commit c092ef9dd406cd9b9d54da1ff30cc86c39b4c0a5
+Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb
diff --git a/src/common/types.h b/src/common/types.h
index a0930a0f8..763edb09b 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -17,7 +17,11 @@
 #include <type_traits>
 
 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
-#include <immintrin.h>
+  #ifndef ARM
+    #include <immintrin.h>
+  #else
+    #include "3rd_party/simd_utils/simd_utils.h"
+  #endif
 #endif
 
 #ifdef __CUDACC__ // nvcc is compiling this code
diff --git a/src/functional/operators.h b/src/functional/operators.h
index 3628fdcb9..6ecc02bd8 100644
--- a/src/functional/operators.h
+++ b/src/functional/operators.h
@@ -217,8 +217,11 @@ struct Ops<double> {
 // __CUDACC__ is defined when compiling with NVCC regardless of device type
 // __CUDA_ARCH__ is defined when compiling device (GPU) code
 #ifndef __CUDACC__
-
+#ifndef ARM
 #include "3rd_party/sse_mathfun.h"
+#else
+#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON
+#endif
 
 namespace marian {
 namespace functional {
diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h
index 1a233372c..4af69fac9 100644
--- a/src/tensors/cpu/expression_graph_packable.h
+++ b/src/tensors/cpu/expression_graph_packable.h
@@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
 #endif
       } else if (isIntgemm(gemmElementType) &&
       (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
         using cpu::integer::cols;
         using cpu::integer::rows;
         auto allocator = New<TensorAllocator>(getBackend());
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp
index dd81d0f7f..23ed559f1 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.cpp
+++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@@ -2,16 +2,16 @@
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
 
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
 #include <cassert>
 #include <cstddef>
 #include <unordered_map>
 //#include <chrono>
 
 #if USE_FBGEMM
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
 #ifdef _MSC_VER
 #pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
 #pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
index 88408aa18..80784e0f6 100644
--- a/src/tensors/cpu/intgemm_interface.h
+++ b/src/tensors/cpu/intgemm_interface.h
@@ -9,7 +9,7 @@ namespace marian {
 namespace cpu {
 namespace integer {
 
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
 /*
  * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
  * Expr input: The input tensor
@@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) {
  */
 template<Type vtype>
 static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
-#if COMPILE_CPU
+#if COMPILE_CPU && !defined(ARM)
   ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
   ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
 

From 3ac5facd45934c472561ad733953df625707e561 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev <nheart@gmail.com>
Date: Fri, 15 Sep 2023 16:50:46 +0000
Subject: [PATCH 5/5] Remove m64

---
 CMakeLists.txt               | 8 ++++----
 src/3rd_party/CMakeLists.txt | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dca8e69d9..71ae4cac1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -289,9 +289,9 @@ else(MSVC)
   endif(CMAKE_COMPILER_IS_GNUCC)
 
   set(CMAKE_CXX_FLAGS                 "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
-  set(CMAKE_CXX_FLAGS_RELEASE         "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
+  set(CMAKE_CXX_FLAGS_RELEASE         "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_CXX_FLAGS_DEBUG           "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
-  set(CMAKE_CXX_FLAGS_SLIM            "-O3 -m64 -funroll-loops -DNDEBUG")
+  set(CMAKE_CXX_FLAGS_SLIM            "-O3 -funroll-loops -DNDEBUG")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO  "${CMAKE_CXX_FLAGS_RELEASE}")
   set(CMAKE_CXX_FLAGS_PROFILE         "${CMAKE_CXX_FLAGS_RELEASE} -pg")
   set(CMAKE_CXX_FLAGS_PROFGEN         "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
@@ -299,9 +299,9 @@ else(MSVC)
 
   # these need to be set separately
   set(CMAKE_C_FLAGS                 "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
-  set(CMAKE_C_FLAGS_RELEASE         "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
+  set(CMAKE_C_FLAGS_RELEASE         "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
   set(CMAKE_C_FLAGS_DEBUG           "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
-  set(CMAKE_C_FLAGS_SLIM            "-O3 -m64 -funroll-loops -DNDEBUG")
+  set(CMAKE_C_FLAGS_SLIM            "-O3 -funroll-loops -DNDEBUG")
   set(CMAKE_C_FLAGS_RELWITHDEBINFO  "${CMAKE_C_FLAGS_RELEASE}")
   set(CMAKE_C_FLAGS_PROFILE         "${CMAKE_C_FLAGS_RELEASE} -pg")
   set(CMAKE_C_FLAGS_PROFGEN         "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index 1ab9a53b2..2cec739ba 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -9,7 +9,7 @@ add_subdirectory(./faiss)
 include_directories(./faiss)
 
 if(COMPILE_CPU)
-  if((NOT ${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") AND (NOT GENERATE_MARIAN_INSTALL_TARGETS))
+  if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
     set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
     add_subdirectory(./intgemm)
   endif()