Skip to content

Commit

Permalink
First bits working
Browse files Browse the repository at this point in the history
  • Loading branch information
XapaJIaMnu committed Sep 15, 2023
1 parent 8f8fa56 commit 70ab9c6
Show file tree
Hide file tree
Showing 9 changed files with 189 additions and 11 deletions.
27 changes: 26 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@ if (POLICY CMP0074)
endif ()

project(marian CXX C)

######### ARCH DETECTION #########
# Architecture detection
include(TargetArch)

target_architecture(CMAKE_TARGET_ARCHITECTURES)
list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len)
if(NOT "${cmake_target_arch_len}" STREQUAL "1")
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE)
set(CMAKE_TARGET_ARCHITECTURE_CODE "universal")
else()
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE)
set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}")
endif()
######### ARCH DETECTION #########

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
Expand Down Expand Up @@ -100,6 +116,15 @@ if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
# Some warnings as errors. I don't feel comfortable about the strict aliasing.
set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")

set(USE_SIMD_UTILS ON)
# @TODO this assumes ArmV8. We should also look at armv7
add_compile_definitions(ARM FMA SSE) #added for ARM
if(MSVC)
add_compile_options(/flax-vector-conversions)
else(MSVC)
add_compile_options(-flax-vector-conversions)
endif(MSVC)

endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")


Expand Down Expand Up @@ -534,7 +559,7 @@ endif()
###############################################################################
# Find BLAS library
if(COMPILE_CPU)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
add_definitions(-DCOMPILE_CPU=1)
endif()
Expand Down
142 changes: 142 additions & 0 deletions cmake/TargetArch.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake
# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)

# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."

set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__)
#if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\
|| defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\
|| defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\
|| defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8)
#error cmake_ARCH armv8
#elif defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")


# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple

function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values

# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4

# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.

foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()

# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()

if(osx_arch_i386)
list(APPEND ARCH i386)
endif()

if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()

if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")

enable_language(C)

# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives... but by exploiting the preprocessor in this
# way, we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)

# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")

# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")

# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()

set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
4 changes: 4 additions & 0 deletions src/3rd_party/faiss/VectorTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

using namespace faiss;

#ifdef ARM
#include "3rd_party/simd_utils/simd_utils.h"
#endif


extern "C" {

Expand Down
6 changes: 5 additions & 1 deletion src/common/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
#include <type_traits>

#ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
#include <immintrin.h>
#ifndef ARM
#include <immintrin.h>
#else
#include "3rd_party/simd_utils/simd_utils.h"
#endif
#endif

#ifdef __CUDACC__ // nvcc is compiling this code
Expand Down
5 changes: 4 additions & 1 deletion src/functional/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,11 @@ struct Ops<double> {
// __CUDACC__ is defined when compiling with NVCC regardless of device type
// __CUDA_ARCH__ is defined when compiling device (GPU) code
#ifndef __CUDACC__

#ifndef ARM
#include "3rd_party/sse_mathfun.h"
#else
#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON
#endif

namespace marian {
namespace functional {
Expand Down
2 changes: 1 addition & 1 deletion src/tensors/cpu/expression_graph_packable.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
#endif
} else if (isIntgemm(gemmElementType) &&
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
#if COMPILE_CPU
#if COMPILE_CPU && !defined(ARM)
using cpu::integer::cols;
using cpu::integer::rows;
auto allocator = New<TensorAllocator>(getBackend());
Expand Down
8 changes: 4 additions & 4 deletions src/tensors/cpu/fbgemm/packed_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"

#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>

#if USE_FBGEMM
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
Expand Down
4 changes: 2 additions & 2 deletions src/tensors/cpu/intgemm_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace marian {
namespace cpu {
namespace integer {

#if COMPILE_CPU
#if COMPILE_CPU && !defined(ARM)
/*
* Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
* Expr input: The input tensor
Expand Down Expand Up @@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) {
*/
template<Type vtype>
static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
#if COMPILE_CPU
#if COMPILE_CPU && !defined(ARM)
ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());

Expand Down

2 comments on commit 70ab9c6

@hieuhoang
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the cmake flags to compile this?

@XapaJIaMnu
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cmake .. -DCOMPILE_CUDA=OFF

Please sign in to comment.