diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..820254dba
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "deps/sse2neon"]
+	path = deps/sse2neon
+	url = https://github.com/DLTcollab/sse2neon
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9ef094d17..4a81df03b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -64,3 +64,4 @@ Please add your name to the end of this file and include this file to the PR, un
 * Brent Stephens
 * M. Asim Jamshed
 * Yan Grunenberger
+* Md Ashfaqur Rahaman
diff --git a/build.py b/build.py
index ff4289523..75897d3c6 100755
--- a/build.py
+++ b/build.py
@@ -41,6 +41,7 @@
 import subprocess
 import textwrap
 import argparse
+import platform
 
 
 def cmd(cmd, quiet=False, shell=False):
@@ -90,7 +91,14 @@ def cmd(cmd, quiet=False, shell=False):
 
 DPDK_URL = 'https://fast.dpdk.org/rel'
 DPDK_VER = 'dpdk-19.11.4'
-DPDK_TARGET = 'x86_64-native-linuxapp-gcc'
+
+if platform.uname().machine == 'x86_64':
+    DPDK_TARGET = 'x86_64-native-linuxapp-gcc'
+elif platform.uname().machine == 'aarch64':
+    DPDK_TARGET = 'arm64-armv8a-linux-gcc'
+else:
+    print("Unsupported platform")
+    sys.exit(1)
 
 kernel_release = cmd('uname -r', quiet=True).strip()
 
@@ -299,10 +307,12 @@ def configure_dpdk():
     check_mlx()
     generate_dpdk_extra_mk()
 
     arch = os.getenv('CPU')
+    if not arch and platform.uname().machine == 'aarch64':
+        arch = 'armv8-a'  # aarch64 default; an explicit CPU=... still wins
     if arch:
         print(' - Building DPDK with -march=%s' % arch)
         set_config(DPDK_CONFIG, "CONFIG_RTE_MACHINE", arch)
 
 
 def makeflags():
diff --git a/core/Makefile b/core/Makefile
index cff111737..827da8161 100644
--- a/core/Makefile
+++ b/core/Makefile
@@ -60,9 +60,18 @@ ifeq "$(CXXCOMPILER)" "g++"
 endif
 
 HAS_PKG_CONFIG := $(shell command -v $(PKG_CONFIG) 2>&1 >/dev/null && echo yes || echo no)
+ARCH := $(shell uname -m)
 
 RTE_SDK ?= $(abspath ../deps/dpdk-19.11.4)
-RTE_TARGET ?= $(shell uname -m)-native-linuxapp-gcc
+
+ifeq ($(ARCH),x86_64)
+  RTE_TARGET ?= $(shell uname -m)-native-linuxapp-gcc
+else ifeq ($(ARCH),aarch64)
+  RTE_TARGET ?= arm64-armv8a-linux-gcc
+else
+  $(error Unsupported architecture)
+endif
+
 DPDK_LIB ?= dpdk
 
 ifneq ($(wildcard $(RTE_SDK)/$(RTE_TARGET)/*),)
@@ -76,6 +85,10 @@ else ifneq ($(MAKECMDGOALS),clean)
   $(error DPDK is not available. Make sure $(abspath $(RTE_SDK)) is available and built)
 endif
 
+# Library for translating Intel SSE intrinsics
+# to ARM64 NEON
+SSE2NEON_DIR ?= $(abspath ../deps/sse2neon)
+
 # We always want these libraries to be dynamically linked even when the
 # user requests a static build.
 ALWAYS_DYN_LIBS := -lpthread -ldl
@@ -108,9 +121,16 @@ endif
 # these headers. Should fix the warnings. Using -isystem also disables
 # -MMD dependency recording (should we use -MD?).
 COREDIR := $(abspath .)
-CPU ?= native
+ifeq ($(ARCH),x86_64)
+  CPU ?= native
+else ifeq ($(ARCH),aarch64)
+  CPU ?= armv8-a+fp+simd
+else
+  $(error Unsupported architecture)
+endif
 CXXFLAGS += -std=c++17 -g3 -ggdb3 -march=$(CPU) \
             -isystem $(DPDK_INC_DIR) -isystem $(COREDIR) \
+            -isystem $(SSE2NEON_DIR) \
             -isystem $(dir $<).. -isystem $(COREDIR)/modules \
             -D_GNU_SOURCE \
             -Werror -Wall -Wextra -Wcast-align -Wno-error=deprecated-declarations \
@@ -133,6 +153,10 @@ ifeq "$(shell expr $(CXXCOMPILER) = g++)" "1"
   CXXFLAGS += -fno-gnu-unique
 endif
 
+ifeq ($(ARCH),aarch64)
+  CXXFLAGS += -DRTE_FORCE_INTRINSICS
+endif
+
 LDFLAGS += -rdynamic -L$(DPDK_LIB_DIR) -Wl,-rpath=$(DPDK_LIB_DIR) -pthread
 ifdef BESS_LINK_DYNAMIC
   LIBS_ALL_SHARED = -Wl,-call_shared
diff --git a/core/debug.cc b/core/debug.cc
index 787582505..d8dcf418e 100644
--- a/core/debug.cc
+++ b/core/debug.cc
@@ -410,7 +410,16 @@ static bool SkipSymbol(char *symbol) {
 // TODO: Only use async-signal-safe operations in the signal handler.
 static void TrapHandler(int sig_num, siginfo_t *info, void *ucontext) {
   std::ostringstream oops;
+
+#if (__i386 || __x86_64)
   auto *uc = static_cast(ucontext);
+#elif __aarch64__
+  // unused parameter
+  (void)ucontext;
+#else
+#error Unsupported architecture
+#endif
+
   bool is_fatal = (sig_num != SIGUSR1);
   static volatile bool already_trapped = false;
 
@@ -422,8 +431,10 @@ static void TrapHandler(int sig_num, siginfo_t *info, void *ucontext) {
   trap_ip = reinterpret_cast(uc->uc_mcontext.gregs[REG_EIP]);
 #elif __x86_64
   trap_ip = reinterpret_cast(uc->uc_mcontext.gregs[REG_RIP]);
+#elif __aarch64__
+  trap_ip = nullptr;
 #else
-#error neither x86 or x86-64
+#error Unsupported architecture
 #endif
 
   if (is_fatal) {
diff --git a/core/gate_hooks/pcapng.cc b/core/gate_hooks/pcapng.cc
index d70be3302..b6f374e7a 100644
--- a/core/gate_hooks/pcapng.cc
+++ b/core/gate_hooks/pcapng.cc
@@ -62,7 +62,7 @@ T PadSize(T a, T b) {
 
 // Return the single hex digit representing `nibble`. If it cannot be
 // represented, return the char 'X'.
-char NibbleToHD(char nibble) {
+char NibbleToHD(signed char nibble) {
   if (nibble >= 0 && nibble <= 9) {
     return nibble + '0';
   } else if (nibble >= 10 && nibble <= 15) {
diff --git a/core/kmod/llring.h b/core/kmod/llring.h
index 137954d79..3944db72f 100644
--- a/core/kmod/llring.h
+++ b/core/kmod/llring.h
@@ -149,7 +149,13 @@ typedef uint64_t phys_addr_t;
 #define llring_likely(x) __builtin_expect(!!(x), 1)
 #define llring_unlikely(x) __builtin_expect(!!(x), 0)
 
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 static inline void llring_pause(void) { _mm_pause(); }
 
 
diff --git a/core/modules/ip_lookup.cc b/core/modules/ip_lookup.cc
index 3c8b11571..2efc5301a 100644
--- a/core/modules/ip_lookup.cc
+++ b/core/modules/ip_lookup.cc
@@ -39,7 +39,13 @@
 #include "../utils/format.h"
 #include "../utils/ip.h"
 
+#if __x86_64
 #define VECTOR_OPTIMIZATION 1
+#elif __aarch64__
+#define VECTOR_OPTIMIZATION 0
+#else
+#error Unsupported architecture
+#endif
 
 static inline int is_valid_gate(gate_idx_t gate) {
   return (gate < MAX_GATES || gate == DROP_GATE);
@@ -84,7 +90,7 @@ void IPLookup::ProcessBatch(Context *ctx, bess::PacketBatch *batch) {
   gate_idx_t default_gate = default_gate_;
 
   int cnt = batch->cnt();
-  int i;
+  int i = 0;
 
 #if VECTOR_OPTIMIZATION
   // Convert endianness for four addresses at the same time
diff --git a/core/modules/set_metadata.cc b/core/modules/set_metadata.cc
index d3441c1d8..96ce06f24 100644
--- a/core/modules/set_metadata.cc
+++ b/core/modules/set_metadata.cc
@@ -28,7 +28,13 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 
 #include
 
diff --git a/core/utils/bits.h b/core/utils/bits.h
index 5c4b00072..ac3691ce1 100644
--- a/core/utils/bits.h
+++ b/core/utils/bits.h
@@ -31,7 +31,13 @@
 #define BESS_UTILS_BITS_H_
 
 #include
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 
 #include
 
diff --git a/core/utils/checksum.h b/core/utils/checksum.h
index b206777d4..c4a542479 100644
--- a/core/utils/checksum.h
+++ b/core/utils/checksum.h
@@ -34,7 +34,13 @@
 #ifndef BESS_UTILS_CHECKSUM_H_
 #define BESS_UTILS_CHECKSUM_H_
 
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 
 #include "common.h"
 #include "ip.h"
@@ -214,6 +220,7 @@ static inline bool VerifyIpv4NoOptChecksum(const Ipv4 &iph) {
 
   // Calculate internet checksum, the optimized way is
   // 1. get 32-bit one's complement sum including carrys
+#if __x86_64
   asm("addl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
       "adcl %[u3], %[sum] \n\t"
@@ -222,6 +229,25 @@ static inline bool VerifyIpv4NoOptChecksum(const Ipv4 &iph) {
       : [sum] "+r"(sum)
       : [u1] "m"(buf32[1]), [u2] "m"(buf32[2]), [u3] "m"(buf32[3]),
         [u4] "m"(buf32[4]));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u1] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u1] "m"(buf32[1]), [u2] "m"(buf32[2]), [u3] "m"(buf32[3]),
+        [u4] "m"(buf32[4]) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   // 2. reduce to 16-bit unsigned integer and negate
   return FoldChecksum(sum) == 0;
@@ -236,6 +262,7 @@ static inline uint16_t CalculateIpv4NoOptChecksum(const Ipv4 &iph) {
 
   // Calculate internet checksum, the optimized way is
   // 1. get 32-bit one's complement sum including carrys
+#if __x86_64
   asm("addl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
       "adcl %[u3], %[sum] \n\t"
@@ -245,6 +272,27 @@ static inline uint16_t CalculateIpv4NoOptChecksum(const Ipv4 &iph) {
       : [u1] "m"(buf32[1]),
         [u2] "g"(buf32[2] & 0xFFFF),  // skip checksum fields
         [u3] "m"(buf32[3]), [u4] "m"(buf32[4]));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  uint32_t tmp2 = 0xffff;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u1] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "and %w[tmp], %w[tmp2], %w[tmp] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u1] "m"(buf32[1]), [u2] "m"(buf32[2]), [u3] "m"(buf32[3]),
+        [u4] "m"(buf32[4]), [tmp2] "r" (tmp2) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   // 2. reduce to 16-bit unsigned integer and negate
   return FoldChecksum(sum);
@@ -268,6 +316,7 @@ static inline bool VerifyIpv4Checksum(const Ipv4 &iph) {
 
   // Calculate internet checksum, the optimized way is
   // 1. get 32-bit one's complement sum including carrys
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
@@ -277,6 +326,27 @@ static inline bool VerifyIpv4Checksum(const Ipv4 &iph) {
       : [sum] "+r"(sum)
       : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
         [u3] "m"(buf32[3]), [u4] "m"(buf32[4]));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
+        [u3] "m"(buf32[3]), [u4] "m"(buf32[4]) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   // 2. reduce to 16-bit unsigned integer and negate
   return FoldChecksum(sum) == 0;
@@ -302,6 +372,7 @@ static inline uint16_t CalculateIpv4Checksum(const Ipv4 &iph) {
 
   // Calculate internet checksum, the optimized way is
   // 1. get 32-bit one's complement sum including carrys
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
@@ -312,6 +383,29 @@ static inline uint16_t CalculateIpv4Checksum(const Ipv4 &iph) {
       : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]),
         [u2] "g"(buf32[2] & 0xFFFF),  // skip checksum fields
         [u3] "m"(buf32[3]), [u4] "m"(buf32[4]));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  uint32_t tmp2 = 0xffff;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "and %w[tmp], %w[tmp2], %w[tmp] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
+        [u3] "m"(buf32[3]), [u4] "m"(buf32[4]), [tmp2] "r" (tmp2) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   // 2. reduce to 16-bit unsigned integer and negate
   return FoldChecksum(sum);
@@ -336,6 +430,7 @@ static inline bool VerifyIpv4UdpChecksum(const Udp &udph, be32_t src_ip,
   uint32_t len = static_cast(be16_t::swap(udp_len));
 
   // Calculate the checksum of UDP header and pseudo header
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[src], %[sum] \n\t"
@@ -346,6 +441,26 @@ static inline bool VerifyIpv4UdpChecksum(const Udp &udph, be32_t src_ip,
       : [sum] "+r"(sum)
       : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [src] "r"(src_ip.raw_value()),
         [dst] "r"(dst_ip.raw_value()), [len] "r"(len));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "adcs %w[sum], %w[src], %w[sum] \n\t"
+      "adcs %w[sum], %w[dst], %w[sum] \n\t"
+      "adcs %w[sum], %w[len], %w[sum] \n\t"
+      "mov %w[tmp], #0x1100 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [src] "r"(src_ip.raw_value()),
+        [dst] "r"(dst_ip.raw_value()), [len] "r"(len) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   return FoldChecksum(sum) == 0;
 }
@@ -377,6 +492,7 @@ static inline uint16_t CalculateIpv4UdpChecksum(const Udp &udph, be32_t src,
   uint32_t len = static_cast(be16_t::swap(udp_len));
 
   // Calculate the checksum of UDP header and pseudo header
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[src], %[sum] \n\t"
@@ -387,6 +503,28 @@ static inline uint16_t CalculateIpv4UdpChecksum(const Udp &udph, be32_t src,
       : [sum] "+r"(sum)
       : [u0] "m"(buf32[0]), [u1] "g"(buf32[1] & 0xFFFF),  // skip checksum field
         [src] "r"(src.raw_value()), [dst] "r"(dst.raw_value()), [len] "r"(len));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  uint32_t tmp2 = 0xffff;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "and %w[tmp], %w[tmp2], %w[tmp] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "adcs %w[sum], %w[src], %w[sum] \n\t"
+      "adcs %w[sum], %w[dst], %w[sum] \n\t"
+      "adcs %w[sum], %w[len], %w[sum] \n\t"
+      "mov %w[tmp], #0x1100 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [tmp2] "r" (tmp2),
+        [src] "r"(src.raw_value()), [dst] "r"(dst.raw_value()), [len] "r"(len) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   // If the result of UDP checksum calculation is 0, return all ones (rfc 768)
   return FoldChecksum(sum) ?: 0xFFFF;
@@ -420,6 +558,7 @@ static inline bool VerifyIpv4TcpChecksum(const Tcp &tcph, be32_t src_ip,
   uint32_t len = static_cast(be16_t::swap(tcp_len));
 
   // Calculate the checksum of TCP header and pseudo header
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
@@ -434,6 +573,33 @@ static inline bool VerifyIpv4TcpChecksum(const Tcp &tcph, be32_t src_ip,
       : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
         [u3] "m"(buf32[3]), [u4] "m"(buf32[4]), [src] "r"(src_ip.raw_value()),
         [dst] "r"(dst_ip.raw_value()), [len] "r"(len));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "adcs %w[sum], %w[src], %w[sum] \n\t"
+      "adcs %w[sum], %w[dst], %w[sum] \n\t"
+      "adcs %w[sum], %w[len], %w[sum] \n\t"
+      "mov %w[tmp], #0x0600 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
+        [u3] "m"(buf32[3]), [u4] "m"(buf32[4]), [src] "r"(src_ip.raw_value()),
+        [dst] "r"(dst_ip.raw_value()), [len] "r"(len) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   return FoldChecksum(sum) == 0;
 }
@@ -467,6 +633,7 @@ static inline uint16_t CalculateIpv4TcpChecksum(const Tcp &tcph, be32_t src,
   uint32_t len = static_cast(be16_t::swap(tcp_len));
 
   // Calculate the checksum of TCP header and pseudo header
+#if __x86_64
   asm("addl %[u0], %[sum] \n\t"
       "adcl %[u1], %[sum] \n\t"
       "adcl %[u2], %[sum] \n\t"
@@ -482,6 +649,34 @@ static inline uint16_t CalculateIpv4TcpChecksum(const Tcp &tcph, be32_t src,
         [u3] "m"(buf32[3]), [u4] "g"(buf32[4] >> 16),  // skip checksum field
         [src] "r"(src.raw_value()), [dst] "r"(dst.raw_value()),
         [len] "r"(len));
+#elif __aarch64__
+  uint32_t tmp = 0;
+  // adds/adcs write the NZCV flags, so "cc" is declared as clobbered
+  asm("ldr %w[tmp], %w[u0] \n\t"
+      "adds %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u1] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u2] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u3] \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "ldr %w[tmp], %w[u4] \n\t"
+      "lsr %w[tmp], %w[tmp], #16 \n\t"  // skip checksum field
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "adcs %w[sum], %w[src], %w[sum] \n\t"
+      "adcs %w[sum], %w[dst], %w[sum] \n\t"
+      "adcs %w[sum], %w[len], %w[sum] \n\t"
+      "mov %w[tmp], #0x0600 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      "mov %w[tmp], #0 \n\t"
+      "adcs %w[sum], %w[tmp], %w[sum] \n\t"
+      : [sum] "+r"(sum), [tmp] "+r" (tmp)
+      : [u0] "m"(buf32[0]), [u1] "m"(buf32[1]), [u2] "m"(buf32[2]),
+        [u3] "m"(buf32[3]), [u4] "m"(buf32[4]),
+        [src] "r"(src.raw_value()), [dst] "r"(dst.raw_value()), [len] "r"(len) : "cc");
+#else
+#error Unsupported architecture
+#endif
 
   return FoldChecksum(sum);
 }
diff --git a/core/utils/common.h b/core/utils/common.h
index 75634d2b5..9c40ee1a9 100644
--- a/core/utils/common.h
+++ b/core/utils/common.h
@@ -91,7 +91,13 @@ static inline uint64_t align_ceil_pow2(uint64_t v) {
 #define INST_BARRIER() asm volatile("" ::: "memory")
 #define LOAD_BARRIER() INST_BARRIER()
 #define STORE_BARRIER() INST_BARRIER()
+#if (__i386 || __x86_64)
 #define FULL_BARRIER() asm volatile("mfence" ::: "memory")
+#elif __aarch64__
+#define FULL_BARRIER() asm volatile("dmb " "ish" ::: "memory")
+#else
+#error Unsupported architecture
+#endif
 
 // Put this in the declarations for a class to be uncopyable.
 #define DISALLOW_COPY(TypeName) TypeName(const TypeName &) = delete
diff --git a/core/utils/copy.h b/core/utils/copy.h
index a332d4993..68860eec1 100644
--- a/core/utils/copy.h
+++ b/core/utils/copy.h
@@ -31,7 +31,13 @@
 #define BESS_UTILS_COPY_H_
 
 #include
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 
 #include
 
diff --git a/core/utils/http_parser.cc b/core/utils/http_parser.cc
index 2431f4ab0..2b86a4e7d 100644
--- a/core/utils/http_parser.cc
+++ b/core/utils/http_parser.cc
@@ -32,7 +32,51 @@
 #ifdef _MSC_VER
 #include
 #else
+#if (__i386 || __x86_64)
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
+#endif
+
+#ifdef __aarch64__
+
+#define _SIDD_UBYTE_OPS 0x00
+#define _SIDD_CMP_RANGES 0x04
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+
+typedef union __attribute__((aligned(16))) __oword {
+  int64x2_t m128i;
+  uint8_t m128i_u8[16];
+} __oword;
+// Index of the first str2 byte inside any [lo, hi] pair of str1, or 16 if none.
+static inline int _mm_cmpestri(int64x2_t str1, int len1, int64x2_t str2, int len2, int mode) {
+  __oword a, b;
+  a.m128i = str1;
+  b.m128i = str2;
+
+  // mode is unused
+  (void)mode;
+
+  int i,j, result;
+
+  for (i = 0; i < len2; i++) {
+    for (j = 0; j < len1; j+=2) {
+      if (b.m128i_u8[i] >= a.m128i_u8[j] && b.m128i_u8[i] <= a.m128i_u8[j+1])
+        return i;  // a plain break would only leave the inner loop
+    }
+  }
+
+  result = i;
+
+  if (result == len2)
+    result = 16;
+
+  return result;
+}
+
 #endif
 
 /* $Id$ */
diff --git a/core/utils/mcslock.h b/core/utils/mcslock.h
index a78f550b3..755a6c8c9 100644
--- a/core/utils/mcslock.h
+++ b/core/utils/mcslock.h
@@ -61,10 +61,22 @@ static inline void mcs_lock(mcslock_t *lock, mcslock_node_t *mynode) {
 
   /* it's hold by others. queue up and spin on the node of myself */
   pre->next = mynode;
+#if (__i386 || __x86_64)
   asm volatile("sfence" ::: "memory");
+#elif __aarch64__
+  asm volatile("dmb " "ishst" ::: "memory");
+#else
+#error Unsupported architecture
+#endif
 
   while (mynode->locked) {
+#if (__i386 || __x86_64)
     __builtin_ia32_pause();
+#elif __aarch64__
+    asm volatile("yield");
+#else
+#error Unsupported architecture
+#endif
   }
 }
 
@@ -73,8 +85,15 @@ static inline void mcs_unlock(mcslock_t *lock, mcslock_node_t *mynode) {
     if (__sync_bool_compare_and_swap(&lock->tail, mynode, nullptr))
       return;
     while (mynode->next == nullptr) {
+#if (__i386 || __x86_64)
       asm volatile("lfence" ::: "memory");
       __builtin_ia32_pause();
+#elif __aarch64__
+      asm volatile("dmb " "ishld" ::: "memory");
+      asm volatile("yield");
+#else
+#error Unsupported architecture
+#endif
     }
   }
 
diff --git a/core/utils/simd.h b/core/utils/simd.h
index 8e204af42..dd1ccb978 100644
--- a/core/utils/simd.h
+++ b/core/utils/simd.h
@@ -33,7 +33,13 @@
 
 #include
 
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 
 #include
 
@@ -41,9 +47,11 @@
 #define __ymm_aligned __attribute__((aligned(32)))
 #define __zmm_aligned __attribute__((aligned(64)))
 
+#if __x86_64
 #if !__SSE4_2__
 #error CPU must be at least Intel Nehalem equivalent (SSE4.2 required)
 #endif
+#endif
 
 std::string m128i_to_str(__m128i a);
 
diff --git a/core/utils/syscallthread.h b/core/utils/syscallthread.h
index 165dcbb44..053fc50a6 100644
--- a/core/utils/syscallthread.h
+++ b/core/utils/syscallthread.h
@@ -35,7 +35,13 @@
 #include
 #include
 
+#if __x86_64
 #include
+#elif __aarch64__
+#include
+#else
+#error Unsupported architecture
+#endif
 #include
 
 namespace bess {
diff --git a/core/utils/time.h b/core/utils/time.h
index 13dfb5513..9c94e9e7a 100644
--- a/core/utils/time.h
+++ b/core/utils/time.h
@@ -40,9 +40,19 @@
 extern uint64_t tsc_hz;
 
 static inline uint64_t rdtsc(void) {
+  uint64_t val;
+
+#if (__i386 || __x86_64)
   uint32_t hi, lo;
   __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
-  return (uint64_t)lo | ((uint64_t)hi << 32);
+  val = (uint64_t)lo | ((uint64_t)hi << 32);
+#elif __aarch64__
+  __asm__ __volatile__("mrs %0, cntvct_el0" : "=r" (val));
+#else
+#error Unsupported architecture
+#endif
+
+  return val;
 }
 
 static inline uint64_t tsc_to_ns(uint64_t cycles) {
diff --git a/deps/sse2neon b/deps/sse2neon
new file mode 160000
index 000000000..6bf2fa37a
--- /dev/null
+++ b/deps/sse2neon
@@ -0,0 +1 @@
+Subproject commit 6bf2fa37af851498aa7e495aeae7b443c00f26b6