From 049d5583a5c320e3949bed04ab2329eb43a16be9 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Wed, 27 Apr 2016 09:05:46 +0000 Subject: [PATCH 1/7] lib/checksum_simd.dasl: IP checksum in AVX2 assembler (prototype) See source code comments for implementation status/notes. --- src/lib/checksum_simd.dasl | 111 +++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/lib/checksum_simd.dasl diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl new file mode 100644 index 0000000000..1aa9ff302a --- /dev/null +++ b/src/lib/checksum_simd.dasl @@ -0,0 +1,111 @@ +-- checksum_simd: SIMD checksum routines +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(..., package.seeall) + +local dasm = require("dasm") +local ffi = require("ffi") +local C = ffi.C + +|.arch x64 +|.actionlist actions +|.globalnames globalnames + +-- Calculate checksum using AVX2 vector instructions. Optimized for +-- both speed and short/simple code. +-- +-- Algorithm: +-- +-- Operate on 32-byte chunks: +-- Divide each chunk into 16xU16 values. +-- Sum 1st 8xU16 into 8xU32 accumulators +-- Sum 2nd 8xU16 into 8xU32 accumulators +-- "Fold" accumulators from 8xU32 down to the 1xU16 result. +-- +-- The input is temporarily padded with a 32-byte "trailer" block of +-- zeros at the end. This means there is no "remainder" to worry about. +-- This requires input pointers for which this is safe (e.g. 'struct +-- packet'). +-- +-- TODO: +-- Check carry/overflow conditions more closely. Currently not +-- correct and possibly in a fatal way. Define boundaries e.g. +-- maximum safe input length. +-- +-- Check for a better/faster idea/method than zero-padding the end. +-- Writing 32-bytes to an unaligned address seems to cost 20 cycles +-- both with vector (VMOVDQU) and with integer (4 x MOVQ) moves. +-- +-- This algorithm is a new formulation based on: +-- Tony Rogvall's C intrinsics code (Snabb arch/avx2.c) and ideas. +-- RFC 1071 "Computing the Internet Checksum" +-- Article http://locklessinc.com/articles/tcp_checksum/ +-- Article https://www.klittlepage.com/2013/12/10/accelerated-fix-processing-via-avx2-vector-instructions/ + +function asm_checksum_avx2 (Dst) + |->checksum_avx2: + | add rsi, rdi -- rsi = data pointer, rdi = end address + | vpxor ymm0, ymm0, ymm0 -- ymm0 = 0 (useful constant zero) + | vpxor ymm1, ymm1, ymm1 -- ymm1 = 0 (8 x u32 accumulators) + | vmovdqu ymm6, [rsi] -- Save the "trailer" 32-bytes + | vmovdqu [rsi], ymm0 -- Overwrite the trailer with zeros + |1: + | vmovdqu ymm2, [rdi] -- Load 32-byte chunk + | vpunpcklwd ymm3, ymm2, ymm0 -- Unpack low 8*u16 values into 8*u32 vector + | vpaddd ymm1, ymm1, ymm3 -- .. sum into 8*u32 accumulator + | vpunpckhwd ymm3, ymm2, ymm0 -- Repeat with high 8*u16 values + | vpaddd ymm1, ymm1, ymm3 -- ... + | add rdi, 0x20 -- Advance input pointer + | cmp rdi, rsi -- Check for end of input + | jl <1 + -- Finish up: + | vmovdqu [rsi], ymm6 -- Restore the "trailer" + | vphaddd ymm1, ymm1, ymm0 -- Fold from 8xU32 to 4xU32 accumulators + | vphaddd ymm1, ymm1, ymm0 -- Fold from 4xU32 to 2xU32 accumulators + | vextracti128 xmm2, ymm1, 1 -- Separate remaining accumulators + | vpaddw xmm1, xmm1, xmm2 -- Fold into 1xU32 accumulator + | vmovd edx, xmm1 -- Extract U32 accumulator into integer reg + | movzx rax, dx -- Get low 16 bits in dx + | shr edx, 16 -- Get high 16 bits in ax + | add ax, dx -- Sum into U16 accumulator + | adc ax, 0 -- Add carry + | xchg al, ah -- Swap to network byte order + | ret +end + +local Dst, globals = dasm.new(actions, nil, nil, 1 + #globalnames) +asm_checksum_avx2(Dst) +local mcode, size = Dst:build() +local entry = dasm.globals(globals, globalnames) + +cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2) + +_anchor = mcode + +dasm.dump(mcode, size) + +function selftest () + require("lib.checksum") + local pmu = require("lib.pmu") + local sz = 32 + local s = ffi.new("char["..sz.."]") + for i = 0, sz do + s[i] = i % 256 + end + local n = 1e6 + print("asm") + -- Simple benchmark vs C intrinsics code. + -- Tweak value of 'sz' to see different sizes. + if pmu.is_available() then + pmu.profile(function () for i = 1, n do cksum(s, sz) end end, + {}, {call=n, byte=sz*n, block=sz*n/32}) + print("c") + pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end, + {}, {call=n, byte=sz*n, block=sz*n/32}) + end + local v = cksum(s, sz-1) + print("value", v, bit.tohex(v)) + local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0)) + print("ref", r, bit.tohex(r)) +end + From 4993c37ea3fbf2d580dd8cf458b7faf79a47dd2b Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 28 Apr 2016 04:25:35 +0000 Subject: [PATCH 2/7] lib.checksum_simd: Fix overflow bug - now works Fixes an overflow bug where the 32-bit accumulators were summed using a 16-bit add instruction. Checksums now seem to be correct (same as existing routine) for up to 128KB inputs. --- src/lib/checksum_simd.dasl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl index 1aa9ff302a..7125b053ae 100644 --- a/src/lib/checksum_simd.dasl +++ b/src/lib/checksum_simd.dasl @@ -63,7 +63,7 @@ function asm_checksum_avx2 (Dst) | vphaddd ymm1, ymm1, ymm0 -- Fold from 8xU32 to 4xU32 accumulators | vphaddd ymm1, ymm1, ymm0 -- Fold from 4xU32 to 2xU32 accumulators | vextracti128 xmm2, ymm1, 1 -- Separate remaining accumulators - | vpaddw xmm1, xmm1, xmm2 -- Fold into 1xU32 accumulator + | vpaddd xmm1, xmm1, xmm2 -- Fold into 1xU32 accumulator | vmovd edx, xmm1 -- Extract U32 accumulator into integer reg | movzx rax, dx -- Get low 16 bits in dx | shr edx, 16 -- Get high 16 bits in ax From 14d78e489e256691baa97bd6a99f0949ac5f8ca8 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 28 Apr 2016 04:27:55 +0000 Subject: [PATCH 3/7] lib/checksum_simd.dasl: Improved comments and selftest --- src/lib/checksum_simd.dasl | 44 ++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl index 7125b053ae..3b8879e21e 100644 --- a/src/lib/checksum_simd.dasl +++ b/src/lib/checksum_simd.dasl @@ -28,13 +28,10 @@ local C = ffi.C -- packet'). -- -- TODO: --- Check carry/overflow conditions more closely. Currently not --- correct and possibly in a fatal way. Define boundaries e.g. --- maximum safe input length. --- --- Check for a better/faster idea/method than zero-padding the end. --- Writing 32-bytes to an unaligned address seems to cost 20 cycles --- both with vector (VMOVDQU) and with integer (4 x MOVQ) moves. +-- Confirm maximum input size before overflow. Empirically when +-- checksumming an all-ones array I see the first overflow (checksum +-- mismatch compared with reference) at 128KB. That would be more +-- than sufficient for network packets. -- -- This algorithm is a new formulation based on: -- Tony Rogvall's C intrinsics code (Snabb arch/avx2.c) and ideas. @@ -65,8 +62,8 @@ function asm_checksum_avx2 (Dst) | vextracti128 xmm2, ymm1, 1 -- Separate remaining accumulators | vpaddd xmm1, xmm1, xmm2 -- Fold into 1xU32 accumulator | vmovd edx, xmm1 -- Extract U32 accumulator into integer reg - | movzx rax, dx -- Get low 16 bits in dx - | shr edx, 16 -- Get high 16 bits in ax + | movzx rax, dx -- Get low 16 bits in rdx + | shr edx, 16 -- Get high 16 bits in dx | add ax, dx -- Sum into U16 accumulator | adc ax, 0 -- Add carry | xchg al, ah -- Swap to network byte order @@ -82,30 +79,41 @@ cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2) _anchor = mcode -dasm.dump(mcode, size) +--dasm.dump(mcode, size) function selftest () + print("selftest: checksum_simd") require("lib.checksum") local pmu = require("lib.pmu") - local sz = 32 + local sz = 10*1024 local s = ffi.new("char["..sz.."]") for i = 0, sz do s[i] = i % 256 end - local n = 1e6 - print("asm") + local n = 1e5 -- Simple benchmark vs C intrinsics code. -- Tweak value of 'sz' to see different sizes. if pmu.is_available() then + print("ASM") pmu.profile(function () for i = 1, n do cksum(s, sz) end end, {}, {call=n, byte=sz*n, block=sz*n/32}) - print("c") + print() + print("C") pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end, {}, {call=n, byte=sz*n, block=sz*n/32}) + else + local _, reason = pmu.is_available() + print("Skipping benchmark. PMU not available: " .. reason) + end + -- Compare values for all input sizes + for i = 1, sz do + local v = cksum(s, sz-1) + local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0)) + if v ~= r then + print(i, bit.tohex(v), bit.tohex(r)) + error("checksum mismatch") + end end - local v = cksum(s, sz-1) - print("value", v, bit.tohex(v)) - local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0)) - print("ref", r, bit.tohex(r)) + print("selftest: ok") end From 999807df97181a72ba72055dfd8ee9554288d7c9 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sat, 30 Apr 2016 08:57:30 +0000 Subject: [PATCH 4/7] lib.checksum_simd: Add VZEROUPPER before RET From the comment: This routine executes a VZEROUPPER instruction before returning in order to flush 256-bit AVX register state and avoid potential expensive SSE-AVX transition penalties. This is a cheap form of insurance against taking ~ 75 cycle penalties when mixing SSE and AVX code in the same program. See https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties and particularly section 3.3. --- src/lib/checksum_simd.dasl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl index 3b8879e21e..ad57b6b6cd 100644 --- a/src/lib/checksum_simd.dasl +++ b/src/lib/checksum_simd.dasl @@ -27,6 +27,14 @@ local C = ffi.C -- This requires input pointers for which this is safe (e.g. 'struct -- packet'). -- +-- This routine executes a VZEROUPPER instruction before returning in +-- order to flush 256-bit AVX register state and avoid potential +-- expensive SSE-AVX transition penalties. This is a cheap form of +-- insurance against taking ~ 75 cycle penalties when mixing SSE and +-- AVX code in the same program. See +-- https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties +-- and particularly section 3.3. +-- -- TODO: -- Confirm maximum input size before overflow. Empirically when -- checksumming an all-ones array I see the first overflow (checksum @@ -67,6 +75,7 @@ function asm_checksum_avx2 (Dst) | add ax, dx -- Sum into U16 accumulator | adc ax, 0 -- Add carry | xchg al, ah -- Swap to network byte order + | vzeroupper -- Avoid potential AVX-SSE transition penalty | ret end From 9ea803140c16fa9b99b834cdd5cfbee8741f17df Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sat, 30 Apr 2016 16:46:42 +0000 Subject: [PATCH 5/7] lib.checksum_simd: Add support for 'initial' argument The assembler routine now has the same function interface as the C functions and should be able to serve as a drop-in replacement. --- src/lib/checksum_simd.dasl | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl index ad57b6b6cd..15a8dd1a47 100644 --- a/src/lib/checksum_simd.dasl +++ b/src/lib/checksum_simd.dasl @@ -47,9 +47,11 @@ local C = ffi.C -- Article http://locklessinc.com/articles/tcp_checksum/ -- Article https://www.klittlepage.com/2013/12/10/accelerated-fix-processing-via-avx2-vector-instructions/ +-- Generate the function: +-- uint16_t asm_checksum_avx2(char *ptr, int length, uint16_t initial) function asm_checksum_avx2 (Dst) |->checksum_avx2: - | add rsi, rdi -- rsi = data pointer, rdi = end address + | add rsi, rdi -- rsi = end address, rdi = data pointer | vpxor ymm0, ymm0, ymm0 -- ymm0 = 0 (useful constant zero) | vpxor ymm1, ymm1, ymm1 -- ymm1 = 0 (8 x u32 accumulators) | vmovdqu ymm6, [rsi] -- Save the "trailer" 32-bytes @@ -69,12 +71,15 @@ function asm_checksum_avx2 (Dst) | vphaddd ymm1, ymm1, ymm0 -- Fold from 4xU32 to 2xU32 accumulators | vextracti128 xmm2, ymm1, 1 -- Separate remaining accumulators | vpaddd xmm1, xmm1, xmm2 -- Fold into 1xU32 accumulator - | vmovd edx, xmm1 -- Extract U32 accumulator into integer reg - | movzx rax, dx -- Get low 16 bits in rdx - | shr edx, 16 -- Get high 16 bits in dx - | add ax, dx -- Sum into U16 accumulator + | vmovd ecx, xmm1 -- Extract U32 accumulator into integer reg + | movzx rax, cx -- Get low 16 bits in rdx + | shr ecx, 16 -- Get high 16 bits in dx + | add ax, cx -- Sum into U16 accumulator | adc ax, 0 -- Add carry | xchg al, ah -- Swap to network byte order + | add ax, dx -- Add 'initial' argument (edx = arg 3) + | adc ax, 0 -- Add carry + | xor ax, 0xffff -- Convert to one's complement | vzeroupper -- Avoid potential AVX-SSE transition penalty | ret end @@ -84,12 +89,13 @@ asm_checksum_avx2(Dst) local mcode, size = Dst:build() local entry = dasm.globals(globals, globalnames) -cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2) +cksum = ffi.cast("uint16_t(*)(unsigned char *, size_t, uint16_t)", entry.checksum_avx2) _anchor = mcode --dasm.dump(mcode, size) +-- See also 'snabbmark checksum' command function selftest () print("selftest: checksum_simd") require("lib.checksum") @@ -103,12 +109,13 @@ function selftest () -- Simple benchmark vs C intrinsics code. -- Tweak value of 'sz' to see different sizes. if pmu.is_available() then + local rand = math.random(256*256) print("ASM") - pmu.profile(function () for i = 1, n do cksum(s, sz) end end, + pmu.profile(function () for i = 1, n do cksum(s, sz, rand) end end, {}, {call=n, byte=sz*n, block=sz*n/32}) print() print("C") - pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end, + pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, rand) end end, {}, {call=n, byte=sz*n, block=sz*n/32}) else local _, reason = pmu.is_available() @@ -116,8 +123,9 @@ function selftest () end -- Compare values for all input sizes for i = 1, sz do - local v = cksum(s, sz-1) - local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0)) + local rand = math.random(256*256) + local v = cksum(s, sz-1, rand) + local r = require("lib.checksum").ipsum(s, sz-1, rand) if v ~= r then print(i, bit.tohex(v), bit.tohex(r)) error("checksum mismatch") From 2674a1e26e99c8089343cf04975eeadaec890af8 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sat, 30 Apr 2016 16:55:55 +0000 Subject: [PATCH 6/7] snabbmark checksum: New benchmark snabbmark can now measure the performance of built-in IP checksum routines and presents results for apples-to-apples comparison. The benchmark parameters are currently hard-coded. Length is randomly chosen from a "log uniform" distribution (favoring smaller values but drawn from a large range). Alignment is randomized. The intention is to favor robust routines that are not sensitive to alignment and predictable branches. Currently the alignment is forced to be even. Initially this was to be realistic for normal protocols but I discovered that odd addresses actually crash the SSE implementation. Have to address that bug separately. --- src/program/snabbmark/README | 5 +++ src/program/snabbmark/snabbmark.lua | 60 +++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/src/program/snabbmark/README b/src/program/snabbmark/README index 76b010e342..327e4c2abc 100644 --- a/src/program/snabbmark/README +++ b/src/program/snabbmark/README @@ -36,3 +36,8 @@ Usage: Example usage with 10 million packets, packet size 128 bytes: sudo SNABB_PCI0="0000:02:00.0" SNABB_PCI1="0000:03:00.0" ./snabb snabbmark intel1g 10e6 128 + + snabbmark checksum + Compare performance of built-in IP checksum implementations. + (Requires CPU AVX2 support.) + diff --git a/src/program/snabbmark/snabbmark.lua b/src/program/snabbmark/snabbmark.lua index 8a554ffa8b..990eb28487 100644 --- a/src/program/snabbmark/snabbmark.lua +++ b/src/program/snabbmark/snabbmark.lua @@ -22,6 +22,8 @@ function run (args) solarflare(unpack(args)) elseif command == 'intel1g' and #args >= 2 and #args <= 3 then intel1g(unpack(args)) + elseif command == 'checksum' then + checksum(args) else print(usage) main.exit(1) @@ -328,3 +330,61 @@ receive_device.interface= "rx1GE" main.exit(1) end end + +-- Checksum benchmark + +function checksum1 (size_min, size_max, verbose) + local loops = 1000 + local inputs = 1000 + local sizes = {} + local arrays = {} + local bytes = 0 + for i = 1, inputs do + -- Random sizes up to 10K from a "log uniform" distribution i.e. + -- proportionally more smaller values. + local size = size_min + math.floor(math.exp(math.log(size_max-size_min)*math.random())) + sizes[i] = size + bytes = bytes + size + -- Use even but otherwise random alignment. + -- XXX odd alignment breaks SSE2 checksum -- fix separately! + local align = math.random(32) * 2 + arrays[i] = ffi.new("char[?]", size + align) + align + -- Fill with random data + for j = 0, size-1 do + arrays[i][j] = math.random(256) + end + end + local pmu = require("lib.pmu") + simd = require("lib.checksum_simd") + local checksum = require("lib.checksum") + local cksum = function (f) + return function () + for i = 1, loops do + for i = 1, inputs do + f(arrays[i], sizes[i], 0) + end + end + end + end + local r = {} + local pmu_aux = {byte=bytes*loops, call=inputs*loops} + local pmu_events = {} + _, r.asm = pmu.measure(cksum(simd.cksum), pmu_events, pmu_aux) + _, r.avx2 = pmu.measure(cksum(C.cksum_avx2), pmu_events, pmu_aux) + _, r.sse2 = pmu.measure(cksum(C.cksum_sse2), pmu_events, pmu_aux) + _, r.base = pmu.measure(cksum(C.cksum_generic), pmu_events, pmu_aux) + print(("%-14s %14s %14s %14s"):format("VARIANT", "BYTES/PACKET", "BYTES/CYCLE", "CYCLES/PACKET")) + local totalbytes = bytes * loops + for variant, result in pairs(r) do + local bpp = bytes / inputs + local bpc = totalbytes / result.cycles + local cpp = result.cycles / (inputs * loops) + print(("%-14s %14.3f %14.3f %14.3f"):format(variant, bpp, bpc, cpp)) + if verbose then pmu.report(result, pmu_aux) print() end + end +end + +function checksum (args) + -- XXX add a useful command-line syntax + checksum1(20, 5000, false) +end From 3c1bb619bb7f5fb3dea7602a3900be739d321cfb Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 1 May 2016 07:24:16 +0000 Subject: [PATCH 7/7] lib.checksum: Rework C implementations Before there were three separate C checksum implementations (generic, SSE, AVX) that are each compiled with different compiler settings. These were fairly complex due to SIMD intrinsics. The SSE implementation was also incorrect and would segfault with odd-numbered addresses. Now there is one C checksum routine that is compiled with two different compiler settings (default/SSE and AVX). The checksum routine is written in a very simple style that GCC successfully vectorizes automatically (tested with GCC 4.8.5, 4.9.3, 5.3.0). I experimented with "waving a voodoo chicken" in a few different ways (# accumulators = 1, 2, 4; accumulator size = 32bit, 64bit). This formulation seems to work best for GCC. This does feel like hokus-pokus that exposes us to GCC behavior that is not nailed down, but that bothers me less than the high-brow intrinsics code. I have retained the AVX2 assembler implementation with DynASM because I have not been able to beat that with GCC yet. Current scoreboard: VARIANT BYTES/PACKET BYTES/CYCLE CYCLES/PACKET base 631.331 2.939 214.796 asm 631.331 4.161 151.719 avx2 631.331 3.416 184.825 --- src/Makefile | 13 ++-- src/arch/avx2.c | 84 --------------------- src/arch/checksum.c | 42 +++++++++++ src/arch/sse2.c | 94 ------------------------ src/lib/checksum.h | 9 +-- src/lib/checksum.lua | 17 +---- src/lib/{checksum.c => checksum_extra.c} | 66 +---------------- src/program/snabbmark/snabbmark.lua | 5 +- 8 files changed, 60 insertions(+), 270 deletions(-) delete mode 100644 src/arch/avx2.c create mode 100644 src/arch/checksum.c delete mode 100644 src/arch/sse2.c rename src/lib/{checksum.c => checksum_extra.c} (65%) diff --git a/src/Makefile b/src/Makefile index 831d6fc60c..0d405a23f5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,7 +15,6 @@ PFLUASRC = $(shell cd ../lib/pflua/src && \ CSRC = $(shell find . -regex '[^\#]*\.c' -not -regex './arch/.*' -printf '%P ') CHDR = $(shell find . -regex '[^\#]*\.h' -printf '%P ') ASM = $(shell find . -regex '[^\#]*\.dasl' -printf '%P ') -ARCHSRC= $(shell find . -regex '^./arch/[^\#]*\.c' -printf '%P ') RMSRC = $(shell find . -name '*.md' -not -regex './obj.*' -printf '%P ') # regexp is to include program/foo but not program/foo/bar PROGRAM = $(shell find program -regex '^[^/]+/[^/]+' -type d -printf '%P ') @@ -26,7 +25,7 @@ LUAOBJ := $(patsubst %.lua,obj/%_lua.o,$(LUASRC)) PFLUAOBJ := $(patsubst %.lua,obj/%_lua.o,$(PFLUASRC)) COBJ := $(patsubst %.c,obj/%_c.o, $(CSRC)) HOBJ := $(patsubst %.h,obj/%_h.o, $(CHDR)) -ARCHOBJ:= $(patsubst %.c,obj/%_c.o, $(ARCHSRC)) +ARCHOBJ:= obj/arch/checksum_c.o obj/arch/checksum_avx2_c.o ASMOBJ := $(patsubst %.dasl,obj/%_dasl.o, $(ASM)) JITOBJS:= $(patsubst %,obj/jit_%.o,$(JITSRC)) EXTRAOBJS := obj/jit_tprof.o obj/jit_vmprof.o obj/strict.o @@ -126,13 +125,13 @@ $(COBJ): obj/%_c.o: %.c $(CHDR) Makefile | $(OBJDIR) $(E) "C $@" $(Q) gcc $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $< -obj/arch/avx2_c.o: arch/avx2.c Makefile +obj/arch/checksum_avx2_c.o: arch/checksum.c Makefile $(E) "C(AVX2) $@" - $(Q) gcc -O2 -mavx2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $< + $(Q) gcc -Dcksum=cksum_avx2 -O3 -mavx2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $< -obj/arch/sse2_c.o: arch/sse2.c Makefile - $(E) "C(SSE2) $@" - $(Q) gcc -O2 -msse2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $< +obj/arch/checksum_c.o: arch/checksum.c Makefile + $(E) "C(SSE4) $@" + $(Q) gcc -msse4 -O3 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $< $(HOBJ): obj/%_h.o: %.h Makefile | $(OBJDIR) $(E) "H $@" diff --git a/src/arch/avx2.c b/src/arch/avx2.c deleted file mode 100644 index fce7e78a0e..0000000000 --- a/src/arch/avx2.c +++ /dev/null @@ -1,84 +0,0 @@ -/* Use of this source code is governed by the Apache 2.0 license; see COPYING. - * Based on original SSE2 code by Tony Rogvall that is - * copyright 2011 Teclo Networks AG. MIT licensed by Juho Snellman. */ - -/* IP checksum routine for AVX2. */ - -#include -#include -#include -#include "lib/checksum.h" -#include "lib/checksum_lib.h" - -static inline uint32_t cksum_avx2_loop(unsigned char *p, size_t n) -{ - __m256i sum0, sum1, zero; - uint32_t s[8] __attribute__((aligned(32))); // aligned for avx2 store - uint32_t sum2; - - zero = _mm256_set_epi64x(0,0,0,0); - sum0 = zero; - sum1 = zero; - - while(n) { - size_t k = (n >= 0xff) ? 0xff : n; - __m256i t0,t1; - __m256i s0 = zero; - __m256i s1 = zero; - n -= k; - while (k) { - __m256i src = _mm256_loadu_si256((__m256i const*) p); - __m256i t; - - t = _mm256_unpacklo_epi8(src, zero); - s0 = _mm256_adds_epu16(s0, t); - t = _mm256_unpackhi_epi8(src, zero); - s1 = _mm256_adds_epu16(s1, t); - p += sizeof(src); - k--; - } - - // LOW - combine S0 and S1 into sum0 - t0 = _mm256_unpacklo_epi16(s0, zero); - sum0 = _mm256_add_epi32(sum0, t0); - t1 = _mm256_unpacklo_epi16(s1, zero); - sum1 = _mm256_add_epi32(sum1, t1); - - // HIGH - combine S0 and S1 into sum1 - t0 = _mm256_unpackhi_epi16(s0, zero); - sum0 = _mm256_add_epi32(sum0, t0); - t1 = _mm256_unpackhi_epi16(s1, zero); - sum1 = _mm256_add_epi32(sum1, t1); - } - // here we must sum the 4-32 bit sums into one 32 bit sum - _mm256_store_si256((__m256i*)s, sum0); - sum2 = (s[0]<<8) + s[1] + (s[2]<<8) + s[3] + (s[4]<<8) + s[5] + (s[6]<<8) + s[7]; - _mm256_store_si256((__m256i*)s, sum1); - sum2 += (s[0]<<8) + s[1] + (s[2]<<8) + s[3] + (s[4]<<8) + s[5] + (s[6]<<8) + s[7]; - - return sum2; -} - -uint16_t cksum_avx2(unsigned char *p, size_t n, uint16_t initial) -{ - uint32_t sum = initial; - - if (n < 128) { return cksum_generic(p, n, initial); } - if (n >= 64) { - size_t k = (n >> 5); - sum += cksum_avx2_loop(p, k); - n -= (32*k); - p += (32*k); - } - if (n > 1) { - size_t k = (n>>1); // number of 16-bit words - sum += cksum_ua_loop(p, k); - n -= (2*k); - p += (2*k); - } - if (n) // take care of left over byte - sum += (p[0] << 8); - while(sum>>16) - sum = (sum & 0xFFFF) + (sum>>16); - return (uint16_t)~sum; -} diff --git a/src/arch/checksum.c b/src/arch/checksum.c new file mode 100644 index 0000000000..458c66efd1 --- /dev/null +++ b/src/arch/checksum.c @@ -0,0 +1,42 @@ +/* Use of this source code is governed by the Apache 2.0 license; see COPYING. */ +/* IP checksum routines. */ + +#include +#include +#include +#include +#include +#include +#include + +uint16_t cksum(unsigned char *p, size_t len, uint16_t initial) +{ + uint64_t sum = htons(initial); + uint64_t sum1 = 0; + const uint32_t *u32 = (const uint32_t *)p; + + while (len >= (sizeof(*u32) * 2)) { + sum += u32[0]; + sum1 += u32[1]; + u32 += 2; + len -= sizeof(*u32) * 2; + } + sum += sum1; + + const uint16_t *u16 = (const uint16_t *)u32; + while (len >= sizeof(*u16)) { + sum += *u16; + len -= sizeof(*u16); + u16 += 1; + } + + /* if length is in odd bytes */ + if (len == 1) + sum += *((const uint8_t *)u16); + + while(sum>>16) + sum = (sum & 0xFFFF) + (sum>>16); + return ntohs((uint16_t)~sum); +} + + diff --git a/src/arch/sse2.c b/src/arch/sse2.c deleted file mode 100644 index f98878e871..0000000000 --- a/src/arch/sse2.c +++ /dev/null @@ -1,94 +0,0 @@ -/* Use of this source code is governed by the Apache 2.0 license; see COPYING. - * Original code by Tony Rogvall that is - * copyright 2011 Teclo Networks AG. MIT licensed by Juho Snellman. */ - -/* IP checksum routine for SSE2. */ - -#include -#include -#include -#include "lib/checksum.h" -#include "lib/checksum_lib.h" - -// -// this loop may only run when data is aligned 16 byte aligned -// n is number of 16 byte vectors -// -static inline uint32_t cksum_sse2_loop(unsigned char *p, size_t n) -{ - __m128i sum0, sum1, zero; - uint32_t s[4]; - uint32_t sum2; - - zero = _mm_set_epi32(0,0,0,0); - sum0 = zero; - sum1 = zero; - - while(n) { - size_t k = (n >= 0xff) ? 0xff : n; - __m128i t0,t1; - __m128i s0 = zero; - __m128i s1 = zero; - n -= k; - while (k) { - __m128i src = _mm_load_si128((__m128i const*) p); - __m128i t; - - t = _mm_unpacklo_epi8(src, zero); - s0 = _mm_adds_epu16(s0, t); - t = _mm_unpackhi_epi8(src, zero); - s1 = _mm_adds_epu16(s1, t); - p += sizeof(src); - k--; - } - - // LOW - combine S0 and S1 into sum0 - t0 = _mm_unpacklo_epi16(s0, zero); - sum0 = _mm_add_epi32(sum0, t0); - t1 = _mm_unpacklo_epi16(s1, zero); - sum1 = _mm_add_epi32(sum1, t1); - - // HIGH - combine S0 and S1 into sum1 - t0 = _mm_unpackhi_epi16(s0, zero); - sum0 = _mm_add_epi32(sum0, t0); - t1 = _mm_unpackhi_epi16(s1, zero); - sum1 = _mm_add_epi32(sum1, t1); - } - // here we must sum the 4-32 bit sums into one 32 bit sum - _mm_store_si128((__m128i*)s, sum0); - sum2 = (s[0]<<8) + s[1] + (s[2]<<8) + s[3]; - _mm_store_si128((__m128i*)s, sum1); - sum2 += (s[0]<<8) + s[1] + (s[2]<<8) + s[3]; - return sum2; -} - -uint16_t cksum_sse2(unsigned char *p, size_t n, uint16_t initial) -{ - uint32_t sum = initial; - - if (n < 128) { return cksum_generic(p, n, initial); } - int unaligned = (unsigned long) p & 0xf; - if (unaligned) { - size_t k = (0x10 - unaligned) >> 1; - sum += cksum_ua_loop(p, k); - n -= (2*k); - p += (2*k); - } - if (n >= 32) { // fast even with only two vectors - size_t k = (n >> 4); - sum += cksum_sse2_loop(p, k); - n -= (16*k); - p += (16*k); - } - if (n > 1) { - size_t k = (n>>1); // number of 16-bit words - sum += cksum_ua_loop(p, k); - n -= (2*k); - p += (2*k); - } - if (n) // take care of left over byte - sum += (p[0] << 8); - while(sum>>16) - sum = (sum & 0xFFFF) + (sum>>16); - return (uint16_t)~sum; -} diff --git a/src/lib/checksum.h b/src/lib/checksum.h index 008bfa190e..2081d84459 100644 --- a/src/lib/checksum.h +++ b/src/lib/checksum.h @@ -1,17 +1,12 @@ /* Use of this source code is governed by the Apache 2.0 license; see COPYING. */ -// Calculate IP checksum using SSE2 instructions. -// (This will crash if you call it on a CPU that does not support SSE.) -uint16_t cksum_sse2(unsigned char *p, size_t n, uint16_t initial); +// Calculate IP checksum. +uint16_t cksum(unsigned char *p, size_t n, uint16_t initial); // Calculate IP checksum using AVX2 instructions. // (This will crash if you call it on a CPU that does not support AVX2.) uint16_t cksum_avx2(unsigned char *p, size_t n, uint16_t initial); -// Calculate IP checksum using portable C code. -// This works on all hardware. -uint16_t cksum_generic(unsigned char *p, size_t n, uint16_t initial); - // Incrementally update checksum when modifying a 16-bit value. void checksum_update_incremental_16(uint16_t* checksum_cell, uint16_t* value_cell, diff --git a/src/lib/checksum.lua b/src/lib/checksum.lua index 05ebc07849..de3ae151a6 100644 --- a/src/lib/checksum.lua +++ b/src/lib/checksum.lua @@ -15,12 +15,8 @@ local band, lshift = bit.band, bit.lshift local cpuinfo = lib.readfile("/proc/cpuinfo", "*a") assert(cpuinfo, "failed to read /proc/cpuinfo for hardware check") local have_avx2 = cpuinfo:match("avx2") -local have_sse2 = cpuinfo:match("sse2") - -if have_avx2 then ipsum = C.cksum_avx2 -elseif have_sse2 then ipsum = C.cksum_sse2 -else ipsum = C.cksum_generic end +if have_avx2 then ipsum = C.cksum_avx2 else ipsum = C.cksum end function finish_packet (buf, len, offset) ffi.cast('uint16_t *', buf+offset)[0] = lib.htons(ipsum(buf, len, 0)) @@ -102,27 +98,22 @@ end function selftest () print("selftest: checksum") - local tests = 1000 + local tests = 10000 local n = 1000000 local array = ffi.new("char[?]", n) - for i = 0, n-1 do array[i] = i end + for i = 0, n-1 do array[i] = math.random(256) end local avx2ok, sse2ok = 0, 0 for i = 1, tests do local initial = math.random(0, 0xFFFF) - local ref = C.cksum_generic(array+i*2, i*10+i, initial) + local ref = C.cksum(array+i*2, i*10+i, initial) if have_avx2 and C.cksum_avx2(array+i*2, i*10+i, initial) == ref then avx2ok = avx2ok + 1 end - if have_sse2 and C.cksum_sse2(array+i*2, i*10+i, initial) == ref then - sse2ok = sse2ok + 1 - end assert(ipsum(array+i*2, i*10+i, initial) == ref, "API function check") end if have_avx2 then print("avx2: "..avx2ok.."/"..tests) else print("no avx2") end - if have_sse2 then print("sse2: "..sse2ok.."/"..tests) else print("no sse2") end selftest_ipv4_tcp() assert(not have_avx2 or avx2ok == tests, "AVX2 test failed") - assert(not have_sse2 or sse2ok == tests, "SSE2 test failed") print("selftest: ok") end diff --git a/src/lib/checksum.c b/src/lib/checksum_extra.c similarity index 65% rename from src/lib/checksum.c rename to src/lib/checksum_extra.c index b75d1b40e0..f6e8e5c910 100644 --- a/src/lib/checksum.c +++ b/src/lib/checksum_extra.c @@ -1,67 +1,9 @@ -/* Use of this source code is governed by the Apache 2.0 license; see COPYING. - * Generic checksm routine originally taken from DPDK: - * BSD license; (C) Intel 2010-2015, 6WIND 2014. */ - -/* IP checksum routines. - * - * See src/arch/ for architecture specific SIMD versions. */ +/* Use of this source code is governed by the Apache 2.0 license; see COPYING. */ +/* IP checksum routines. */ #include -#include -#include -#include -#include #include -#include - -uint16_t cksum_generic(unsigned char *p, size_t len, uint16_t initial) -{ - uint32_t sum = htons(initial); - const uint16_t *u16 = (const uint16_t *)p; - - while (len >= (sizeof(*u16) * 4)) { - sum += u16[0]; - sum += u16[1]; - sum += u16[2]; - sum += u16[3]; - len -= sizeof(*u16) * 4; - u16 += 4; - } - while (len >= sizeof(*u16)) { - sum += *u16; - len -= sizeof(*u16); - u16 += 1; - } - - /* if length is in odd bytes */ - if (len == 1) - sum += *((const uint8_t *)u16); - - while(sum>>16) - sum = (sum & 0xFFFF) + (sum>>16); - return ntohs((uint16_t)~sum); -} - -// SIMD versions - -// -// A unaligned version of the cksum, -// n is number of 16-bit values to sum over, n in it self is a -// 16 bit number in order to avoid overflow in the loop -// -static inline uint32_t cksum_ua_loop(unsigned char *p, uint16_t n) -{ - uint32_t s0 = 0; - uint32_t s1 = 0; - - while (n) { - s0 += p[0]; - s1 += p[1]; - p += 2; - n--; - } - return (s0<<8)+s1; -} +#include "checksum.h" // Incrementally update checksum when modifying a 16-bit value. void checksum_update_incremental_16(uint16_t* checksum_cell, @@ -125,7 +67,7 @@ uint32_t pseudo_header_initial(const int8_t *buf, size_t len) uint32_t sum = 0; len -= headersize; if (ipv == 4) { // IPv4 - if (cksum_generic((unsigned char *)buf, headersize, 0) != 0) { + if (cksum((unsigned char *)buf, headersize, 0) != 0) { return 0xFFFF0002; } sum = htons(len & 0x0000FFFF) + (proto << 8) diff --git a/src/program/snabbmark/snabbmark.lua b/src/program/snabbmark/snabbmark.lua index 990eb28487..77885478f2 100644 --- a/src/program/snabbmark/snabbmark.lua +++ b/src/program/snabbmark/snabbmark.lua @@ -334,7 +334,7 @@ end -- Checksum benchmark function checksum1 (size_min, size_max, verbose) - local loops = 1000 + local loops = 10000 local inputs = 1000 local sizes = {} local arrays = {} @@ -371,8 +371,7 @@ function checksum1 (size_min, size_max, verbose) local pmu_events = {} _, r.asm = pmu.measure(cksum(simd.cksum), pmu_events, pmu_aux) _, r.avx2 = pmu.measure(cksum(C.cksum_avx2), pmu_events, pmu_aux) - _, r.sse2 = pmu.measure(cksum(C.cksum_sse2), pmu_events, pmu_aux) - _, r.base = pmu.measure(cksum(C.cksum_generic), pmu_events, pmu_aux) + _, r.base = pmu.measure(cksum(C.cksum), pmu_events, pmu_aux) print(("%-14s %14s %14s %14s"):format("VARIANT", "BYTES/PACKET", "BYTES/CYCLE", "CYCLES/PACKET")) local totalbytes = bytes * loops for variant, result in pairs(r) do